KTextEditor

katetextloader.h
1/*
2 SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#ifndef KATE_TEXTLOADER_H
8#define KATE_TEXTLOADER_H
9
10#include <QCryptographicHash>
11#include <QFile>
12#include <QMimeDatabase>
13#include <QString>
14#include <QStringDecoder>
15
16#include <KCompressionDevice>
17#include <KEncodingProber>
18
19#include "katetextbuffer.h"
20
21namespace Kate
22{
23/**
24 * loader block size, load 256 kb at once per default
25 * if file size is smaller, fall back to file size
26 * must be a multiple of 2
27 */
28static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
29
30/**
31 * File Loader, will handle reading of files + detecting encoding
32 */
34{
35public:
36 /**
37 * Construct file loader for given file.
38 * @param filename file to open
39 * @param proberType prober type
40 * @param lineLengthLimit limit for lines to load, else we break them up in smaller ones
41 */
42 TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit)
43 : m_eof(false) // default to not eof
44 , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline
45 , m_lastWasR(false) // we have not found a \r as last char
46 , m_position(0)
47 , m_lastLineStart(0)
48 , m_eol(TextBuffer::eolUnknown) // no eol type detected atm
49 , m_buffer(KATE_FILE_LOADER_BS, 0)
50 , m_digest(QCryptographicHash::Sha1)
51 , m_bomFound(false)
52 , m_firstRead(true)
53 , m_proberType(proberType)
54 , m_fileSize(0)
55 , m_lineLengthLimit(lineLengthLimit)
56 {
57 // try to get mimetype for on the fly decompression, don't rely on filename!
58 QFile testMime(filename);
59 if (testMime.open(QIODevice::ReadOnly)) {
60 m_fileSize = testMime.size();
61 }
62 m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name();
63
64 // construct filter device
66 m_file = new KCompressionDevice(filename, compressionType);
67 }
68
69 /**
70 * Destructor
71 */
73 {
74 delete m_file;
75 }
76
77 /**
78 * open file with given codec
79 * @param codec codec to use, if 0, will do some auto-detect or fallback
80 * @return success
81 */
82 bool open(const QString &codec)
83 {
84 m_codec = codec;
85 m_eof = false;
86 m_lastWasEndOfLine = true;
87 m_lastWasR = false;
88 m_position = 0;
89 m_lastLineStart = 0;
90 m_alreadyScanned = -1;
91 m_eol = TextBuffer::eolUnknown;
92 m_text.clear();
93 m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData());
94 m_bomFound = false;
95 m_firstRead = true;
96
97 // init the hash with the git header
98 const QString header = QStringLiteral("blob %1").arg(m_fileSize);
99 m_digest.reset();
100 m_digest.addData(QByteArray(header.toLatin1() + '\0'));
101
102 // if already opened, close the file...
103 if (m_file->isOpen()) {
104 m_file->close();
105 }
106
107 return m_file->open(QIODevice::ReadOnly);
108 }
109
110 /**
111 * end of file reached?
112 * @return end of file reached
113 */
114 bool eof() const
115 {
116 return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length());
117 }
118
119 /**
120 * Detected end of line mode for this file.
121 * Detected during reading, is valid after complete file is read.
122 * @return eol mode of this file
123 */
125 {
126 return m_eol;
127 }
128
129 /**
130 * BOM found?
131 * @return byte order mark found?
132 */
134 {
135 return m_bomFound;
136 }
137
138 /**
139 * mime type used to create filter dev
140 * @return mime-type of filter device
141 */
143 {
144 return m_mimeType;
145 }
146
147 /**
148 * internal Unicode data array
149 * @return internal Unicode data
150 */
151 const QChar *unicode() const
152 {
153 return m_text.unicode();
154 }
155
156 /**
157 * Get codec for this loader
158 * @return currently in use codec of this loader
159 */
161 {
162 return m_codec;
163 }
164
165 /**
166 * read a line, return length + offset in Unicode data
167 * @param offset offset into internal Unicode data for read line
168 * @param length length of read line
169 * @param tooLongLinesWrapped was a too long line seen?
170 * @param longestLineLoaded length of the longest line that hit the limit
171 * @return true if no encoding errors occurred
172 */
173 bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded)
174 {
175 length = 0;
176 offset = 0;
177 bool encodingError = false;
178
179 static const QLatin1Char cr(QLatin1Char('\r'));
180 static const QLatin1Char lf(QLatin1Char('\n'));
181
182 /**
183 * did we read two time but got no stuff? encoding error
184 * fixes problem with one character latin-1 files, which lead to crash otherwise!
185 * bug 272579
186 */
187 bool failedToConvertOnce = false;
188
189 /**
190 * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true
191 * BUG: 440359
192 */
193 bool bomPreviouslyFound = m_bomFound;
194
195 // honor the line length limit early
196 const auto lineLimitHandler = [this, &offset, &length, &tooLongLinesWrapped, &longestLineLoaded](int lineStart, int textLength) {
197 if ((m_lineLengthLimit <= 0) || (textLength <= m_lineLengthLimit)) {
198 return false;
199 }
200
201 // remember stick error
202 tooLongLinesWrapped = true;
203 longestLineLoaded = std::max(longestLineLoaded, textLength);
204
205 // search for place to wrap
206 int spacePosition = m_lineLengthLimit - 1;
207 for (int testPosition = m_lineLengthLimit - 1; (testPosition >= 0) && (testPosition >= (m_lineLengthLimit - (m_lineLengthLimit / 10)));
208 --testPosition) {
209 // wrap place found?
210 if (m_text[lineStart + testPosition].isSpace() || m_text[lineStart + testPosition].isPunct()) {
211 spacePosition = testPosition;
212 break;
213 }
214 }
215
216 m_lastWasEndOfLine = false;
217 m_lastWasR = false;
218
219 // line data
220 offset = lineStart;
221 length = spacePosition + 1;
222
223 m_lastLineStart = m_position = (lineStart + length);
224 return true;
225 };
226
227 /**
228 * reading loop
229 */
230 while (m_position <= m_text.length()) {
231 // handle too long lines early even if we not yet have seen the end
232 if (m_alreadyScanned > m_lastLineStart && lineLimitHandler(m_lastLineStart, m_alreadyScanned - m_lastLineStart)) {
233 return !encodingError;
234 }
235
236 if (m_position == m_text.length()) {
237 // try to load more text if something is around
238 if (!m_eof) {
239 // kill the old lines...
240 m_text.remove(0, m_lastLineStart);
241
242 // try to read new data
243 const int c = m_file->read(m_buffer.data(), m_buffer.size());
244
245 // if any text is there, append it....
246 if (c > 0) {
247 // update hash sum
248 m_digest.addData(QByteArrayView(m_buffer.data(), c));
249
250 // detect byte order marks & codec for byte order marks on first read
251 if (m_firstRead) {
252 /**
253 * if no codec given, do autodetection
254 */
255 if (!m_converterState.isValid()) {
256 /**
257 * first: try to get HTML header encoding, includes BOM handling
258 */
259 m_converterState = QStringDecoder::decoderForHtml(m_buffer);
260
261 /**
262 * else: use KEncodingProber
263 */
264 if (!m_converterState.isValid()) {
265 KEncodingProber prober(m_proberType);
266 prober.feed(m_buffer.constData(), c);
267
268 // we found codec with some confidence?
269 if (prober.confidence() > 0.5) {
270 m_converterState = QStringDecoder(prober.encoding().constData());
271 }
272 }
273
274 // no codec, no chance, encoding error, else remember the codec name
275 if (!m_converterState.isValid()) {
276 return false;
277 }
278 }
279
280 // we want to convert the bom for later detection
281 m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom);
282
283 // remember name, might have changed
284 m_codec = QString::fromUtf8(m_converterState.name());
285 }
286
287 // detect broken encoding
288 Q_ASSERT(m_converterState.isValid());
289 const QString unicode = m_converterState.decode(QByteArrayView(m_buffer.data(), c));
290 encodingError = encodingError || m_converterState.hasError();
291
292 // check and remove bom
293 if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) {
294 m_bomFound = true;
295 m_text.append(QStringView(unicode).last(unicode.size() - 1));
296
297 // swapped BOM is encoding error
298 encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped;
299 } else {
300 m_text.append(unicode);
301 }
302 m_firstRead = false;
303 }
304
305 // is file completely read ?
306 m_eof = (c == -1) || (c == 0);
307
308 // recalc current pos and last pos
309 m_position -= m_lastLineStart;
310 m_alreadyScanned = m_position - 1;
311 m_lastLineStart = 0;
312 }
313
314 // oh oh, end of file, escape !
315 if (m_eof && (m_position == m_text.length())) {
316 m_lastWasEndOfLine = false;
317
318 // line data
319 offset = m_lastLineStart;
320 length = m_position - m_lastLineStart;
321
322 m_lastLineStart = m_position;
323
324 lineLimitHandler(offset, length);
325 return !encodingError && !failedToConvertOnce;
326 }
327
328 // empty? try again
329 if (m_position == m_text.length()) {
330 if (!bomPreviouslyFound && m_bomFound) {
331 // BOM was processed above, so we didn't fail to convert
332 bomPreviouslyFound = true;
333 } else {
334 failedToConvertOnce = true;
335 }
336 continue;
337 }
338 }
339
340 for (; m_position < m_text.length(); m_position++) {
341 m_alreadyScanned = m_position;
342 QChar current_char = m_text.at(m_position);
343 if (current_char == lf) {
344 m_lastWasEndOfLine = true;
345
346 if (m_lastWasR) {
347 m_lastLineStart++;
348 m_lastWasR = false;
349 m_eol = TextBuffer::eolDos;
350 } else {
351 // line data
352 offset = m_lastLineStart;
353 length = m_position - m_lastLineStart;
354
355 m_lastLineStart = m_position + 1;
356 m_position++;
357
358 // only win, if not dos!
359 if (m_eol != TextBuffer::eolDos) {
360 m_eol = TextBuffer::eolUnix;
361 }
362
363 lineLimitHandler(offset, length);
364 return !encodingError;
365 }
366 } else if (current_char == cr) {
367 m_lastWasEndOfLine = true;
368 m_lastWasR = true;
369
370 // line data
371 offset = m_lastLineStart;
372 length = m_position - m_lastLineStart;
373
374 m_lastLineStart = m_position + 1;
375 m_position++;
376
377 // should only win of first time!
378 if (m_eol == TextBuffer::eolUnknown) {
379 m_eol = TextBuffer::eolMac;
380 }
381
382 lineLimitHandler(offset, length);
383 return !encodingError;
384 } else if (current_char == QChar::LineSeparator) {
385 m_lastWasEndOfLine = true;
386
387 // line data
388 offset = m_lastLineStart;
389 length = m_position - m_lastLineStart;
390
391 m_lastLineStart = m_position + 1;
392 m_position++;
393
394 lineLimitHandler(offset, length);
395 return !encodingError;
396 } else {
397 m_lastWasEndOfLine = false;
398 m_lastWasR = false;
399 }
400 }
401 }
402
403 return !encodingError;
404 }
405
406 QByteArray digest()
407 {
408 return m_digest.result();
409 }
410
411private:
412 QString m_codec;
413 bool m_eof;
414 bool m_lastWasEndOfLine;
415 bool m_lastWasR;
416 int m_position;
417 int m_lastLineStart;
418 int m_alreadyScanned = -1;
420 QString m_mimeType;
421 QIODevice *m_file;
422 QByteArray m_buffer;
423 QCryptographicHash m_digest;
424 QString m_text;
425 QStringDecoder m_converterState;
426 bool m_bomFound;
427 bool m_firstRead;
428 KEncodingProber::ProberType m_proberType;
429 quint64 m_fileSize;
430 const int m_lineLengthLimit;
431};
432
433}
434
435#endif
static CompressionType compressionTypeForMimeType(const QString &mimetype)
float confidence() const
ProberState feed(QByteArrayView data)
QByteArray encoding() const
Class representing a text buffer.
EndOfLineMode
End of line mode.
File Loader, will handle reading of files + detecting encoding.
const QChar * unicode() const
internal Unicode data array
QString textCodec() const
Get codec for this loader.
bool eof() const
end of file reached?
const QString & mimeTypeForFilterDev() const
mime type used to create filter dev
TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit)
Construct file loader for given file.
bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded)
read a line, return length + offset in Unicode data
bool open(const QString &codec)
open file with given codec
TextBuffer::EndOfLineMode eol() const
Detected end of line mode for this file.
~TextLoader()
Destructor.
bool byteOrderMarkFound() const
BOM found?
const char * constData() const const
char * data()
qsizetype size() const const
bool addData(QIODevice *device)
QByteArray result() const const
bool open(FILE *fh, OpenMode mode, FileHandleFlags handleFlags)
virtual qint64 size() const const override
virtual void close()
bool isOpen() const const
virtual bool open(QIODeviceBase::OpenMode mode)
QByteArray read(qint64 maxSize)
QMimeType mimeTypeForFileNameAndData(const QString &fileName, QIODevice *device) const const
QString & append(QChar ch)
QString arg(Args &&... args) const const
const QChar at(qsizetype position) const const
void clear()
QString fromUtf8(QByteArrayView str)
bool isEmpty() const const
qsizetype length() const const
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QByteArray toLatin1() const const
QByteArray toUtf8() const const
const QChar * unicode() const const
bool hasError() const const
bool isValid() const const
const char * name() const const
EncodedData< QByteArrayView > decode(QByteArrayView ba)
QStringDecoder decoderForHtml(QByteArrayView data)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri May 17 2024 11:56:21 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.