KTextEditor

katetextloader.h
1/*
2 SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#ifndef KATE_TEXTLOADER_H
8#define KATE_TEXTLOADER_H
9
10#include <QCryptographicHash>
11#include <QFile>
12#include <QMimeDatabase>
13#include <QString>
14#include <QStringDecoder>
15
16#include <KCompressionDevice>
17#include <KEncodingProber>
18
19#include "katetextbuffer.h"
20
21namespace Kate
22{
23/**
24 * loader block size, load 256 kb at once per default
25 * if file size is smaller, fall back to file size
26 * must be a multiple of 2
27 */
28static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
29
30/**
31 * File Loader, will handle reading of files + detecting encoding
32 */
34{
35public:
36 /**
37 * Construct file loader for given file.
38 * @param filename file to open
39 * @param proberType prober type
40 * @param lineLengthLimit limit for lines to load, else we break them up in smaller ones
41 */
42 TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit)
43 : m_eof(false) // default to not eof
44 , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline
45 , m_lastWasR(false) // we have not found a \r as last char
46 , m_position(0)
47 , m_lastLineStart(0)
48 , m_eol(TextBuffer::eolUnknown) // no eol type detected atm
49 , m_buffer(KATE_FILE_LOADER_BS, 0)
50 , m_digest(QCryptographicHash::Sha1)
51 , m_bomFound(false)
52 , m_firstRead(true)
53 , m_proberType(proberType)
54 , m_fileSize(0)
55 , m_lineLengthLimit(lineLengthLimit)
56 {
57 // try to get mimetype for on the fly decompression, don't rely on filename!
58 QFile testMime(filename);
59 if (testMime.open(QIODevice::ReadOnly)) {
60 m_fileSize = testMime.size();
61 }
62 m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name();
63
64 // construct filter device
66 m_file = new KCompressionDevice(filename, compressionType);
67 }
68
69 /**
70 * Destructor
71 */
73 {
74 delete m_file;
75 }
76
77 /**
78 * open file with given codec
79 * @param codec codec to use, if 0, will do some auto-detect or fallback
80 * @return success
81 */
82 bool open(const QString &codec)
83 {
84 m_codec = codec;
85 m_eof = false;
86 m_lastWasEndOfLine = true;
87 m_lastWasR = false;
88 m_position = 0;
89 m_lastLineStart = 0;
90 m_alreadyScanned = -1;
91 m_eol = TextBuffer::eolUnknown;
92 m_text.clear();
93 m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData());
94 m_bomFound = false;
95 m_firstRead = true;
96
97 // init the hash with the git header
98 const QString header = QStringLiteral("blob %1").arg(m_fileSize);
99 m_digest.reset();
100 m_digest.addData(QByteArray(header.toLatin1() + '\0'));
101
102 // if already opened, close the file...
103 if (m_file->isOpen()) {
104 m_file->close();
105 }
106
107 return m_file->open(QIODevice::ReadOnly);
108 }
109
110 /**
111 * end of file reached?
112 * @return end of file reached
113 */
114 bool eof() const
115 {
116 return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length());
117 }
118
119 /**
120 * Detected end of line mode for this file.
121 * Detected during reading, is valid after complete file is read.
122 * @return eol mode of this file
123 */
125 {
126 return m_eol;
127 }
128
129 /**
130 * BOM found?
131 * @return byte order mark found?
132 */
134 {
135 return m_bomFound;
136 }
137
138 /**
139 * mime type used to create filter dev
140 * @return mime-type of filter device
141 */
143 {
144 return m_mimeType;
145 }
146
147 /**
148 * internal Unicode data array
149 * @return internal Unicode data
150 */
151 const QChar *unicode() const
152 {
153 return m_text.unicode();
154 }
155
156 /**
157 * Get codec for this loader
158 * @return currently in use codec of this loader
159 */
161 {
162 return m_codec;
163 }
164
165 /**
166 * read a line, return length + offset in Unicode data
167 * @param offset offset into internal Unicode data for read line
168 * @param length length of read line
169 * @param tooLongLinesWrapped was a too long line seen?
170 * @param longestLineLoaded length of the longest line that hit the limit
171 * @return true if no encoding errors occurred
172 */
173 bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded)
174 {
175 length = 0;
176 offset = 0;
177 bool encodingError = false;
178
179 static const QLatin1Char cr(QLatin1Char('\r'));
180 static const QLatin1Char lf(QLatin1Char('\n'));
181
182 /**
183 * did we read two time but got no stuff? encoding error
184 * fixes problem with one character latin-1 files, which lead to crash otherwise!
185 * bug 272579
186 */
187 bool failedToConvertOnce = false;
188
189 /**
190 * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true
191 * BUG: 440359
192 */
193 bool bomPreviouslyFound = m_bomFound;
194
195 // honor the line length limit early
196 const auto lineLimitHandler = [this, &offset, &length, &tooLongLinesWrapped, &longestLineLoaded](int lineStart, int textLength) {
197 if ((m_lineLengthLimit <= 0) || (textLength <= m_lineLengthLimit)) {
198 return false;
199 }
200
201 // remember stick error
202 tooLongLinesWrapped = true;
203 longestLineLoaded = std::max(longestLineLoaded, textLength);
204
205 // search for place to wrap
206 int spacePosition = m_lineLengthLimit - 1;
207 for (int testPosition = m_lineLengthLimit - 1; (testPosition >= 0) && (testPosition >= (m_lineLengthLimit - (m_lineLengthLimit / 10)));
208 --testPosition) {
209 // wrap place found?
210 if (m_text[lineStart + testPosition].isSpace() || m_text[lineStart + testPosition].isPunct()) {
211 spacePosition = testPosition;
212 break;
213 }
214 }
215
216 m_lastWasEndOfLine = false;
217 m_lastWasR = false;
218
219 // line data
220 offset = lineStart;
221 length = spacePosition + 1;
222
223 m_lastLineStart = m_position = (lineStart + length);
224 return true;
225 };
226
227 /**
228 * reading loop
229 */
230 while (m_position <= m_text.length()) {
231 // handle too long lines early even if we not yet have seen the end
232 if (m_alreadyScanned > m_lastLineStart && lineLimitHandler(m_lastLineStart, m_alreadyScanned - m_lastLineStart)) {
233 return !encodingError;
234 }
235
236 if (m_position == m_text.length()) {
237 // try to load more text if something is around
238 if (!m_eof) {
239 // kill the old lines...
240 m_text.remove(0, m_lastLineStart);
241
242 // try to read new data
243 const int c = m_file->read(m_buffer.data(), m_buffer.size());
244
245 // if any text is there, append it....
246 if (c > 0) {
247 // update hash sum
248 m_digest.addData(QByteArrayView(m_buffer.data(), c));
249
250 // detect byte order marks & codec for byte order marks on first read
251 if (m_firstRead) {
252 // if no codec given, do autodetection
253 if (!m_converterState.isValid()) {
254 // use KEncodingProber first, QStringDecoder::decoderForHtml does fallback to UTF-8
255 KEncodingProber prober(m_proberType);
256 prober.feed(QByteArrayView(m_buffer.data(), c));
257
258 // we found a codec with some confidence?
259 if (const QStringDecoder decoder(prober.encoding().constData()); decoder.isValid() && (prober.confidence() > 0.5)) {
260 m_converterState = QStringDecoder(prober.encoding().constData());
261 } else {
262 // try to get HTML encoding, will default to UTF-8
263 // see https://doc.qt.io/qt-6/qstringdecoder.html#decoderForHtml
264 m_converterState = QStringDecoder::decoderForHtml(m_buffer);
265 }
266
267 // no codec, no chance, encoding error, else remember the codec name
268 if (!m_converterState.isValid()) {
269 return false;
270 }
271 }
272
273 // we want to convert the bom for later detection
274 m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom);
275
276 // remember name, might have changed
277 m_codec = QString::fromUtf8(m_converterState.name());
278 }
279
280 // detect broken encoding
281 Q_ASSERT(m_converterState.isValid());
282 const QString unicode = m_converterState.decode(QByteArrayView(m_buffer.data(), c));
283 encodingError = encodingError || m_converterState.hasError();
284
285 // check and remove bom
286 if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) {
287 m_bomFound = true;
288 m_text.append(QStringView(unicode).last(unicode.size() - 1));
289
290 // swapped BOM is encoding error
291 encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped;
292 } else {
293 m_text.append(unicode);
294 }
295 m_firstRead = false;
296 }
297
298 // is file completely read ?
299 m_eof = (c == -1) || (c == 0);
300
301 // recalc current pos and last pos
302 m_position -= m_lastLineStart;
303 m_alreadyScanned = m_position - 1;
304 m_lastLineStart = 0;
305 }
306
307 // oh oh, end of file, escape !
308 if (m_eof && (m_position == m_text.length())) {
309 m_lastWasEndOfLine = false;
310
311 // line data
312 offset = m_lastLineStart;
313 length = m_position - m_lastLineStart;
314
315 m_lastLineStart = m_position;
316
317 lineLimitHandler(offset, length);
318 return !encodingError && !failedToConvertOnce;
319 }
320
321 // empty? try again
322 if (m_position == m_text.length()) {
323 if (!bomPreviouslyFound && m_bomFound) {
324 // BOM was processed above, so we didn't fail to convert
325 bomPreviouslyFound = true;
326 } else {
327 failedToConvertOnce = true;
328 }
329 continue;
330 }
331 }
332
333 for (; m_position < m_text.length(); m_position++) {
334 m_alreadyScanned = m_position;
335 QChar current_char = m_text.at(m_position);
336 if (current_char == lf) {
337 m_lastWasEndOfLine = true;
338
339 if (m_lastWasR) {
340 m_lastLineStart++;
341 m_lastWasR = false;
342 m_eol = TextBuffer::eolDos;
343 } else {
344 // line data
345 offset = m_lastLineStart;
346 length = m_position - m_lastLineStart;
347
348 m_lastLineStart = m_position + 1;
349 m_position++;
350
351 // only win, if not dos!
352 if (m_eol != TextBuffer::eolDos) {
353 m_eol = TextBuffer::eolUnix;
354 }
355
356 lineLimitHandler(offset, length);
357 return !encodingError;
358 }
359 } else if (current_char == cr) {
360 m_lastWasEndOfLine = true;
361 m_lastWasR = true;
362
363 // line data
364 offset = m_lastLineStart;
365 length = m_position - m_lastLineStart;
366
367 m_lastLineStart = m_position + 1;
368 m_position++;
369
370 // should only win of first time!
371 if (m_eol == TextBuffer::eolUnknown) {
372 m_eol = TextBuffer::eolMac;
373 }
374
375 lineLimitHandler(offset, length);
376 return !encodingError;
377 } else if (current_char == QChar::LineSeparator) {
378 m_lastWasEndOfLine = true;
379
380 // line data
381 offset = m_lastLineStart;
382 length = m_position - m_lastLineStart;
383
384 m_lastLineStart = m_position + 1;
385 m_position++;
386
387 lineLimitHandler(offset, length);
388 return !encodingError;
389 } else {
390 m_lastWasEndOfLine = false;
391 m_lastWasR = false;
392 }
393 }
394 }
395
396 return !encodingError;
397 }
398
399 QByteArray digest()
400 {
401 return m_digest.result();
402 }
403
404private:
405 QString m_codec;
406 bool m_eof;
407 bool m_lastWasEndOfLine;
408 bool m_lastWasR;
409 int m_position;
410 int m_lastLineStart;
411 int m_alreadyScanned = -1;
413 QString m_mimeType;
414 QIODevice *m_file;
415 QByteArray m_buffer;
416 QCryptographicHash m_digest;
417 QString m_text;
418 QStringDecoder m_converterState;
419 bool m_bomFound;
420 bool m_firstRead;
421 KEncodingProber::ProberType m_proberType;
422 quint64 m_fileSize;
423 const int m_lineLengthLimit;
424};
425
426}
427
428#endif
static CompressionType compressionTypeForMimeType(const QString &mimetype)
float confidence() const
ProberState feed(QByteArrayView data)
QByteArray encoding() const
Class representing a text buffer.
EndOfLineMode
End of line mode.
File Loader, will handle reading of files + detecting encoding.
const QChar * unicode() const
internal Unicode data array
QString textCodec() const
Get codec for this loader.
bool eof() const
end of file reached?
const QString & mimeTypeForFilterDev() const
mime type used to create filter dev
TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit)
Construct file loader for given file.
bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded)
read a line, return length + offset in Unicode data
bool open(const QString &codec)
open file with given codec
TextBuffer::EndOfLineMode eol() const
Detected end of line mode for this file.
~TextLoader()
Destructor.
bool byteOrderMarkFound() const
BOM found?
const char * constData() const const
char * data()
qsizetype size() const const
bool addData(QIODevice *device)
QByteArray result() const const
bool open(FILE *fh, OpenMode mode, FileHandleFlags handleFlags)
virtual qint64 size() const const override
virtual void close()
bool isOpen() const const
virtual bool open(QIODeviceBase::OpenMode mode)
QByteArray read(qint64 maxSize)
QMimeType mimeTypeForFileNameAndData(const QString &fileName, QIODevice *device) const const
QString & append(QChar ch)
QString arg(Args &&... args) const const
const QChar at(qsizetype position) const const
void clear()
QString fromUtf8(QByteArrayView str)
bool isEmpty() const const
qsizetype length() const const
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QByteArray toLatin1() const const
QByteArray toUtf8() const const
const QChar * unicode() const const
bool hasError() const const
bool isValid() const const
const char * name() const const
EncodedData< QByteArrayView > decode(QByteArrayView ba)
QStringDecoder decoderForHtml(QByteArrayView data)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri Oct 11 2024 12:17:27 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.