KTextEditor

katetextloader.h
1 /*
2  SPDX-FileCopyrightText: 2010 Christoph Cullmann <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #ifndef KATE_TEXTLOADER_H
8 #define KATE_TEXTLOADER_H
9 
10 #include <QCryptographicHash>
11 #include <QFile>
12 #include <QMimeDatabase>
13 #include <QString>
14 
15 // on the fly compression
16 #include <KCompressionDevice>
17 
18 namespace Kate
19 {
20 /**
21  * loader block size, load 256 kb at once per default
22  * if file size is smaller, fall back to file size
23  * must be a multiple of 2
24  */
25 static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
26 
27 /**
28  * File Loader, will handle reading of files + detecting encoding
29  */
31 {
32 public:
33  /**
34  * Construct file loader for given file.
35  * @param filename file to open
36  * @param proberType prober type
37  */
38  TextLoader(const QString &filename, KEncodingProber::ProberType proberType)
39  : m_codec(nullptr)
40  , m_eof(false) // default to not eof
41  , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline
42  , m_lastWasR(false) // we have not found a \r as last char
43  , m_position(0)
44  , m_lastLineStart(0)
45  , m_eol(TextBuffer::eolUnknown) // no eol type detected atm
46  , m_buffer(KATE_FILE_LOADER_BS, 0)
47  , m_digest(QCryptographicHash::Sha1)
48  , m_converterState(nullptr)
49  , m_bomFound(false)
50  , m_firstRead(true)
51  , m_proberType(proberType)
52  , m_fileSize(0)
53  {
54  // try to get mimetype for on the fly decompression, don't rely on filename!
55  QFile testMime(filename);
56  if (testMime.open(QIODevice::ReadOnly)) {
57  m_fileSize = testMime.size();
58  }
59  m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name();
60 
61  // construct filter device
63  m_file = new KCompressionDevice(filename, compressionType);
64  }
65 
66  /**
67  * Destructor
68  */
70  {
71  delete m_file;
72  delete m_converterState;
73  }
74 
75  /**
76  * open file with given codec
77  * @param codec codec to use, if 0, will do some auto-detect or fallback
78  * @return success
79  */
80  bool open(QTextCodec *codec)
81  {
82  m_codec = codec;
83  m_eof = false;
84  m_lastWasEndOfLine = true;
85  m_lastWasR = false;
86  m_position = 0;
87  m_lastLineStart = 0;
88  m_eol = TextBuffer::eolUnknown;
89  m_text.clear();
90  delete m_converterState;
92  m_bomFound = false;
93  m_firstRead = true;
94 
95  // init the hash with the git header
96  const QString header = QStringLiteral("blob %1").arg(m_fileSize);
97  m_digest.reset();
98  m_digest.addData(QByteArray(header.toLatin1() + '\0'));
99 
100  // if already opened, close the file...
101  if (m_file->isOpen()) {
102  m_file->close();
103  }
104 
105  return m_file->open(QIODevice::ReadOnly);
106  }
107 
108  /**
109  * end of file reached?
110  * @return end of file reached
111  */
112  bool eof() const
113  {
114  return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length());
115  }
116 
117  /**
118  * Detected end of line mode for this file.
119  * Detected during reading, is valid after complete file is read.
120  * @return eol mode of this file
121  */
123  {
124  return m_eol;
125  }
126 
127  /**
128  * BOM found?
129  * @return byte order mark found?
130  */
131  bool byteOrderMarkFound() const
132  {
133  return m_bomFound;
134  }
135 
136  /**
137  * mime type used to create filter dev
138  * @return mime-type of filter device
139  */
141  {
142  return m_mimeType;
143  }
144 
145  /**
146  * internal Unicode data array
147  * @return internal Unicode data
148  */
149  const QChar *unicode() const
150  {
151  return m_text.unicode();
152  }
153 
154  /**
155  * Get codec for this loader
156  * @return currently in use codec of this loader
157  */
159  {
160  return m_codec;
161  }
162 
163  /**
164  * read a line, return length + offset in Unicode data
165  * @param offset offset into internal Unicode data for read line
166  * @param length length of read line
167  * @return true if no encoding errors occurred
168  */
169  bool readLine(int &offset, int &length)
170  {
171  length = 0;
172  offset = 0;
173  bool encodingError = false;
174 
175  static const QLatin1Char cr(QLatin1Char('\r'));
176  static const QLatin1Char lf(QLatin1Char('\n'));
177 
178  /**
179  * did we read two time but got no stuff? encoding error
180  * fixes problem with one character latin-1 files, which lead to crash otherwise!
181  * bug 272579
182  */
183  bool failedToConvertOnce = false;
184  /**
185  * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true
186  * BUG: 440359
187  */
188  bool bomPreviouslyFound = m_bomFound;
189 
190  /**
191  * reading loop
192  */
193  while (m_position <= m_text.length()) {
194  if (m_position == m_text.length()) {
195  // try to load more text if something is around
196  if (!m_eof) {
197  // kill the old lines...
198  m_text.remove(0, m_lastLineStart);
199 
200  // try to read new data
201  const int c = m_file->read(m_buffer.data(), m_buffer.size());
202 
203  // if any text is there, append it....
204  if (c > 0) {
205  // update hash sum
206  m_digest.addData(m_buffer.data(), c);
207 
208  // detect byte order marks & codec for byte order marks on first read
209  int bomBytes = 0;
210  if (m_firstRead) {
211  // use first 16 bytes max to allow BOM detection of codec
212  QByteArray bom(m_buffer.data(), qMin(16, c));
213  QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText(bom, nullptr);
214 
215  // if codecForByteOrderMark != null, we found a BOM!
216  // BUT we only capture BOM if no codec was set, or the BOM encodes the same codec as m_codec.
217  // These additional checks are necessary so that the (coincidentally matching) BOM characters won't be eaten for non-UTF encodings
218  // TODO: support BOMs for other encodings? (see e.g. https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding)
219  if (codecForByteOrderMark && (!m_codec || codecForByteOrderMark->mibEnum() == m_codec->mibEnum())) {
220  m_bomFound = true;
221 
222  // eat away the different boms!
223  const int mib = codecForByteOrderMark->mibEnum();
224  if (mib == 106) { // utf8
225  bomBytes = 3;
226  } else if (mib == 1013 || mib == 1014 || mib == 1015) { // utf16
227  bomBytes = 2;
228  } else if (mib == 1017 || mib == 1018 || mib == 1019) { // utf32
229  bomBytes = 4;
230  }
231  }
232 
233  /**
234  * if no codec given, do autodetection
235  */
236  if (!m_codec) {
237  /**
238  * byte order said something about encoding?
239  */
240  if (codecForByteOrderMark) {
241  m_codec = codecForByteOrderMark;
242  } else {
243  /**
244  * no Unicode BOM found, trigger prober
245  */
246 
247  /**
248  * first: try to get HTML header encoding
249  */
250  if (QTextCodec *codecForHtml = QTextCodec::codecForHtml(m_buffer, nullptr)) {
251  m_codec = codecForHtml;
252  }
253 
254  /**
255  * else: use KEncodingProber
256  */
257  else {
258  KEncodingProber prober(m_proberType);
259  prober.feed(m_buffer.constData(), c);
260 
261  // we found codec with some confidence?
262  if (prober.confidence() > 0.5) {
263  m_codec = QTextCodec::codecForName(prober.encoding());
264  }
265  }
266 
267  // no codec, no chance, encoding error
268  if (!m_codec) {
269  return false;
270  }
271  }
272  }
273 
274  m_firstRead = false;
275  }
276 
277  // detect broken encoding, we did before use QTextCodec::ConvertInvalidToNull and check for 0 chars
278  // this lead to issues with files containing 0 chars, therefore use the invalidChars field of the state
279  Q_ASSERT(m_codec);
280  QString unicode = m_codec->toUnicode(m_buffer.constData() + bomBytes, c - bomBytes, m_converterState);
281  encodingError = encodingError || m_converterState->invalidChars;
282  m_text.append(unicode);
283  }
284 
285  // is file completely read ?
286  m_eof = (c == -1) || (c == 0);
287 
288  // recalc current pos and last pos
289  m_position -= m_lastLineStart;
290  m_lastLineStart = 0;
291  }
292 
293  // oh oh, end of file, escape !
294  if (m_eof && (m_position == m_text.length())) {
295  m_lastWasEndOfLine = false;
296 
297  // line data
298  offset = m_lastLineStart;
299  length = m_position - m_lastLineStart;
300 
301  m_lastLineStart = m_position;
302 
303  return !encodingError && !failedToConvertOnce;
304  }
305 
306  // empty? try again
307  if (m_position == m_text.length()) {
308  if (!bomPreviouslyFound && m_bomFound) {
309  // BOM was processed above, so we didn't fail to convert
310  bomPreviouslyFound = true;
311  } else {
312  failedToConvertOnce = true;
313  }
314  continue;
315  }
316  }
317 
318  QChar current_char = m_text.at(m_position);
319  if (current_char == lf) {
320  m_lastWasEndOfLine = true;
321 
322  if (m_lastWasR) {
323  m_lastLineStart++;
324  m_lastWasR = false;
325  m_eol = TextBuffer::eolDos;
326  } else {
327  // line data
328  offset = m_lastLineStart;
329  length = m_position - m_lastLineStart;
330 
331  m_lastLineStart = m_position + 1;
332  m_position++;
333 
334  // only win, if not dos!
335  if (m_eol != TextBuffer::eolDos) {
336  m_eol = TextBuffer::eolUnix;
337  }
338 
339  return !encodingError;
340  }
341  } else if (current_char == cr) {
342  m_lastWasEndOfLine = true;
343  m_lastWasR = true;
344 
345  // line data
346  offset = m_lastLineStart;
347  length = m_position - m_lastLineStart;
348 
349  m_lastLineStart = m_position + 1;
350  m_position++;
351 
352  // should only win of first time!
353  if (m_eol == TextBuffer::eolUnknown) {
354  m_eol = TextBuffer::eolMac;
355  }
356 
357  return !encodingError;
358  } else if (current_char == QChar::LineSeparator) {
359  m_lastWasEndOfLine = true;
360 
361  // line data
362  offset = m_lastLineStart;
363  length = m_position - m_lastLineStart;
364 
365  m_lastLineStart = m_position + 1;
366  m_position++;
367 
368  return !encodingError;
369  } else {
370  m_lastWasEndOfLine = false;
371  m_lastWasR = false;
372  }
373 
374  m_position++;
375  }
376 
377  return !encodingError;
378  }
379 
380  QByteArray digest()
381  {
382  return m_digest.result();
383  }
384 
385 private:
386  QTextCodec *m_codec;
387  bool m_eof;
388  bool m_lastWasEndOfLine;
389  bool m_lastWasR;
390  int m_position;
391  int m_lastLineStart;
393  QString m_mimeType;
394  QIODevice *m_file;
395  QByteArray m_buffer;
396  QCryptographicHash m_digest;
397  QString m_text;
398  QTextCodec::ConverterState *m_converterState;
399  bool m_bomFound;
400  bool m_firstRead;
401  KEncodingProber::ProberType m_proberType;
402  quint64 m_fileSize;
403 };
404 
405 }
406 
407 #endif
TextBuffer::EndOfLineMode eol() const
Detected end of line mode for this file.
bool open(QTextCodec *codec)
open file with given codec
void addData(const char *data, int length)
File Loader, will handle reading of files + detecting encoding.
virtual bool open(QIODevice::OpenMode mode) override
Class representing a text buffer.
void clear()
virtual bool open(QIODevice::OpenMode mode)
bool eof() const
end of file reached?
QByteArray result() const const
QByteArray toLatin1() const const
const QChar * unicode() const const
const QString & mimeTypeForFilterDev() const
mime type used to create filter dev
virtual int mibEnum() const const=0
EndOfLineMode
End of line mode.
ProberState feed(const QByteArray &data)
bool readLine(int &offset, int &length)
read a line, return length + offset in Unicode data
QByteArray encoding() const
static CompressionType compressionTypeForMimeType(const QString &mimetype)
bool isOpen() const const
int length() const const
QTextCodec * codecForName(const QByteArray &name)
~TextLoader()
Destructor.
QTextCodec * codecForUtfText(const QByteArray &ba)
float confidence() const
bool byteOrderMarkFound() const
BOM found?
QString & remove(int position, int n)
virtual qint64 size() const const override
qint64 read(char *data, qint64 maxSize)
QMimeType mimeTypeForFileNameAndData(const QString &fileName, QIODevice *device) const const
QTextCodec * textCodec() const
Get codec for this loader.
const char * constData() const const
QString arg(qlonglong a, int fieldWidth, int base, QChar fillChar) const const
QTextCodec * codecForHtml(const QByteArray &ba)
const QChar at(int position) const const
int size() const const
const QChar * unicode() const
internal Unicode data array
virtual void close()
QString toUnicode(const QByteArray &a) const const
QString & append(QChar ch)
TextLoader(const QString &filename, KEncodingProber::ProberType proberType)
Construct file loader for given file.
char * data()
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Sun Mar 26 2023 03:55:36 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.