KHtml

kencodingdetector.cpp
1 /*
2  This file is part of the KDE libraries
3 
4  Copyright (C) 1999 Lars Knoll ([email protected])
5  Copyright (C) 2003 Dirk Mueller ([email protected])
6  Copyright (C) 2003 Apple Computer, Inc.
7  Copyright (C) 2007 Nick Shaforostoff ([email protected])
8 
9  This library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU Library General Public
11  License as published by the Free Software Foundation; either
12  version 2 of the License, or (at your option) any later version.
13 
14  This library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  Library General Public License for more details.
18 
19  You should have received a copy of the GNU Library General Public License
20  along with this library; see the file COPYING.LIB. If not, write to
21  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22  Boston, MA 02110-1301, USA.
23 */
24 //----------------------------------------------------------------------------
25 //
26 // decoder for input stream
27 
28 #include "kencodingdetector.h"
29 
30 #undef DECODE_DEBUG
31 //#define DECODE_DEBUG
32 
33 #define MAX_BUFFER 16*1024
34 
35 #include <assert.h>
36 
37 #include "guess_ja_p.h"
38 
39 #include "khtml_debug.h"
40 #include <QRegExp>
41 #include <QTextCodec>
42 
43 #include "kcharsets.h"
44 #include <klocalizedstring.h>
45 
46 #include <ctype.h>
47 
// MIB numbers of the character sets this file treats specially; they are
// compared against QTextCodec::mibEnum(). Listed in ascending numeric order.
enum MIB {
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
57 
58 static bool is16Bit(QTextCodec *codec)
59 {
60  switch (codec->mibEnum()) {
61  case MibUtf16:
62  case MibUtf16BE:
63  case MibUtf16LE:
64  case MibUcs2:
65  return true;
66  default:
67  return false;
68  }
69 }
70 
71 class KEncodingDetectorPrivate
72 {
73 public:
74  QTextCodec *m_codec;
75  QTextDecoder *m_decoder; // utf16
76  QTextCodec *m_defaultCodec;
77  QByteArray m_storeDecoderName;
78 
79  KEncodingDetector::EncodingChoiceSource m_source;
80  KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
81 
82  bool m_visualRTL : 1;
83  bool m_seenBody : 1;
84  bool m_writtingHappened : 1;
85  bool m_analyzeCalled : 1; //for decode()
86  int m_multiByte;
87 
88  QByteArray m_bufferForDefferedEncDetection;
89 
90  KEncodingDetectorPrivate()
91  : m_codec(QTextCodec::codecForMib(MibLatin1))
92  , m_decoder(m_codec->makeDecoder())
93  , m_defaultCodec(m_codec)
94  , m_source(KEncodingDetector::DefaultEncoding)
95  , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
96  , m_visualRTL(false)
97  , m_seenBody(false)
98  , m_writtingHappened(false)
99  , m_analyzeCalled(false)
100  , m_multiByte(0)
101  {
102  }
103 
104  KEncodingDetectorPrivate(QTextCodec *codec, KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
105  : m_codec(codec)
106  , m_decoder(m_codec->makeDecoder())
107  , m_defaultCodec(m_codec)
108  , m_source(source)
109  , m_autoDetectLanguage(script)
110  , m_visualRTL(false)
111  , m_seenBody(false)
112  , m_writtingHappened(false)
113  , m_analyzeCalled(false)
114  , m_multiByte(0)
115  {
116  }
117 
118  ~KEncodingDetectorPrivate()
119  {
120  delete m_decoder;
121  }
122 
123  // Returns true if the encoding was explicitly specified someplace.
124  bool isExplicitlySpecifiedEncoding()
125  {
126  return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
127  }
128 };
129 
130 static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
131 {
132  for (int i = 0; i < size; ++i) {
133  if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
134  || (ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB) || (ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA)
135  || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
136  || (ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF) || (ptr[ i ] >= 0xF3)) {
137  return "cp1256";
138  }
139  }
140 
141  return "iso-8859-6";
142 }
143 
144 static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
145 {
146  for (int i = 0; i < size; ++i) {
147  if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E)) {
148  return "cp1257";
149  }
150 
151  if (ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5) {
152  return "iso-8859-13";
153  }
154  }
155 
156  return "iso-8859-13";
157 }
158 
159 static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
160 {
161  QByteArray charset = QByteArray();
162  for (int i = 0; i < size; ++i) {
163  if (ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) {
164  if (ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98) {
165  return "ibm852";
166  }
167 
168  if (i + 1 > size) {
169  return "cp1250";
170  } else { // maybe ibm852 ?
171  charset = "cp1250";
172  continue;
173  }
174  }
175  if (ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0) {
176  if (i + 1 > size) {
177  return "iso-8859-2";
178  } else { // maybe ibm852 ?
179  if (charset.isNull()) {
180  charset = "iso-8859-2";
181  }
182  continue;
183  }
184  }
185  }
186 
187  if (charset.isNull()) {
188  charset = "iso-8859-3";
189  }
190 
191  return charset.data();
192 }
193 
194 static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
195 {
196 #ifdef DECODE_DEBUG
197  qCWarning(KHTML_LOG) << "KEncodingDetector: Cyr heuristics";
198 #endif
199 
200 // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
201 // return "utf8";
202  int utf8_mark = 0;
203  int koi_score = 0;
204  int cp1251_score = 0;
205 
206  int koi_st = 0;
207  int cp1251_st = 0;
208 
209 // int koi_na=0;
210 // int cp1251_na=0;
211 
212  int koi_o_capital = 0;
213  int koi_o = 0;
214  int cp1251_o_capital = 0;
215  int cp1251_o = 0;
216 
217  int koi_a_capital = 0;
218  int koi_a = 0;
219  int cp1251_a_capital = 0;
220  int cp1251_a = 0;
221 
222  int koi_s_capital = 0;
223  int koi_s = 0;
224  int cp1251_s_capital = 0;
225  int cp1251_s = 0;
226 
227  int koi_i_capital = 0;
228  int koi_i = 0;
229  int cp1251_i_capital = 0;
230  int cp1251_i = 0;
231 
232  int cp1251_small_range = 0;
233  int koi_small_range = 0;
234  int ibm866_small_range = 0;
235 
236  int i;
237  for (i = 1; (i < size) && (cp1251_small_range + koi_small_range < 1000); ++i) {
238  if (ptr[i] > 0xdf) {
239  ++cp1251_small_range;
240 
241  if (ptr[i] == 0xee) { //small o
242  ++cp1251_o;
243  } else if (ptr[i] == 0xe0) { //small a
244  ++cp1251_a;
245  } else if (ptr[i] == 0xe8) { //small i
246  ++cp1251_i;
247  } else if (ptr[i] == 0xf1) { //small s
248  ++cp1251_s;
249  } else if (ptr[i] == 0xf2 && ptr[i - 1] == 0xf1) { //small st
250  ++cp1251_st;
251  }
252 
253  else if (ptr[i] == 0xef) {
254  ++koi_o_capital;
255  } else if (ptr[i] == 0xe1) {
256  ++koi_a_capital;
257  } else if (ptr[i] == 0xe9) {
258  ++koi_i_capital;
259  } else if (ptr[i] == 0xf3) {
260  ++koi_s_capital;
261  }
262 
263  } else if (ptr[i] > 0xbf) {
264  ++koi_small_range;
265 
266  if (ptr[i] == 0xd0 || ptr[i] == 0xd1) { //small o
267  ++utf8_mark;
268  } else if (ptr[i] == 0xcf) { //small o
269  ++koi_o;
270  } else if (ptr[i] == 0xc1) { //small a
271  ++koi_a;
272  } else if (ptr[i] == 0xc9) { //small i
273  ++koi_i;
274  } else if (ptr[i] == 0xd3) { //small s
275  ++koi_s;
276  } else if (ptr[i] == 0xd4 && ptr[i - 1] == 0xd3) { //small st
277  ++koi_st;
278  }
279 
280  else if (ptr[i] == 0xce) {
281  ++cp1251_o_capital;
282  } else if (ptr[i] == 0xc0) {
283  ++cp1251_a_capital;
284  } else if (ptr[i] == 0xc8) {
285  ++cp1251_i_capital;
286  } else if (ptr[i] == 0xd1) {
287  ++cp1251_s_capital;
288  }
289  } else if (ptr[i] > 0x9f && ptr[i] < 0xb0) { //first 16 letterz is 60%
290  ++ibm866_small_range;
291  }
292 
293  }
294 
295  //cannot decide?
296  if (cp1251_small_range + koi_small_range + ibm866_small_range < 8) {
297  return "";
298  }
299 
300  if (3 * utf8_mark > cp1251_small_range + koi_small_range + ibm866_small_range) {
301 #ifdef DECODE_DEBUG
302  qCWarning(KHTML_LOG) << "Cyr Enc Detection: UTF8";
303 #endif
304  return "UTF-8";
305  }
306 
307  if (ibm866_small_range > cp1251_small_range + koi_small_range) {
308  return "ibm866";
309  }
310 
311 // QByteArray koi_string = "koi8-u";
312 // QByteArray cp1251_string = "cp1251";
313 
314  if (cp1251_st == 0 && koi_st > 1) {
315  koi_score += 10;
316  } else if (koi_st == 0 && cp1251_st > 1) {
317  cp1251_score += 10;
318  }
319 
320  if (cp1251_st && koi_st) {
321  if (cp1251_st / koi_st > 2) {
322  cp1251_score += 20;
323  } else if (koi_st / cp1251_st > 2) {
324  koi_score += 20;
325  }
326  }
327 
328  if (cp1251_a > koi_a) {
329  cp1251_score += 10;
330  } else if (cp1251_a || koi_a) {
331  koi_score += 10;
332  }
333 
334  if (cp1251_o > koi_o) {
335  cp1251_score += 10;
336  } else if (cp1251_o || koi_o) {
337  koi_score += 10;
338  }
339 
340  if (cp1251_i > koi_i) {
341  cp1251_score += 10;
342  } else if (cp1251_i || koi_i) {
343  koi_score += 10;
344  }
345 
346  if (cp1251_s > koi_s) {
347  cp1251_score += 10;
348  } else if (cp1251_s || koi_s) {
349  koi_score += 10;
350  }
351 
352  if (cp1251_a_capital > koi_a_capital) {
353  cp1251_score += 9;
354  } else if (cp1251_a_capital || koi_a_capital) {
355  koi_score += 9;
356  }
357 
358  if (cp1251_o_capital > koi_o_capital) {
359  cp1251_score += 9;
360  } else if (cp1251_o_capital || koi_o_capital) {
361  koi_score += 9;
362  }
363 
364  if (cp1251_i_capital > koi_i_capital) {
365  cp1251_score += 9;
366  } else if (cp1251_i_capital || koi_i_capital) {
367  koi_score += 9;
368  }
369 
370  if (cp1251_s_capital > koi_s_capital) {
371  cp1251_score += 9;
372  } else if (cp1251_s_capital || koi_s_capital) {
373  koi_score += 9;
374  }
375 #ifdef DECODE_DEBUG
376  qCWarning(KHTML_LOG) << "koi_score " << koi_score << " cp1251_score " << cp1251_score;
377 #endif
378  if (abs(koi_score - cp1251_score) < 10) {
379  //fallback...
380  cp1251_score = cp1251_small_range;
381  koi_score = koi_small_range;
382  }
383  if (cp1251_score > koi_score) {
384  return "cp1251";
385  } else {
386  return "koi8-u";
387  }
388 
389 // if (cp1251_score>koi_score)
390 // setEncoding("cp1251",AutoDetectedEncoding);
391 // else
392 // setEncoding("koi8-u",AutoDetectedEncoding);
393 // return true;
394 
395 }
396 
397 static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
398 {
399  for (int i = 0; i < size; ++i) {
400  if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
401  || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
402  || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE) {
403  return "cp1253";
404  }
405  }
406 
407  return "iso-8859-7";
408 }
409 
410 static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
411 {
412  for (int i = 0; i < size; ++i) {
413  if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89) || ptr[ i ] == 0x8B
414  || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || (ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9)
415  || (ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8)) {
416  return "cp1255";
417  }
418 
419  if (ptr[ i ] == 0xDF) {
420  return "iso-8859-8-i";
421  }
422  }
423 
424  return "iso-8859-8-i";
425 }
426 
427 static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
428 {
429  JapaneseCode kc;
430 
431  switch (kc.guess_jp((const char *)ptr, size)) {
432  case JapaneseCode::JIS:
433  return "jis7";
434  case JapaneseCode::EUC:
435  return "eucjp";
436  case JapaneseCode::SJIS:
437  return "sjis";
438  case JapaneseCode::UTF8:
439  return "utf8";
440  default:
441  break;
442  }
443 
444  return "";
445 }
446 
447 static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
448 {
449  for (int i = 0; i < size; ++i) {
450  if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C) || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C) || ptr[ i ] == 0x9F) {
451  return "cp1254";
452  }
453  }
454 
455  return "iso-8859-9";
456 }
457 
458 static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
459 {
460  --size;
461  uint nonansi_count = 0;
462  for (int i = 0; i < size; ++i) {
463  if (ptr[i] > 0x79) {
464  ++nonansi_count;
465  if (ptr[i] > 0xc1 && ptr[i] < 0xf0 && ptr[i + 1] > 0x7f && ptr[i + 1] < 0xc0) {
466  return "UTF-8";
467  }
468  if (ptr[i] >= 0x78 && ptr[i] <= 0x9F) {
469  return "cp1252";
470  }
471  }
472 
473  }
474 
475  if (nonansi_count > 0) {
476  return "iso-8859-15";
477  }
478 
479  return "";
480 }
481 
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry @p ptr points just behind the opening "<!--". On return it points
// just behind the comment terminator, or at @p pEnd when the comment is not
// closed inside [ptr, pEnd). Accepted terminators:
//   ">"     directly at @p ptr (i.e. "<!-->"),
//   "-->"   the real end of comment,
//   "--!>"  the incorrect end of comment that other browsers allow.
//
// No byte at or beyond @p pEnd is read, and @p ptr is never moved past
// @p pEnd (the previous version peeked up to three bytes ahead without
// checking the end of the buffer).
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    // Allow <!-->; other browsers do.
    if (p < pEnd && *p == '>') {
        ++p;
    } else {
        while (p < pEnd) {
            if (*p == '-') {
                const auto remaining = pEnd - p; // bytes available starting at p
                if (remaining >= 3 && p[1] == '-' && p[2] == '>') {
                    p += 3;
                    break;
                }
                if (remaining >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>') {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }

    ptr = p;
}
509 
510 // Returns the position of the encoding string.
511 static int findXMLEncoding(const QByteArray &str, int &encodingLength)
512 {
513  int len = str.length();
514  int pos = str.indexOf("encoding");
515  if (pos == -1) {
516  return -1;
517  }
518  pos += 8;
519 
520  // Skip spaces and stray control characters.
521  while (pos < len && str[pos] <= ' ') {
522  ++pos;
523  }
524 
525  //Bail out if nothing after
526  // Skip equals sign.
527  if (pos >= len || str[pos] != '=') {
528  return -1;
529  }
530  ++pos;
531 
532  // Skip spaces and stray control characters.
533  while (pos < len && str[pos] <= ' ') {
534  ++pos;
535  }
536 
537  //Bail out if nothing after
538  if (pos >= len) {
539  return -1;
540  }
541 
542  // Skip quotation mark.
543  char quoteMark = str[pos];
544  if (quoteMark != '"' && quoteMark != '\'') {
545  return -1;
546  }
547  ++pos;
548 
549  // Find the trailing quotation mark.
550  int end = pos;
551  while (end < len && str[end] != quoteMark) {
552  ++end;
553  }
554 
555  if (end >= len) {
556  return -1;
557  }
558 
559  encodingLength = end - pos;
560  return pos;
561 }
562 
563 bool KEncodingDetector::processNull(char *data, int len)
564 {
565  bool bin = false;
566  if (is16Bit(d->m_codec)) {
567  for (int i = 1; i < len; i += 2) {
568  if ((data[i] == '\0') && (data[i - 1] == '\0')) {
569  bin = true;
570  data[i] = ' ';
571  }
572  }
573  return bin;
574  }
575  // replace '\0' by spaces, for buggy pages
576  int i = len - 1;
577  while (--i >= 0) {
578  if (data[i] == 0) {
579  bin = true;
580  data[i] = ' ';
581  }
582  }
583  return bin;
584 }
585 
586 bool KEncodingDetector::errorsIfUtf8(const char *data, int length)
587 {
588  if (d->m_codec->mibEnum() != MibUtf8) {
589  return false; //means no errors
590  }
591 // #define highest1Bits (unsigned char)0x80
592 // #define highest2Bits (unsigned char)0xC0
593 // #define highest3Bits (unsigned char)0xE0
594 // #define highest4Bits (unsigned char)0xF0
595 // #define highest5Bits (unsigned char)0xF8
596  static const unsigned char highest1Bits = 0x80;
597  static const unsigned char highest2Bits = 0xC0;
598  static const unsigned char highest3Bits = 0xE0;
599  static const unsigned char highest4Bits = 0xF0;
600  static const unsigned char highest5Bits = 0xF8;
601 
602  for (int i = 0; i < length; ++i) {
603  unsigned char c = data[i];
604 
605  if (d->m_multiByte > 0) {
606  if ((c & highest2Bits) == 0x80) {
607  --(d->m_multiByte);
608  continue;
609  }
610 #ifdef DECODE_DEBUG
611  qCWarning(KHTML_LOG) << "EncDetector: Broken UTF8";
612 #endif
613  return true;
614  }
615 
616  // most significant bit zero, single char
617  if ((c & highest1Bits) == 0x00) {
618  continue;
619  }
620 
621  // 110xxxxx => init 1 following bytes
622  if ((c & highest3Bits) == 0xC0) {
623  d->m_multiByte = 1;
624  continue;
625  }
626 
627  // 1110xxxx => init 2 following bytes
628  if ((c & highest4Bits) == 0xE0) {
629  d->m_multiByte = 2;
630  continue;
631  }
632 
633  // 11110xxx => init 3 following bytes
634  if ((c & highest5Bits) == 0xF0) {
635  d->m_multiByte = 3;
636  continue;
637  }
638 #ifdef DECODE_DEBUG
639  qCWarning(KHTML_LOG) << "EncDetector:_Broken UTF8";
640 #endif
641  return true;
642  }
643  return false;
644 }
645 
646 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
647 {
648 }
649 
650 KEncodingDetector::KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script) :
651  d(new KEncodingDetectorPrivate(codec, source, script))
652 {
653 }
654 
655 KEncodingDetector::~KEncodingDetector()
656 {
657  delete d;
658 }
659 
660 void KEncodingDetector::setAutoDetectLanguage(KEncodingDetector::AutoDetectScript lang)
661 {
662  d->m_autoDetectLanguage = lang;
663 }
664 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
665 {
666  return d->m_autoDetectLanguage;
667 }
668 
669 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
670 {
671  return d->m_source;
672 }
673 
674 const char *KEncodingDetector::encoding() const
675 {
676  d->m_storeDecoderName = d->m_codec->name();
677  return d->m_storeDecoderName.constData();
678 }
679 
680 bool KEncodingDetector::visuallyOrdered() const
681 {
682  return d->m_visualRTL;
683 }
684 
685 // const QTextCodec* KEncodingDetector::codec() const
686 // {
687 // return d->m_codec;
688 // }
689 
691 {
692  return d->m_decoder;
693 }
694 
696 {
697  assert(d->m_defaultCodec);
698  d->m_bufferForDefferedEncDetection.clear();
699  d->m_writtingHappened = false;
700  d->m_analyzeCalled = false;
701  d->m_multiByte = 0;
702  delete d->m_decoder;
703  if (!d->m_codec) {
704  d->m_codec = d->m_defaultCodec;
705  }
706  d->m_decoder = d->m_codec->makeDecoder();
707 }
708 
709 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
710 {
711  QTextCodec *codec;
712  QByteArray enc(_encoding);
713  if (/*enc.isNull() || */enc.isEmpty()) {
714  if (type == DefaultEncoding) {
715  codec = d->m_defaultCodec;
716  } else {
717  return false;
718  }
719  } else {
720  //QString->QTextCodec
721 
722  enc = enc.toLower();
723  // hebrew visually ordered
724  if (enc == "visual") {
725  enc = "iso8859-8";
726  }
727  bool b;
728  codec = KCharsets::charsets()->codecForName(QLatin1String(enc.data()), b);
729  if (!b) {
730  return false;
731  }
732  }
733 
734  if (d->m_codec->mibEnum() == codec->mibEnum()) {
735  // We already have the codec, but we still want to re-set the type,
736  // as we may have overwritten a default with a detected
737  d->m_source = type;
738  return true;
739  }
740 
741  if ((type == EncodingFromMetaTag || type == EncodingFromXMLHeader) && is16Bit(codec)) {
742  //Sometimes the codec specified is absurd, i.e. UTF-16 despite
743  //us decoding a meta tag as ASCII. In that case, ignore it.
744  return false;
745  }
746 
747  if (codec->mibEnum() == Mib8859_8) {
748  //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
749  codec = QTextCodec::codecForName("iso8859-8-i");
750 
751  // visually ordered unless one of the following
752  if (!(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" || enc == "csiso88598i" || enc == "logical")) {
753  d->m_visualRTL = true;
754  }
755  }
756 
757  d->m_codec = codec;
758  d->m_source = type;
759  delete d->m_decoder;
760  d->m_decoder = d->m_codec->makeDecoder();
761 #ifdef DECODE_DEBUG
762  qCDebug(KHTML_LOG) << "KEncodingDetector::encoding used is" << d->m_codec->name();
763 #endif
764  return true;
765 }
766 
767 QString KEncodingDetector::decode(const char *data, int len)
768 {
769  processNull(const_cast<char *>(data), len);
770  if (!d->m_analyzeCalled) {
771  analyze(data, len);
772  d->m_analyzeCalled = true;
773  }
774 
775  return d->m_decoder->toUnicode(data, len);
776 }
777 
779 {
780  processNull(const_cast<char *>(data.data()), data.size());
781  if (!d->m_analyzeCalled) {
782  analyze(data.data(), data.size());
783  d->m_analyzeCalled = true;
784  }
785 
786  return d->m_decoder->toUnicode(data);
787 }
788 
790 {
791 #ifdef DECODE_DEBUG
792  qCWarning(KHTML_LOG) << "KEncodingDetector: decoding " << len << " bytes";
793 #endif
794  if (d->m_writtingHappened) {
795 #ifdef DECODE_DEBUG
796  qCWarning(KHTML_LOG) << "KEncodingDetector: d->m_writtingHappened " << d->m_codec->name();
797 #endif
798  processNull(const_cast<char *>(data), len);
799  return d->m_decoder->toUnicode(data, len);
800  } else {
801  if (d->m_bufferForDefferedEncDetection.isEmpty()) {
802  // If encoding detection produced something, and we either got to the body or
803  // actually saw the encoding explicitly, we're done.
804  if (analyze(data, len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) {
805 #ifdef DECODE_DEBUG
806  qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened first time " << d->m_codec->name();
807 #endif
808  processNull(const_cast<char *>(data), len);
809  d->m_writtingHappened = true;
810  return d->m_decoder->toUnicode(data, len);
811  } else {
812 #ifdef DECODE_DEBUG
813  qCWarning(KHTML_LOG) << "KEncodingDetector: begin deffer";
814 #endif
815  d->m_bufferForDefferedEncDetection = data;
816  }
817  } else {
818  d->m_bufferForDefferedEncDetection += data;
819  // As above, but also limit the buffer size. We must use the entire buffer here,
820  // since the boundaries might split the meta tag, etc.
821  bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
822  if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
823  d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER) {
824  d->m_writtingHappened = true;
825  d->m_bufferForDefferedEncDetection.replace('\0', ' ');
826  QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
827  d->m_bufferForDefferedEncDetection.clear();
828 #ifdef DECODE_DEBUG
829  qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
830 #endif
831  return result;
832  }
833  }
834  }
835 
836  return QString();
837 }
838 
840 {
841  return d->m_decoder ? d->m_decoder->hasFailure() : false;
842 }
843 
845 {
846  if (d->m_bufferForDefferedEncDetection.isEmpty()) {
847  return QString();
848  }
849 
850  d->m_bufferForDefferedEncDetection.replace('\0', ' ');
851  QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
852  d->m_bufferForDefferedEncDetection.clear();
853 #ifdef DECODE_DEBUG
854  qCWarning(KHTML_LOG) << "KEncodingDetector:flush() " << d->m_bufferForDefferedEncDetection.length() << " bytes " << d->m_codec->name();
855 #endif
856  return result;
857 }
858 
859 bool KEncodingDetector::analyze(const char *data, int len)
860 {
861  // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
862  // maximumBOMLength = 10
863  // Even if the user has chosen utf16 we still need to auto-detect the endianness
864  if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) {
865  // Extract the first three bytes.
866  const uchar *udata = (const uchar *)data;
867  uchar c1 = *udata++;
868  uchar c2 = *udata++;
869  uchar c3 = *udata++;
870 
871  // Check for the BOM
872  const char *autoDetectedEncoding;
873  if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
874  autoDetectedEncoding = "UTF-16";
875  } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
876  autoDetectedEncoding = "UTF-8";
877  } else if (c1 == 0x00 || c2 == 0x00) {
878  uchar c4 = *udata++;
879  uchar c5 = *udata++;
880  uchar c6 = *udata++;
881  uchar c7 = *udata++;
882  uchar c8 = *udata++;
883  uchar c9 = *udata++;
884  uchar c10 = *udata++;
885 
886  int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
887  int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
888  if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) {
889  autoDetectedEncoding = "UTF-16";
890  } else {
891  autoDetectedEncoding = nullptr;
892  }
893  } else {
894  autoDetectedEncoding = nullptr;
895  }
896 
897  // If we found a BOM, use the encoding it implies.
898  if (autoDetectedEncoding != nullptr) {
899  d->m_source = BOM;
900  d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
901  assert(d->m_codec);
902  //enc = d->m_codec->name();
903  delete d->m_decoder;
904  d->m_decoder = d->m_codec->makeDecoder();
905 #ifdef DECODE_DEBUG
906  qCWarning(KHTML_LOG) << "Detection by BOM";
907 #endif
908  if (is16Bit(d->m_codec) && c2 == 0x00) {
909  // utf16LE, we need to put the decoder in LE mode
910  char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
911  d->m_decoder->toUnicode(reverseUtf16, 2);
912  }
913  return true;
914  }
915  }
916 
917  //exit from routine in case it was called to only detect byte order for utf-16
918  if (d->m_source == UserChosenEncoding) {
919 #ifdef DECODE_DEBUG
920  qCWarning(KHTML_LOG) << "KEncodingDetector: UserChosenEncoding exit ";
921 #endif
922 
923  if (errorsIfUtf8(data, len)) {
924  setEncoding("", DefaultEncoding);
925  }
926  return true;
927  }
928 
929  // HTTP header takes precedence over meta-type stuff
930  if (d->m_source == EncodingFromHTTPHeader) {
931  return true;
932  }
933 
934  if (!d->m_seenBody) {
935  // we still don't have an encoding, and are in the head
936  // the following tags are allowed in <head>:
937  // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
938  const char *ptr = data;
939  const char *pEnd = data + len;
940 
941  while (ptr != pEnd) {
942  if (*ptr != '<') {
943  ++ptr;
944  continue;
945  }
946  ++ptr;
947  // Handle comments.
948  if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
949  ptr += 3;
950  skipComment(ptr, pEnd);
951  continue;
952  }
953 
954  // Handle XML header, which can have encoding in it.
955  if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
956  const char *end = ptr;
957  while (*end != '>' && end < pEnd) {
958  end++;
959  }
960  if (*end == '\0' || end == pEnd) {
961  break;
962  }
963  QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
964  int length;
965  int pos = findXMLEncoding(str, length);
966  // also handles the case when specified encoding aint correct
967  if (pos != -1 && setEncoding(str.mid(pos, length).data(), EncodingFromXMLHeader)) {
968  return true;
969  }
970  }
971 
972  //look for <meta>, stop if we reach <body>
973  while (
974  !(((*ptr >= 'a') && (*ptr <= 'z')) ||
975  ((*ptr >= 'A') && (*ptr <= 'Z')))
976  && ptr < pEnd
977  ) {
978  ++ptr;
979  }
980 
981  char tmp[5];
982  int length = 0;
983  const char *max = ptr + 4;
984  if (pEnd < max) {
985  max = pEnd;
986  }
987  while (
988  (((*ptr >= 'a') && (*ptr <= 'z')) ||
989  ((*ptr >= 'A') && (*ptr <= 'Z')) ||
990  ((*ptr >= '0') && (*ptr <= '9')))
991  && ptr < max
992  ) {
993  tmp[length] = tolower(*ptr);
994  ++ptr;
995  ++length;
996  }
997  tmp[length] = 0;
998  if (tmp[0] == 'm' && tmp[1] == 'e' && tmp[2] == 't' && tmp[3] == 'a') {
999  // found a meta tag...
1000  const char *end = ptr;
1001  while (*end != '>' && *end != '\0' && end < pEnd) {
1002  end++;
1003  }
1004  //if ( *end == '\0' ) break;
1005  const QByteArray str = QByteArray(ptr, (end - ptr) + 1).toLower();
1006  const int strLength = str.length();
1007  int pos = 0;
1008  //if( (pos = str.find("http-equiv", pos)) == -1) break;
1009  //if( (pos = str.find("content-type", pos)) == -1) break;
1010  if ((pos = str.indexOf("charset")) == -1) {
1011  continue;
1012  }
1013  pos += 6;
1014  // skip to '='
1015  if ((pos = str.indexOf("=", pos)) == -1) {
1016  continue;
1017  }
1018 
1019  // skip '='
1020  ++pos;
1021 
1022  // skip whitespace before encoding itself
1023  while (pos < strLength && str[pos] <= ' ') {
1024  ++pos;
1025  }
1026 
1027  // there may also be an opening quote, if this is a charset= and not a http-equiv.
1028  if (pos < strLength && (str[pos] == '"' || str[pos] == '\'')) {
1029  ++pos;
1030  }
1031 
1032  // skip whitespace
1033  while (pos < strLength && str[pos] <= ' ') {
1034  ++pos;
1035  }
1036 
1037  if (pos == strLength) {
1038  continue;
1039  }
1040 
1041  int endpos = pos;
1042  while (endpos < strLength &&
1043  (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1044  && str[endpos] != ';' && str[endpos] != '>')) {
1045  ++endpos;
1046  }
1047 #ifdef DECODE_DEBUG
1048  qCDebug(KHTML_LOG) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos, endpos - pos).data();
1049 #endif
1050  if (setEncoding(str.mid(pos, endpos - pos).data(), EncodingFromMetaTag)) {
1051  return true;
1052  }
1053  } else if (tmp[0] == 'b' && tmp[1] == 'o' && tmp[2] == 'd' && tmp[3] == 'y') {
1054  d->m_seenBody = true;
1055  break;
1056  }
1057  }
1058  }
1059 
1060  if (len < 20) {
1061  return false;
1062  }
1063 
1064 #ifdef DECODE_DEBUG
1065  qCDebug(KHTML_LOG) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
1066 #endif
1067 
1068  switch (d->m_autoDetectLanguage) {
1069  case KEncodingDetector::Arabic:
1070  return setEncoding(automaticDetectionForArabic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1071 // break;
1072  case KEncodingDetector::Baltic:
1073  return setEncoding(automaticDetectionForBaltic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1074 // break;
1075  case KEncodingDetector::CentralEuropean:
1076  return setEncoding(automaticDetectionForCentralEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1077 // break;
1078  case KEncodingDetector::Cyrillic:
1079  return setEncoding(automaticDetectionForCyrillic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1080 // break;
1081  case KEncodingDetector::Greek:
1082  return setEncoding(automaticDetectionForGreek((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1083 // break;
1084  case KEncodingDetector::Hebrew:
1085  return setEncoding(automaticDetectionForHebrew((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1086 // break;
1087  case KEncodingDetector::Japanese:
1088  return setEncoding(automaticDetectionForJapanese((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1089 // break;
1090  case KEncodingDetector::Turkish:
1091  return setEncoding(automaticDetectionForTurkish((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1092 // break;
1093  case KEncodingDetector::WesternEuropean:
1094  if (setEncoding(automaticDetectionForWesternEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding)) {
1095  return true;
1096  } else if (d->m_defaultCodec->mibEnum() == MibLatin1) { //detection for khtml
1097  return setEncoding("iso-8859-15", AutoDetectedEncoding);
1098  } else { //use default provided by eg katepart
1099  return setEncoding("", DefaultEncoding);
1100  }
1101 // break;
1102  case KEncodingDetector::SemiautomaticDetection:
1103  case KEncodingDetector::ChineseSimplified:
1104  case KEncodingDetector::ChineseTraditional:
1105  case KEncodingDetector::Korean:
1106  case KEncodingDetector::Thai:
1107  case KEncodingDetector::Unicode:
1108  case KEncodingDetector::NorthernSaami:
1109  case KEncodingDetector::SouthEasternEurope:
1110  case KEncodingDetector::None:
1111  // huh. somethings broken in this code ### FIXME
1112  //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1113  break;
1114  }
1115 
1116  return true;
1117 }
1118 
1119 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString &lang)
1120 {
1121  if (lang.isEmpty()) {
1122  return KEncodingDetector::None;
1123  } else if (lang == i18nc("@item Text character set", "Unicode")) {
1124  return KEncodingDetector::Unicode;
1125  } else if (lang == i18nc("@item Text character set", "Cyrillic")) {
1126  return KEncodingDetector::Cyrillic;
1127  } else if (lang == i18nc("@item Text character set", "Western European")) {
1128  return KEncodingDetector::WesternEuropean;
1129  } else if (lang == i18nc("@item Text character set", "Central European")) {
1130  return KEncodingDetector::CentralEuropean;
1131  } else if (lang == i18nc("@item Text character set", "Greek")) {
1132  return KEncodingDetector::Greek;
1133  } else if (lang == i18nc("@item Text character set", "Hebrew")) {
1134  return KEncodingDetector::Hebrew;
1135  } else if (lang == i18nc("@item Text character set", "Turkish")) {
1136  return KEncodingDetector::Turkish;
1137  } else if (lang == i18nc("@item Text character set", "Japanese")) {
1138  return KEncodingDetector::Japanese;
1139  } else if (lang == i18nc("@item Text character set", "Baltic")) {
1140  return KEncodingDetector::Baltic;
1141  } else if (lang == i18nc("@item Text character set", "Arabic")) {
1142  return KEncodingDetector::Arabic;
1143  }
1144 
1145  return KEncodingDetector::None;
1146 }
1147 
1148 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
1149 {
1150  switch (script) {
1151  case KEncodingDetector::Arabic:
1152  return true;
1153  case KEncodingDetector::Baltic:
1154  return true;
1155  case KEncodingDetector::CentralEuropean:
1156  return true;
1157  case KEncodingDetector::Cyrillic:
1158  return true;
1159  case KEncodingDetector::Greek:
1160  return true;
1161  case KEncodingDetector::Hebrew:
1162  return true;
1163  case KEncodingDetector::Japanese:
1164  return true;
1165  case KEncodingDetector::Turkish:
1166  return true;
1167  case KEncodingDetector::WesternEuropean:
1168  return true;
1169  case KEncodingDetector::ChineseTraditional:
1170  return true;
1171  case KEncodingDetector::ChineseSimplified:
1172  return true;
1173  case KEncodingDetector::Unicode:
1174  return true;
1175  break;
1176  default:
1177  return false;
1178  }
1179 }
1180 
1181 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
1182 {
1183  switch (script) {
1184  case KEncodingDetector::Arabic:
1185  return i18nc("@item Text character set", "Arabic");
1186  break;
1187  case KEncodingDetector::Baltic:
1188  return i18nc("@item Text character set", "Baltic");
1189  break;
1190  case KEncodingDetector::CentralEuropean:
1191  return i18nc("@item Text character set", "Central European");
1192  break;
1193  case KEncodingDetector::Cyrillic:
1194  return i18nc("@item Text character set", "Cyrillic");
1195  break;
1196  case KEncodingDetector::Greek:
1197  return i18nc("@item Text character set", "Greek");
1198  break;
1199  case KEncodingDetector::Hebrew:
1200  return i18nc("@item Text character set", "Hebrew");
1201  break;
1202  case KEncodingDetector::Japanese:
1203  return i18nc("@item Text character set", "Japanese");
1204  break;
1205  case KEncodingDetector::Turkish:
1206  return i18nc("@item Text character set", "Turkish");
1207  break;
1208  case KEncodingDetector::WesternEuropean:
1209  return i18nc("@item Text character set", "Western European");
1210  break;
1211  case KEncodingDetector::ChineseTraditional:
1212  return i18nc("@item Text character set", "Chinese Traditional");
1213  break;
1214  case KEncodingDetector::ChineseSimplified:
1215  return i18nc("@item Text character set", "Chinese Simplified");
1216  break;
1217  case KEncodingDetector::Korean:
1218  return i18nc("@item Text character set", "Korean");
1219  break;
1220  case KEncodingDetector::Thai:
1221  return i18nc("@item Text character set", "Thai");
1222  break;
1223  case KEncodingDetector::Unicode:
1224  return i18nc("@item Text character set", "Unicode");
1225  break;
1226  //case KEncodingDetector::SemiautomaticDetection:
1227  default:
1228  return QString();
1229 
1230  }
1231 }
1232 
1233 #undef DECODE_DEBUG
1234 
QTextCodec * codecForName(const QString &name) const
bool decodedInvalidCharacters() const
This method checks whether invalid characters were found during a decoding operation.
QByteArray toLower() const const
Provides encoding detection capabilities.
QString decodeWithBuffering(const char *data, int len)
Convenience method that uses buffering.
bool isNull() const const
bool isEmpty() const const
int length() const const
static AutoDetectScript scriptForName(const QString &lang)
Takes the language name after it has been i18n()'ed.
int indexOf(char ch, int from) const const
QString i18nc(const char *context, const char *text, const TYPE &arg...)
bool isEmpty() const const
QTextDecoder * decoder()
QByteArray mid(int pos, int len) const const
KEncodingDetector()
Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiau...
bool processNull(char *data, int length)
This nice method will kill all 0 bytes (or double bytes) and remember if this was a binary or not ;) ...
static KCharsets * charsets()
virtual int mibEnum() const const =0
bool setEncoding(const char *encoding, EncodingChoiceSource type)
const QList< QKeySequence > & end()
QString flush()
Convenience method to be used with decodeForHtml.
bool errorsIfUtf8(const char *data, int length)
Check if we are really utf8.
void resetDecoder()
Resets the decoder.
QString decode(const char *data, int len)
The main class method.
QTextCodec * codecForName(const QByteArray &name)
const char * encoding() const
Convenience method.
bool analyze(const char *data, int len)
Analyze text data.
char * data()
int size() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Tue Oct 26 2021 22:48:03 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.