• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • sources
  • kde-4.14
  • kdelibs
  • kdecore
  • localization
kencodingdetector.cpp
Go to the documentation of this file.
1 /*
2  This file is part of the KDE libraries
3 
4  Copyright (C) 1999 Lars Knoll (knoll@kde.org)
5  Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
6  Copyright (C) 2003 Apple Computer, Inc.
7  Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
8 
9  This library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU Library General Public
11  License as published by the Free Software Foundation; either
12  version 2 of the License, or (at your option) any later version.
13 
14  This library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  Library General Public License for more details.
18 
19  You should have received a copy of the GNU Library General Public License
20  along with this library; see the file COPYING.LIB. If not, write to
21  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22  Boston, MA 02110-1301, USA.
23 */
24 //----------------------------------------------------------------------------
25 //
26 // decoder for input stream
27 
28 #include "kencodingdetector.h"
29 
30 #undef DECODE_DEBUG
31 //#define DECODE_DEBUG
32 
33 #define MAX_BUFFER 16*1024
34 
35 #include <assert.h>
36 
37 #include "guess_ja_p.h"
38 
39 #include <QRegExp>
40 #include <QTextCodec>
41 
42 #include <kglobal.h>
43 #include <kcharsets.h>
44 #include <kdebug.h>
45 #include <klocale.h>
46 
47 #include <ctype.h>
48 
// MIBenum numbers of the character sets this file has to recognise.
// They are compared against QTextCodec::mibEnum().
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
59 
60 static bool is16Bit(QTextCodec* codec)
61 {
62  switch (codec->mibEnum())
63  {
64  case MibUtf16:
65  case MibUtf16BE:
66  case MibUtf16LE:
67  case MibUcs2:
68  return true;
69  default:
70  return false;
71  }
72 }
73 
74 class KEncodingDetectorPrivate
75 {
76 public:
77  QTextCodec *m_codec;
78  QTextDecoder *m_decoder; // utf16
79  QTextCodec *m_defaultCodec;
80  QByteArray m_storeDecoderName;
81 
82  KEncodingDetector::EncodingChoiceSource m_source;
83  KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
84 
85  bool m_visualRTL : 1;
86  bool m_seenBody : 1;
87  bool m_writtingHappened : 1;
88  bool m_analyzeCalled : 1; //for decode()
89  int m_multiByte;
90 
91  QByteArray m_bufferForDefferedEncDetection;
92 
93  KEncodingDetectorPrivate()
94  : m_codec(QTextCodec::codecForMib(MibLatin1))
95  , m_decoder(m_codec->makeDecoder())
96  , m_defaultCodec(m_codec)
97  , m_source(KEncodingDetector::DefaultEncoding)
98  , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
99  , m_visualRTL(false)
100  , m_seenBody(false)
101  , m_writtingHappened(false)
102  , m_analyzeCalled(false)
103  , m_multiByte(0)
104  {
105  }
106 
107  KEncodingDetectorPrivate(QTextCodec* codec,KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
108  : m_codec(codec)
109  , m_decoder(m_codec->makeDecoder())
110  , m_defaultCodec(m_codec)
111  , m_source(source)
112  , m_autoDetectLanguage(script)
113  , m_visualRTL(false)
114  , m_seenBody(false)
115  , m_writtingHappened(false)
116  , m_analyzeCalled(false)
117  , m_multiByte(0)
118  {
119  }
120 
121  ~KEncodingDetectorPrivate()
122  {
123  delete m_decoder;
124  }
125 
126  // Returns true if the encoding was explicitly specified someplace.
127  bool isExplicitlySpecifiedEncoding()
128  {
129  return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
130  }
131 };
132 
133 
134 static QByteArray automaticDetectionForArabic( const unsigned char* ptr, int size )
135 {
136  for ( int i = 0; i < size; ++i ) {
137  if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
138  || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
139  || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
140  || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
141  return "cp1256";
142  }
143  }
144 
145  return "iso-8859-6";
146 }
147 
148 static QByteArray automaticDetectionForBaltic( const unsigned char* ptr, int size )
149 {
150  for ( int i = 0; i < size; ++i ) {
151  if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
152  return "cp1257";
153 
154  if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
155  return "iso-8859-13";
156  }
157 
158  return "iso-8859-13";
159 }
160 
161 static QByteArray automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
162 {
163  QByteArray charset = QByteArray();
164  for ( int i = 0; i < size; ++i ) {
165  if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
166  if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
167  return "ibm852";
168 
169  if ( i + 1 > size )
170  return "cp1250";
171  else { // maybe ibm852 ?
172  charset = "cp1250";
173  continue;
174  }
175  }
176  if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
177  if ( i + 1 > size )
178  return "iso-8859-2";
179  else { // maybe ibm852 ?
180  if ( charset.isNull() )
181  charset = "iso-8859-2";
182  continue;
183  }
184  }
185  }
186 
187  if ( charset.isNull() )
188  charset = "iso-8859-3";
189 
190  return charset.data();
191 }
192 
193 static QByteArray automaticDetectionForCyrillic( const unsigned char* ptr, int size)
194 {
195 #ifdef DECODE_DEBUG
196  kWarning() << "KEncodingDetector: Cyr heuristics";
197 #endif
198 
199 // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
200 // return "utf8";
201  int utf8_mark=0;
202  int koi_score=0;
203  int cp1251_score=0;
204 
205  int koi_st=0;
206  int cp1251_st=0;
207 
208 // int koi_na=0;
209 // int cp1251_na=0;
210 
211  int koi_o_capital=0;
212  int koi_o=0;
213  int cp1251_o_capital=0;
214  int cp1251_o=0;
215 
216  int koi_a_capital=0;
217  int koi_a=0;
218  int cp1251_a_capital=0;
219  int cp1251_a=0;
220 
221  int koi_s_capital=0;
222  int koi_s=0;
223  int cp1251_s_capital=0;
224  int cp1251_s=0;
225 
226  int koi_i_capital=0;
227  int koi_i=0;
228  int cp1251_i_capital=0;
229  int cp1251_i=0;
230 
231  int cp1251_small_range=0;
232  int koi_small_range=0;
233  int ibm866_small_range=0;
234 
235  int i;
236  for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
237  {
238  if (ptr[i]>0xdf)
239  {
240  ++cp1251_small_range;
241 
242  if (ptr[i]==0xee)//small o
243  ++cp1251_o;
244  else if (ptr[i]==0xe0)//small a
245  ++cp1251_a;
246  else if (ptr[i]==0xe8)//small i
247  ++cp1251_i;
248  else if (ptr[i]==0xf1)//small s
249  ++cp1251_s;
250  else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
251  ++cp1251_st;
252 
253  else if (ptr[i]==0xef)
254  ++koi_o_capital;
255  else if (ptr[i]==0xe1)
256  ++koi_a_capital;
257  else if (ptr[i]==0xe9)
258  ++koi_i_capital;
259  else if (ptr[i]==0xf3)
260  ++koi_s_capital;
261 
262  }
263  else if (ptr[i]>0xbf)
264  {
265  ++koi_small_range;
266 
267  if (ptr[i]==0xd0||ptr[i]==0xd1)//small o
268  ++utf8_mark;
269  else if (ptr[i]==0xcf)//small o
270  ++koi_o;
271  else if (ptr[i]==0xc1)//small a
272  ++koi_a;
273  else if (ptr[i]==0xc9)//small i
274  ++koi_i;
275  else if (ptr[i]==0xd3)//small s
276  ++koi_s;
277  else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
278  ++koi_st;
279 
280  else if (ptr[i]==0xce)
281  ++cp1251_o_capital;
282  else if (ptr[i]==0xc0)
283  ++cp1251_a_capital;
284  else if (ptr[i]==0xc8)
285  ++cp1251_i_capital;
286  else if (ptr[i]==0xd1)
287  ++cp1251_s_capital;
288  }
289  else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60%
290  ++ibm866_small_range;
291 
292  }
293 
294  //cannot decide?
295  if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
296  {
297  return "";
298  }
299 
300  if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
301  {
302 #ifdef DECODE_DEBUG
303  kWarning() << "Cyr Enc Detection: UTF8";
304 #endif
305  return "UTF-8";
306  }
307 
308  if (ibm866_small_range>cp1251_small_range+koi_small_range)
309  return "ibm866";
310 
311 // QByteArray koi_string = "koi8-u";
312 // QByteArray cp1251_string = "cp1251";
313 
314  if (cp1251_st==0 && koi_st>1)
315  koi_score+=10;
316  else if (koi_st==0 && cp1251_st>1)
317  cp1251_score+=10;
318 
319  if (cp1251_st && koi_st)
320  {
321  if (cp1251_st/koi_st>2)
322  cp1251_score+=20;
323  else if (koi_st/cp1251_st>2)
324  koi_score+=20;
325  }
326 
327  if (cp1251_a>koi_a)
328  cp1251_score+=10;
329  else if (cp1251_a || koi_a)
330  koi_score+=10;
331 
332  if (cp1251_o>koi_o)
333  cp1251_score+=10;
334  else if (cp1251_o || koi_o)
335  koi_score+=10;
336 
337  if (cp1251_i>koi_i)
338  cp1251_score+=10;
339  else if (cp1251_i || koi_i)
340  koi_score+=10;
341 
342  if (cp1251_s>koi_s)
343  cp1251_score+=10;
344  else if (cp1251_s || koi_s)
345  koi_score+=10;
346 
347  if (cp1251_a_capital>koi_a_capital)
348  cp1251_score+=9;
349  else if (cp1251_a_capital || koi_a_capital)
350  koi_score+=9;
351 
352  if (cp1251_o_capital>koi_o_capital)
353  cp1251_score+=9;
354  else if (cp1251_o_capital || koi_o_capital)
355  koi_score+=9;
356 
357  if (cp1251_i_capital>koi_i_capital)
358  cp1251_score+=9;
359  else if (cp1251_i_capital || koi_i_capital)
360  koi_score+=9;
361 
362  if (cp1251_s_capital>koi_s_capital)
363  cp1251_score+=9;
364  else if (cp1251_s_capital || koi_s_capital)
365  koi_score+=9;
366 #ifdef DECODE_DEBUG
367  kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
368 #endif
369  if (abs(koi_score-cp1251_score)<10)
370  {
371  //fallback...
372  cp1251_score=cp1251_small_range;
373  koi_score=koi_small_range;
374  }
375  if (cp1251_score>koi_score)
376  return "cp1251";
377  else
378  return "koi8-u";
379 
380 
381 // if (cp1251_score>koi_score)
382 // setEncoding("cp1251",AutoDetectedEncoding);
383 // else
384 // setEncoding("koi8-u",AutoDetectedEncoding);
385 // return true;
386 
387 }
388 
389 static QByteArray automaticDetectionForGreek( const unsigned char* ptr, int size )
390 {
391  for ( int i = 0; i < size; ++i ) {
392  if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
393  || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
394  || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
395  return "cp1253";
396  }
397  }
398 
399  return "iso-8859-7";
400 }
401 
402 static QByteArray automaticDetectionForHebrew( const unsigned char* ptr, int size )
403 {
404  for ( int i = 0; i < size; ++i ) {
405  if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
406  || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
407  || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
408  return "cp1255";
409  }
410 
411  if ( ptr[ i ] == 0xDF )
412  return "iso-8859-8-i";
413  }
414 
415  return "iso-8859-8-i";
416 }
417 
418 static QByteArray automaticDetectionForJapanese( const unsigned char* ptr, int size )
419 {
420  JapaneseCode kc;
421 
422  switch ( kc.guess_jp( (const char*)ptr, size ) ) {
423  case JapaneseCode::JIS:
424  return "jis7";
425  case JapaneseCode::EUC:
426  return "eucjp";
427  case JapaneseCode::SJIS:
428  return "sjis";
429  case JapaneseCode::UTF8:
430  return "utf8";
431  default:
432  break;
433  }
434 
435  return "";
436 }
437 
438 static QByteArray automaticDetectionForTurkish( const unsigned char* ptr, int size )
439 {
440  for ( int i = 0; i < size; ++i ) {
441  if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
442  return "cp1254";
443  }
444  }
445 
446  return "iso-8859-9";
447 }
448 
449 static QByteArray automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
450 {
451  --size;
452  uint nonansi_count=0;
453  for (int i=0; i<size; ++i)
454  {
455  if (ptr[i]>0x79)
456  {
457  ++nonansi_count;
458  if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
459  {
460  return "UTF-8";
461  }
462  if (ptr[i] >= 0x78 && ptr[i]<=0x9F )
463  {
464  return "cp1252";
465  }
466  }
467 
468  }
469 
470  if (nonansi_count>0)
471  return "iso-8859-15";
472 
473  return "";
474 }
475 
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry @p ptr points just behind the opening "<!--". On return it points
// just behind the comment terminator ("-->", or the sloppy "--!>"), or at
// @p pEnd when the comment is not closed inside [ptr, pEnd).
// No byte at or behind @p pEnd is read, and @p ptr never ends up behind @p pEnd.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    // Nothing left to look at.
    if (p >= pEnd)
    {
        ptr = pEnd;
        return;
    }

    // Allow <!-->; other browsers do.
    if (*p == '>')
    {
        ++p;
    }
    else
    {
        while (p != pEnd)
        {
            if (*p == '-')
            {
                // This is the real end of comment, "-->".
                if (pEnd - p >= 3 && p[1] == '-' && p[2] == '>')
                {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (pEnd - p >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>')
                {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }
    ptr = p;
}
510 
511 // Returns the position of the encoding string.
512 static int findXMLEncoding(const QByteArray &str, int &encodingLength)
513 {
514  int len = str.length();
515  int pos = str.indexOf("encoding");
516  if (pos == -1)
517  return -1;
518  pos += 8;
519 
520  // Skip spaces and stray control characters.
521  while (pos<len && str[pos]<=' ')
522  ++pos;
523 
524  //Bail out if nothing after
525  // Skip equals sign.
526  if (pos>=len || str[pos] != '=')
527  return -1;
528  ++pos;
529 
530  // Skip spaces and stray control characters.
531  while (pos<len && str[pos]<=' ')
532  ++pos;
533 
534  //Bail out if nothing after
535  if (pos >= len)
536  return -1;
537 
538  // Skip quotation mark.
539  char quoteMark = str[pos];
540  if (quoteMark != '"' && quoteMark != '\'')
541  return -1;
542  ++pos;
543 
544  // Find the trailing quotation mark.
545  int end=pos;
546  while (end<len && str[end]!=quoteMark)
547  ++end;
548 
549  if (end>=len)
550  return -1;
551 
552  encodingLength = end-pos;
553  return pos;
554 }
555 
556 bool KEncodingDetector::processNull(char *data, int len)
557 {
558  bool bin=false;
559  if(is16Bit(d->m_codec))
560  {
561  for (int i=1; i < len; i+=2)
562  {
563  if ((data[i]=='\0') && (data[i-1]=='\0'))
564  {
565  bin=true;
566  data[i]=' ';
567  }
568  }
569  return bin;
570  }
571  // replace '\0' by spaces, for buggy pages
572  int i = len-1;
573  while(--i>=0)
574  {
575  if(data[i]==0)
576  {
577  bin=true;
578  data[i]=' ';
579  }
580  }
581  return bin;
582 }
583 
584 
585 bool KEncodingDetector::errorsIfUtf8 (const char* data, int length)
586 {
587  if (d->m_codec->mibEnum()!=MibUtf8)
588  return false; //means no errors
589 // #define highest1Bits (unsigned char)0x80
590 // #define highest2Bits (unsigned char)0xC0
591 // #define highest3Bits (unsigned char)0xE0
592 // #define highest4Bits (unsigned char)0xF0
593 // #define highest5Bits (unsigned char)0xF8
594 static const unsigned char highest1Bits = 0x80;
595 static const unsigned char highest2Bits = 0xC0;
596 static const unsigned char highest3Bits = 0xE0;
597 static const unsigned char highest4Bits = 0xF0;
598 static const unsigned char highest5Bits = 0xF8;
599 
600  for (int i=0; i<length; ++i)
601  {
602  unsigned char c = data[i];
603 
604  if (d->m_multiByte>0)
605  {
606  if ((c & highest2Bits) == 0x80)
607  {
608  --(d->m_multiByte);
609  continue;
610  }
611 #ifdef DECODE_DEBUG
612  kWarning() << "EncDetector: Broken UTF8";
613 #endif
614  return true;
615  }
616 
617  // most significant bit zero, single char
618  if ((c & highest1Bits) == 0x00)
619  continue;
620 
621  // 110xxxxx => init 1 following bytes
622  if ((c & highest3Bits) == 0xC0)
623  {
624  d->m_multiByte = 1;
625  continue;
626  }
627 
628  // 1110xxxx => init 2 following bytes
629  if ((c & highest4Bits) == 0xE0)
630  {
631  d->m_multiByte = 2;
632  continue;
633  }
634 
635  // 11110xxx => init 3 following bytes
636  if ((c & highest5Bits) == 0xF0)
637  {
638  d->m_multiByte = 3;
639  continue;
640  }
641 #ifdef DECODE_DEBUG
642  kWarning() << "EncDetector:_Broken UTF8";
643 #endif
644  return true;
645  }
646  return false;
647 }
648 
649 
650 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
651 {
652 }
653 
654 KEncodingDetector::KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
655  d(new KEncodingDetectorPrivate(codec,source,script))
656 {
657 }
658 
659 KEncodingDetector::~KEncodingDetector()
660 {
661  delete d;
662 }
663 
664 void KEncodingDetector::setAutoDetectLanguage( KEncodingDetector::AutoDetectScript lang)
665 {
666  d->m_autoDetectLanguage=lang;
667 }
668 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
669 {
670  return d->m_autoDetectLanguage;
671 }
672 
673 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
674 {
675  return d->m_source;
676 }
677 
678 const char* KEncodingDetector::encoding() const
679 {
680  d->m_storeDecoderName = d->m_codec->name();
681  return d->m_storeDecoderName.constData();
682 }
683 
684 bool KEncodingDetector::visuallyOrdered() const
685 {
686  return d->m_visualRTL;
687 }
688 
689 // const QTextCodec* KEncodingDetector::codec() const
690 // {
691 // return d->m_codec;
692 // }
693 
694 QTextDecoder* KEncodingDetector::decoder()
695 {
696  return d->m_decoder;
697 }
698 
699 void KEncodingDetector::resetDecoder()
700 {
701  assert(d->m_defaultCodec);
702  d->m_bufferForDefferedEncDetection.clear();
703  d->m_writtingHappened = false;
704  d->m_analyzeCalled = false;
705  d->m_multiByte = 0;
706  delete d->m_decoder;
707  if (!d->m_codec)
708  d->m_codec = d->m_defaultCodec;
709  d->m_decoder = d->m_codec->makeDecoder();
710 }
711 
712 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
713 {
714  QTextCodec *codec;
715  QByteArray enc(_encoding);
716  if(/*enc.isNull() || */enc.isEmpty())
717  {
718  if (type==DefaultEncoding)
719  codec=d->m_defaultCodec;
720  else
721  return false;
722  }
723  else
724  {
725  //QString->QTextCodec
726 
727  enc = enc.toLower();
728  // hebrew visually ordered
729  if(enc=="visual")
730  enc="iso8859-8";
731  bool b;
732  codec = KGlobal::charsets()->codecForName(QLatin1String(enc), b);
733  if (!b)
734  return false;
735  }
736 
737  if (d->m_codec->mibEnum()==codec->mibEnum())
738  {
739  // We already have the codec, but we still want to re-set the type,
740  // as we may have overwritten a default with a detected
741  d->m_source = type;
742  return true;
743  }
744 
745  if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
746  {
747  //Sometimes the codec specified is absurd, i.e. UTF-16 despite
748  //us decoding a meta tag as ASCII. In that case, ignore it.
749  return false;
750  }
751 
752  if (codec->mibEnum() == Mib8859_8)
753  {
754  //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
755  codec = QTextCodec::codecForName("iso8859-8-i");
756 
757  // visually ordered unless one of the following
758  if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
759  d->m_visualRTL = true;
760  }
761 
762  d->m_codec = codec;
763  d->m_source = type;
764  delete d->m_decoder;
765  d->m_decoder = d->m_codec->makeDecoder();
766 #ifdef DECODE_DEBUG
767  kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name();
768 #endif
769  return true;
770 }
771 
772 QString KEncodingDetector::decode(const char *data, int len)
773 {
774  processNull(const_cast<char *>(data),len);
775  if (!d->m_analyzeCalled)
776  {
777  analyze(data,len);
778  d->m_analyzeCalled=true;
779  }
780 
781  return d->m_decoder->toUnicode(data,len);
782 }
783 
784 QString KEncodingDetector::decode(const QByteArray &data)
785 {
786  processNull(const_cast<char *>(data.data()),data.size());
787  if (!d->m_analyzeCalled)
788  {
789  analyze(data.data(),data.size());
790  d->m_analyzeCalled=true;
791  }
792 
793  return d->m_decoder->toUnicode(data);
794 }
795 
796 QString KEncodingDetector::decodeWithBuffering(const char *data, int len)
797 {
798 #ifdef DECODE_DEBUG
799  kWarning() << "KEncodingDetector: decoding "<<len<<" bytes";
800 #endif
801  if (d->m_writtingHappened)
802  {
803 #ifdef DECODE_DEBUG
804  kWarning() << "KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name();
805 #endif
806  processNull(const_cast<char *>(data),len);
807  return d->m_decoder->toUnicode(data, len);
808  }
809  else
810  {
811  if (d->m_bufferForDefferedEncDetection.isEmpty())
812  {
813  // If encoding detection produced something, and we either got to the body or
814  // actually saw the encoding explicitly, we're done.
815  if (analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding()))
816  {
817 #ifdef DECODE_DEBUG
818  kWarning() << "KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name();
819 #endif
820  processNull(const_cast<char *>(data),len);
821  d->m_writtingHappened=true;
822  return d->m_decoder->toUnicode(data, len);
823  }
824  else
825  {
826 #ifdef DECODE_DEBUG
827  kWarning() << "KEncodingDetector: begin deffer";
828 #endif
829  d->m_bufferForDefferedEncDetection=data;
830  }
831  }
832  else
833  {
834  d->m_bufferForDefferedEncDetection+=data;
835  // As above, but also limit the buffer size. We must use the entire buffer here,
836  // since the boundaries might split the meta tag, etc.
837  bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
838  if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
839  d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER)
840  {
841  d->m_writtingHappened=true;
842  d->m_bufferForDefferedEncDetection.replace('\0',' ');
843  QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
844  d->m_bufferForDefferedEncDetection.clear();
845 #ifdef DECODE_DEBUG
846  kWarning() << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
847 #endif
848  return result;
849  }
850  }
851  }
852 
853  return QString();
854 }
855 
856 bool KEncodingDetector::decodedInvalidCharacters() const
857 {
858  return d->m_decoder ? d->m_decoder->hasFailure() : false;
859 }
860 
861 QString KEncodingDetector::flush()
862 {
863  if (d->m_bufferForDefferedEncDetection.isEmpty())
864  return QString();
865 
866  d->m_bufferForDefferedEncDetection.replace('\0',' ');
867  QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
868  d->m_bufferForDefferedEncDetection.clear();
869 #ifdef DECODE_DEBUG
870  kWarning() << "KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<" bytes "<< d->m_codec->name();
871 #endif
872  return result;
873 }
874 
875 bool KEncodingDetector::analyze(const char *data, int len)
876 {
877  // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
878  // maximumBOMLength = 10
879  // Even if the user has chosen utf16 we still need to auto-detect the endianness
880  if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
881  {
882  // Extract the first three bytes.
883  const uchar *udata = (const uchar *)data;
884  uchar c1 = *udata++;
885  uchar c2 = *udata++;
886  uchar c3 = *udata++;
887 
888  // Check for the BOM
889  const char *autoDetectedEncoding;
890  if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
891  {
892  autoDetectedEncoding = "UTF-16";
893  }
894  else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
895  {
896  autoDetectedEncoding = "UTF-8";
897  }
898  else if (c1 == 0x00 || c2 == 0x00)
899  {
900  uchar c4 = *udata++;
901  uchar c5 = *udata++;
902  uchar c6 = *udata++;
903  uchar c7 = *udata++;
904  uchar c8 = *udata++;
905  uchar c9 = *udata++;
906  uchar c10 = *udata++;
907 
908  int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
909  int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
910  if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
911  autoDetectedEncoding = "UTF-16";
912  else
913  autoDetectedEncoding = 0;
914  }
915  else
916  {
917  autoDetectedEncoding = 0;
918  }
919 
920  // If we found a BOM, use the encoding it implies.
921  if (autoDetectedEncoding != 0)
922  {
923  d->m_source = BOM;
924  d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
925  assert(d->m_codec);
926  //enc = d->m_codec->name();
927  delete d->m_decoder;
928  d->m_decoder = d->m_codec->makeDecoder();
929 #ifdef DECODE_DEBUG
930  kWarning() << "Detection by BOM";
931 #endif
932  if (is16Bit(d->m_codec) && c2==0x00)
933  {
934  // utf16LE, we need to put the decoder in LE mode
935  char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
936  d->m_decoder->toUnicode(reverseUtf16, 2);
937  }
938  return true;
939  }
940  }
941 
942  //exit from routine in case it was called to only detect byte order for utf-16
943  if (d->m_source==UserChosenEncoding)
944  {
945 #ifdef DECODE_DEBUG
946  kWarning() << "KEncodingDetector: UserChosenEncoding exit ";
947 #endif
948 
949  if (errorsIfUtf8(data, len))
950  setEncoding("",DefaultEncoding);
951  return true;
952  }
953 
954  // HTTP header takes precedence over meta-type stuff
955  if (d->m_source==EncodingFromHTTPHeader)
956  return true;
957 
958  if (!d->m_seenBody)
959  {
960  // we still don't have an encoding, and are in the head
961  // the following tags are allowed in <head>:
962  // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
963  const char *ptr = data;
964  const char *pEnd = data+len;
965 
966  while(ptr != pEnd)
967  {
968  if(*ptr!='<')
969  {
970  ++ptr;
971  continue;
972  }
973  ++ptr;
974  // Handle comments.
975  if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
976  {
977  ptr += 3;
978  skipComment(ptr, pEnd);
979  continue;
980  }
981 
982  // Handle XML header, which can have encoding in it.
983  if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
984  {
985  const char *end = ptr;
986  while (*end != '>' && end < pEnd)
987  end++;
988  if (*end == '\0' || end == pEnd)
989  break;
990  QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
991  int length;
992  int pos = findXMLEncoding(str, length);
993  // also handles the case when specified encoding aint correct
994  if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
995  {
996  return true;
997  }
998  }
999 
1000  //look for <meta>, stop if we reach <body>
1001  while (
1002  !(((*ptr >= 'a') && (*ptr <= 'z')) ||
1003  ((*ptr >= 'A') && (*ptr <= 'Z')))
1004  && ptr < pEnd
1005  )
1006  ++ptr;
1007 
1008  char tmp[5];
1009  int length=0;
1010  const char* max=ptr+4;
1011  if (pEnd<max)
1012  max=pEnd;
1013  while (
1014  (((*ptr >= 'a') && (*ptr <= 'z')) ||
1015  ((*ptr >= 'A') && (*ptr <= 'Z')) ||
1016  ((*ptr >= '0') && (*ptr <= '9')))
1017  && ptr < max
1018  )
1019  {
1020  tmp[length] = tolower( *ptr );
1021  ++ptr;
1022  ++length;
1023  }
1024  tmp[length] = 0;
1025  if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
1026  {
1027  // found a meta tag...
1028  const char* end = ptr;
1029  while(*end != '>' && *end != '\0' && end<pEnd)
1030  end++;
1031  //if ( *end == '\0' ) break;
1032  QByteArray str( ptr, (end-ptr)+1);
1033  str = str.toLower();
1034  const int strLength = str.length();
1035  int pos=0;
1036  //if( (pos = str.find("http-equiv", pos)) == -1) break;
1037  //if( (pos = str.find("content-type", pos)) == -1) break;
1038  if( (pos = str.indexOf("charset")) == -1)
1039  continue;
1040  pos+=6;
1041  // skip to '='
1042  if( (pos = str.indexOf("=", pos)) == -1)
1043  continue;
1044 
1045  // skip '='
1046  ++pos;
1047 
1048  // skip whitespace before encoding itself
1049  while (pos < strLength && str[pos] <= ' ')
1050  ++pos;
1051 
1052  // there may also be an opening quote, if this is a charset= and not a http-equiv.
1053  if (pos < strLength && (str[pos] == '"' || str[pos] == '\''))
1054  ++pos;
1055 
1056  // skip whitespace
1057  while (pos < strLength && str[pos] <= ' ')
1058  ++pos;
1059 
1060  if ( pos == strLength)
1061  continue;
1062 
1063  int endpos = pos;
1064  while( endpos < strLength &&
1065  (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1066  && str[endpos] != ';' && str[endpos] != '>') )
1067  ++endpos;
1068  #ifdef DECODE_DEBUG
1069  kDebug( 6005 ) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1070  #endif
1071  if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
1072  return true;
1073  }
1074  else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
1075  {
1076  d->m_seenBody=true;
1077  break;
1078  }
1079  }
1080  }
1081 
1082  if (len<20)
1083  return false;
1084 
1085 #ifdef DECODE_DEBUG
1086  kDebug( 6005 ) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
1087 #endif
1088 
1089  switch ( d->m_autoDetectLanguage)
1090  {
1091  case KEncodingDetector::Arabic:
1092  return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1093 // break;
1094  case KEncodingDetector::Baltic:
1095  return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1096 // break;
1097  case KEncodingDetector::CentralEuropean:
1098  return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
1099  break;
1100  case KEncodingDetector::Cyrillic:
1101  return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
1102 // break;
1103  case KEncodingDetector::Greek:
1104  return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
1105 // break;
1106  case KEncodingDetector::Hebrew:
1107  return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
1108 // break;
1109  case KEncodingDetector::Japanese:
1110  return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
1111 // break;
1112  case KEncodingDetector::Turkish:
1113  return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
1114 // break;
1115  case KEncodingDetector::WesternEuropean:
1116  if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
1117  return true;
1118  else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml
1119  {
1120  return setEncoding("iso-8859-15",AutoDetectedEncoding);
1121  }
1122  else //use default provided by eg katepart
1123  {
1124  return setEncoding("",DefaultEncoding);
1125  }
1126 // break;
1127  case KEncodingDetector::SemiautomaticDetection:
1128  case KEncodingDetector::ChineseSimplified:
1129  case KEncodingDetector::ChineseTraditional:
1130  case KEncodingDetector::Korean:
1131  case KEncodingDetector::Thai:
1132  case KEncodingDetector::Unicode:
1133  case KEncodingDetector::NorthernSaami:
1134  case KEncodingDetector::SouthEasternEurope:
1135  case KEncodingDetector::None:
1136  // huh. somethings broken in this code ### FIXME
1137  //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1138  break;
1139  }
1140 
1141  return true;
1142 }
1143 
1144 
1145 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString& lang)
1146 {
1147  if (lang.isEmpty())
1148  return KEncodingDetector::None;
1149  else if (lang==i18nc("@item Text character set", "Unicode"))
1150  return KEncodingDetector::Unicode;
1151  else if (lang==i18nc("@item Text character set", "Cyrillic"))
1152  return KEncodingDetector::Cyrillic;
1153  else if (lang==i18nc("@item Text character set", "Western European"))
1154  return KEncodingDetector::WesternEuropean;
1155  else if (lang==i18nc("@item Text character set", "Central European"))
1156  return KEncodingDetector::CentralEuropean;
1157  else if (lang==i18nc("@item Text character set", "Greek"))
1158  return KEncodingDetector::Greek;
1159  else if (lang==i18nc("@item Text character set", "Hebrew"))
1160  return KEncodingDetector::Hebrew;
1161  else if (lang==i18nc("@item Text character set", "Turkish"))
1162  return KEncodingDetector::Turkish;
1163  else if (lang==i18nc("@item Text character set", "Japanese"))
1164  return KEncodingDetector::Japanese;
1165  else if (lang==i18nc("@item Text character set", "Baltic"))
1166  return KEncodingDetector::Baltic;
1167  else if (lang==i18nc("@item Text character set", "Arabic"))
1168  return KEncodingDetector::Arabic;
1169 
1170  return KEncodingDetector::None;
1171 }
1172 
1173 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
1174 {
1175  switch (script)
1176  {
1177  case KEncodingDetector::Arabic:
1178  return true;
1179  case KEncodingDetector::Baltic:
1180  return true;
1181  case KEncodingDetector::CentralEuropean:
1182  return true;
1183  case KEncodingDetector::Cyrillic:
1184  return true;
1185  case KEncodingDetector::Greek:
1186  return true;
1187  case KEncodingDetector::Hebrew:
1188  return true;
1189  case KEncodingDetector::Japanese:
1190  return true;
1191  case KEncodingDetector::Turkish:
1192  return true;
1193  case KEncodingDetector::WesternEuropean:
1194  return true;
1195  case KEncodingDetector::ChineseTraditional:
1196  return true;
1197  case KEncodingDetector::ChineseSimplified:
1198  return true;
1199  case KEncodingDetector::Unicode:
1200  return true;
1201  break;
1202  default:
1203  return false;
1204  }
1205 }
1206 
1207 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
1208 {
1209  switch (script)
1210  {
1211  case KEncodingDetector::Arabic:
1212  return i18nc("@item Text character set", "Arabic");
1213  break;
1214  case KEncodingDetector::Baltic:
1215  return i18nc("@item Text character set", "Baltic");
1216  break;
1217  case KEncodingDetector::CentralEuropean:
1218  return i18nc("@item Text character set", "Central European");
1219  break;
1220  case KEncodingDetector::Cyrillic:
1221  return i18nc("@item Text character set", "Cyrillic");
1222  break;
1223  case KEncodingDetector::Greek:
1224  return i18nc("@item Text character set", "Greek");
1225  break;
1226  case KEncodingDetector::Hebrew:
1227  return i18nc("@item Text character set", "Hebrew");
1228  break;
1229  case KEncodingDetector::Japanese:
1230  return i18nc("@item Text character set", "Japanese");
1231  break;
1232  case KEncodingDetector::Turkish:
1233  return i18nc("@item Text character set", "Turkish");
1234  break;
1235  case KEncodingDetector::WesternEuropean:
1236  return i18nc("@item Text character set", "Western European");
1237  break;
1238  case KEncodingDetector::ChineseTraditional:
1239  return i18nc("@item Text character set", "Chinese Traditional");
1240  break;
1241  case KEncodingDetector::ChineseSimplified:
1242  return i18nc("@item Text character set", "Chinese Simplified");
1243  break;
1244  case KEncodingDetector::Korean:
1245  return i18nc("@item Text character set", "Korean");
1246  break;
1247  case KEncodingDetector::Thai:
1248  return i18nc("@item Text character set", "Thai");
1249  break;
1250  case KEncodingDetector::Unicode:
1251  return i18nc("@item Text character set", "Unicode");
1252  break;
1253  //case KEncodingDetector::SemiautomaticDetection:
1254  default:
1255  return QString();
1256 
1257  }
1258 }
1259 
1260 #undef DECODE_DEBUG
1261 
KEncodingDetector::ChineseSimplified
Definition: kencodingdetector.h:79
KCharsets::codecForName
QTextCodec * codecForName(const QString &name) const
Provided for compatibility.
Definition: kcharsets.cpp:696
KEncodingDetector::AutoDetectedEncoding
Definition: kencodingdetector.h:64
KEncodingDetector::Japanese
Definition: kencodingdetector.h:84
KEncodingDetector::Arabic
Definition: kencodingdetector.h:76
MibUtf16
Definition: kencodingdetector.cpp:55
kcharsets.h
Mib8859_8
Definition: kencodingdetector.cpp:52
automaticDetectionForTurkish
static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:438
kdebug.h
KEncodingDetector::ChineseTraditional
Definition: kencodingdetector.h:80
KEncodingDetector::decodedInvalidCharacters
bool decodedInvalidCharacters() const
This method checks whether invalid characters were found during a decoding operation.
Definition: kencodingdetector.cpp:856
is16Bit
static bool is16Bit(QTextCodec *codec)
Definition: kencodingdetector.cpp:60
guess_ja_p.h
KEncodingDetector::Korean
Definition: kencodingdetector.h:85
QByteArray::toLower
QByteArray toLower() const
MibUtf16LE
Definition: kencodingdetector.cpp:57
KEncodingDetector
Provides encoding detection capabilities.
Definition: kencodingdetector.h:58
KEncodingDetector::decodeWithBuffering
QString decodeWithBuffering(const char *data, int len)
Convenience method that uses buffering.
Definition: kencodingdetector.cpp:796
QByteArray
MibUcs2
Definition: kencodingdetector.cpp:54
QByteArray::isNull
bool isNull() const
KEncodingDetector::setAutoDetectLanguage
void setAutoDetectLanguage(AutoDetectScript)
Definition: kencodingdetector.cpp:664
KEncodingDetector::DefaultEncoding
Definition: kencodingdetector.h:63
KEncodingDetector::EncodingChoiceSource
EncodingChoiceSource
Definition: kencodingdetector.h:61
QByteArray::isEmpty
bool isEmpty() const
KEncodingDetector::Cyrillic
Definition: kencodingdetector.h:81
KEncodingDetector::visuallyOrdered
bool visuallyOrdered() const
Definition: kencodingdetector.cpp:684
MibUtf8
Definition: kencodingdetector.cpp:53
MibUtf16BE
Definition: kencodingdetector.cpp:56
khtml::JapaneseCode
Definition: guess_ja_p.h:85
findXMLEncoding
static int findXMLEncoding(const QByteArray &str, int &encodingLength)
Definition: kencodingdetector.cpp:512
QByteArray::length
int length() const
klocale.h
automaticDetectionForBaltic
static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:148
KEncodingDetector::scriptForName
static AutoDetectScript scriptForName(const QString &lang)
Takes the language name after it has been i18n()'ed.
Definition: kencodingdetector.cpp:1145
KEncodingDetector::EncodingFromMetaTag
Definition: kencodingdetector.h:67
i18nc
QString i18nc(const char *ctxt, const char *text)
Returns a localized version of a string and a context.
Definition: klocalizedstring.h:797
KEncodingDetector::AutoDetectScript
AutoDetectScript
Definition: kencodingdetector.h:72
KEncodingDetector::Baltic
Definition: kencodingdetector.h:77
QByteArray::indexOf
int indexOf(char ch, int from) const
kglobal.h
automaticDetectionForCyrillic
static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:193
KEncodingDetector::encodingChoiceSource
EncodingChoiceSource encodingChoiceSource() const
Definition: kencodingdetector.cpp:673
KEncodingDetector::EncodingFromHTTPHeader
Definition: kencodingdetector.h:68
kencodingdetector.h
automaticDetectionForGreek
static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:389
KEncodingDetector::~KEncodingDetector
~KEncodingDetector()
Definition: kencodingdetector.cpp:659
QString::isEmpty
bool isEmpty() const
KEncodingDetector::Greek
Definition: kencodingdetector.h:82
khtml::JapaneseCode::guess_jp
enum Type guess_jp(const char *buf, int buflen)
Definition: guess_ja.cpp:305
KGlobal::charsets
KCharsets * charsets()
The global charset manager.
Definition: kglobal.cpp:214
automaticDetectionForCentralEuropean
static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:161
MIB
MIB
Definition: kencodingdetector.cpp:49
QString
QTextCodec
automaticDetectionForHebrew
static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:402
KEncodingDetector::decoder
QTextDecoder * decoder()
Definition: kencodingdetector.cpp:694
QByteArray::mid
QByteArray mid(int pos, int len) const
KEncodingDetector::KEncodingDetector
KEncodingDetector()
Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiau...
Definition: kencodingdetector.cpp:650
skipComment
static void skipComment(const char *&ptr, const char *pEnd)
Definition: kencodingdetector.cpp:478
KEncodingDetector::NorthernSaami
Definition: kencodingdetector.h:86
KEncodingDetector::processNull
bool processNull(char *data, int length)
This nice method will kill all 0 bytes (or double bytes) and remember if this was a binary or not ;) ...
Definition: kencodingdetector.cpp:556
MibLatin1
Definition: kencodingdetector.cpp:51
automaticDetectionForJapanese
static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:418
kWarning
#define kWarning
Definition: kdebug.h:322
QTextCodec::mibEnum
virtual int mibEnum() const =0
KEncodingDetector::SemiautomaticDetection
Definition: kencodingdetector.h:75
KEncodingDetector::WesternEuropean
Definition: kencodingdetector.h:91
KEncodingDetector::BOM
Definition: kencodingdetector.h:65
KEncodingDetector::setEncoding
bool setEncoding(const char *encoding, EncodingChoiceSource type)
Definition: kencodingdetector.cpp:712
KEncodingDetector::EncodingFromXMLHeader
Definition: kencodingdetector.h:66
KEncodingDetector::flush
QString flush()
Convenience method to be used with decodeForHtml.
Definition: kencodingdetector.cpp:861
KEncodingDetector::hasAutoDetectionForScript
static bool hasAutoDetectionForScript(AutoDetectScript)
Definition: kencodingdetector.cpp:1173
KEncodingDetector::errorsIfUtf8
bool errorsIfUtf8(const char *data, int length)
Check if we are really utf8.
Definition: kencodingdetector.cpp:585
QLatin1String
KEncodingDetector::resetDecoder
void resetDecoder()
Resets the decoder.
Definition: kencodingdetector.cpp:699
KEncodingDetector::decode
QString decode(const char *data, int len)
The main class method.
Definition: kencodingdetector.cpp:772
QTextCodec::codecForName
QTextCodec * codecForName(const QByteArray &name)
KEncodingDetector::CentralEuropean
Definition: kencodingdetector.h:78
KEncodingDetector::Hebrew
Definition: kencodingdetector.h:83
KEncodingDetector::encoding
const char * encoding() const
Convenience method.
Definition: kencodingdetector.cpp:678
KEncodingDetector::analyze
bool analyze(const char *data, int len)
Analyze text data.
Definition: kencodingdetector.cpp:875
QByteArray::data
char * data()
automaticDetectionForWesternEuropean
static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:449
KEncodingDetector::SouthEasternEurope
Definition: kencodingdetector.h:87
QTextDecoder
kDebug
#define kDebug
Definition: kdebug.h:316
MAX_BUFFER
#define MAX_BUFFER
Definition: kencodingdetector.cpp:33
KEncodingDetector::Thai
Definition: kencodingdetector.h:88
automaticDetectionForArabic
static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:134
QByteArray::size
int size() const
KEncodingDetector::None
Definition: kencodingdetector.h:74
KEncodingDetector::autoDetectLanguage
AutoDetectScript autoDetectLanguage() const
Definition: kencodingdetector.cpp:668
KEncodingDetector::Turkish
Definition: kencodingdetector.h:89
KEncodingDetector::nameForScript
static QString nameForScript(AutoDetectScript)
Definition: kencodingdetector.cpp:1207
KEncodingDetector::UserChosenEncoding
Definition: kencodingdetector.h:69
KEncodingDetector::Unicode
Definition: kencodingdetector.h:90
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Mon Jun 22 2020 13:22:11 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal