00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "encodingdetector.h"
00029
00030 #undef DECODE_DEBUG
00031
00032
00033 #define MAX_BUFFER 16*1024
00034
00035 #include <assert.h>
00036 #include <stdlib.h>
00037
00038 #include "encodingdetector_ja_p.h"
00039
00040 #include <qregexp.h>
00041 #include <qtextcodec.h>
00042
00043 #include <kglobal.h>
00044 #include <kcharsets.h>
00045 #include <kdebug.h>
00046 #include <klocale.h>
00047
00048 #include <ctype.h>
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065 typedef struct _PangoScriptForLang {
00066 const char lang[6];
00067 EncodingDetector::AutoDetectScript scripts[1];
00068 } PangoScriptForLang;
00069
00070
00071
00072
00073
00074
00075
/*
 * Script names used by the language table, mapped onto
 * EncodingDetector::AutoDetectScript values.
 *
 * Group 1: scripts that all expand to EncodingDetector::None.
 */
#define PANGO_SCRIPT_ARMENIAN             EncodingDetector::None
#define PANGO_SCRIPT_BENGALI              EncodingDetector::None
#define PANGO_SCRIPT_CANADIAN_ABORIGINAL  EncodingDetector::None
#define PANGO_SCRIPT_CHEROKEE             EncodingDetector::None
#define PANGO_SCRIPT_DEVANAGARI           EncodingDetector::None
#define PANGO_SCRIPT_ETHIOPIC             EncodingDetector::None
#define PANGO_SCRIPT_GUJARATI             EncodingDetector::None
#define PANGO_SCRIPT_GURMUKHI             EncodingDetector::None
#define PANGO_SCRIPT_KANNADA              EncodingDetector::None
#define PANGO_SCRIPT_KHMER                EncodingDetector::None
#define PANGO_SCRIPT_LAO                  EncodingDetector::None
#define PANGO_SCRIPT_MALAYALAM            EncodingDetector::None
#define PANGO_SCRIPT_MONGOLIAN            EncodingDetector::None
#define PANGO_SCRIPT_MYANMAR              EncodingDetector::None
#define PANGO_SCRIPT_ORIYA                EncodingDetector::None
#define PANGO_SCRIPT_SINHALA              EncodingDetector::None
#define PANGO_SCRIPT_SYRIAC               EncodingDetector::None
#define PANGO_SCRIPT_TAGALOG              EncodingDetector::None
#define PANGO_SCRIPT_TAMIL                EncodingDetector::None
#define PANGO_SCRIPT_TIBETAN              EncodingDetector::None
#define PANGO_SCRIPT_TELUGU               EncodingDetector::None

/*
 * Group 2: scripts that expand to a dedicated AutoDetectScript value.
 * Note that GEORGIAN expands to SouthEasternEurope and LATIN to
 * WesternEuropean.
 */
#define PANGO_SCRIPT_ARABIC               EncodingDetector::Arabic
#define PANGO_SCRIPT_CYRILLIC             EncodingDetector::Cyrillic
#define PANGO_SCRIPT_GEORGIAN             EncodingDetector::SouthEasternEurope
#define PANGO_SCRIPT_GREEK                EncodingDetector::Greek
#define PANGO_SCRIPT_HEBREW               EncodingDetector::Hebrew
#define PANGO_SCRIPT_LATIN                EncodingDetector::WesternEuropean
#define PANGO_SCRIPT_THAI                 EncodingDetector::Thai
00106
00107
00108 static const PangoScriptForLang pango_script_for_lang[] = {
00109 { "aa", { PANGO_SCRIPT_LATIN } },
00110 { "ab", { PANGO_SCRIPT_CYRILLIC } },
00111 { "af", { PANGO_SCRIPT_LATIN } },
00112 { "am", { PANGO_SCRIPT_ETHIOPIC } },
00113 { "ar", { PANGO_SCRIPT_ARABIC } },
00114 { "as", { PANGO_SCRIPT_BENGALI } },
00115 { "ast", { PANGO_SCRIPT_LATIN } },
00116 { "ava", { PANGO_SCRIPT_CYRILLIC } },
00117 { "ay", { PANGO_SCRIPT_LATIN } },
00118 { "az-ir", { PANGO_SCRIPT_ARABIC } },
00119 { "az", { PANGO_SCRIPT_CYRILLIC } },
00120 { "bam", { PANGO_SCRIPT_LATIN } },
00121 { "ba", { PANGO_SCRIPT_CYRILLIC } },
00122 { "be", { PANGO_SCRIPT_CYRILLIC } },
00123 { "bg", { PANGO_SCRIPT_CYRILLIC } },
00124 { "bh", { PANGO_SCRIPT_DEVANAGARI } },
00125 { "bho", { PANGO_SCRIPT_DEVANAGARI } },
00126 { "bi", { PANGO_SCRIPT_LATIN } },
00127 { "bin", { PANGO_SCRIPT_LATIN } },
00128 { "bn", { PANGO_SCRIPT_BENGALI } },
00129 { "bo", { PANGO_SCRIPT_TIBETAN } },
00130 { "br", { PANGO_SCRIPT_LATIN } },
00131 { "bs", { PANGO_SCRIPT_LATIN } },
00132 { "bua", { PANGO_SCRIPT_CYRILLIC } },
00133 { "ca", { PANGO_SCRIPT_LATIN } },
00134 { "ce", { PANGO_SCRIPT_CYRILLIC } },
00135 { "chm", { PANGO_SCRIPT_CYRILLIC } },
00136 { "chr", { PANGO_SCRIPT_CHEROKEE } },
00137 { "ch", { PANGO_SCRIPT_LATIN } },
00138 { "co", { PANGO_SCRIPT_LATIN } },
00139 { "cs", { PANGO_SCRIPT_LATIN } },
00140 { "cu", { PANGO_SCRIPT_CYRILLIC } },
00141 { "cv", { PANGO_SCRIPT_CYRILLIC } },
00142 { "cy", { PANGO_SCRIPT_LATIN } },
00143 { "da", { PANGO_SCRIPT_LATIN } },
00144 { "de", { PANGO_SCRIPT_LATIN } },
00145 { "dz", { PANGO_SCRIPT_TIBETAN } },
00146 { "el", { PANGO_SCRIPT_GREEK } },
00147 { "en", { PANGO_SCRIPT_LATIN } },
00148 { "eo", { PANGO_SCRIPT_LATIN } },
00149 { "es", { PANGO_SCRIPT_LATIN } },
00150
00151 { "et", { EncodingDetector::Baltic } },
00152 { "eu", { PANGO_SCRIPT_LATIN } },
00153 { "fa", { PANGO_SCRIPT_ARABIC } },
00154 { "fi", { PANGO_SCRIPT_LATIN } },
00155 { "fj", { PANGO_SCRIPT_LATIN } },
00156 { "fo", { PANGO_SCRIPT_LATIN } },
00157 { "fr", { PANGO_SCRIPT_LATIN } },
00158 { "ful", { PANGO_SCRIPT_LATIN } },
00159 { "fur", { PANGO_SCRIPT_LATIN } },
00160 { "fy", { PANGO_SCRIPT_LATIN } },
00161 { "ga", { PANGO_SCRIPT_LATIN } },
00162 { "gd", { PANGO_SCRIPT_LATIN } },
00163 { "gez", { PANGO_SCRIPT_ETHIOPIC } },
00164 { "gl", { PANGO_SCRIPT_LATIN } },
00165 { "gn", { PANGO_SCRIPT_LATIN } },
00166 { "gu", { PANGO_SCRIPT_GUJARATI } },
00167 { "gv", { PANGO_SCRIPT_LATIN } },
00168 { "ha", { PANGO_SCRIPT_LATIN } },
00169 { "haw", { PANGO_SCRIPT_LATIN } },
00170 { "he", { PANGO_SCRIPT_HEBREW } },
00171 { "hi", { PANGO_SCRIPT_DEVANAGARI } },
00172 { "ho", { PANGO_SCRIPT_LATIN } },
00173 { "hr", { PANGO_SCRIPT_LATIN } },
00174 { "hu", { PANGO_SCRIPT_LATIN } },
00175 { "hy", { PANGO_SCRIPT_ARMENIAN } },
00176 { "ia", { PANGO_SCRIPT_LATIN } },
00177 { "ibo", { PANGO_SCRIPT_LATIN } },
00178 { "id", { PANGO_SCRIPT_LATIN } },
00179 { "ie", { PANGO_SCRIPT_LATIN } },
00180 { "ik", { PANGO_SCRIPT_CYRILLIC } },
00181 { "io", { PANGO_SCRIPT_LATIN } },
00182 { "is", { PANGO_SCRIPT_LATIN } },
00183 { "it", { PANGO_SCRIPT_LATIN } },
00184 { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL } },
00185
00186 { "ja", { EncodingDetector::Japanese } },
00187 { "kaa", { PANGO_SCRIPT_CYRILLIC } },
00188 { "ka", { PANGO_SCRIPT_GEORGIAN } },
00189 { "ki", { PANGO_SCRIPT_LATIN } },
00190 { "kk", { PANGO_SCRIPT_CYRILLIC } },
00191 { "kl", { PANGO_SCRIPT_LATIN } },
00192 { "km", { PANGO_SCRIPT_KHMER } },
00193 { "kn", { PANGO_SCRIPT_KANNADA } },
00194
00195 { "ko", { EncodingDetector::Korean } },
00196 { "kok", { PANGO_SCRIPT_DEVANAGARI } },
00197 { "ks", { PANGO_SCRIPT_DEVANAGARI } },
00198 { "ku-ir", { PANGO_SCRIPT_ARABIC } },
00199 { "ku", { PANGO_SCRIPT_CYRILLIC } },
00200 { "kum", { PANGO_SCRIPT_CYRILLIC } },
00201 { "kv", { PANGO_SCRIPT_CYRILLIC } },
00202 { "kw", { PANGO_SCRIPT_LATIN } },
00203 { "ky", { PANGO_SCRIPT_CYRILLIC } },
00204 { "la", { PANGO_SCRIPT_LATIN } },
00205 { "lb", { PANGO_SCRIPT_LATIN } },
00206 { "lez", { PANGO_SCRIPT_CYRILLIC } },
00207 { "ln", { PANGO_SCRIPT_LATIN } },
00208 { "lo", { PANGO_SCRIPT_LAO } },
00209
00210 { "lt", { EncodingDetector::Baltic } },
00211
00212 { "lv", { EncodingDetector::Baltic } },
00213 { "mg", { PANGO_SCRIPT_LATIN } },
00214 { "mh", { PANGO_SCRIPT_LATIN } },
00215 { "mi", { PANGO_SCRIPT_LATIN } },
00216 { "mk", { PANGO_SCRIPT_CYRILLIC } },
00217 { "ml", { PANGO_SCRIPT_MALAYALAM } },
00218 { "mn", { PANGO_SCRIPT_MONGOLIAN } },
00219 { "mo", { PANGO_SCRIPT_CYRILLIC } },
00220 { "mr", { PANGO_SCRIPT_DEVANAGARI } },
00221 { "mt", { PANGO_SCRIPT_LATIN } },
00222 { "my", { PANGO_SCRIPT_MYANMAR } },
00223 { "nb", { PANGO_SCRIPT_LATIN } },
00224 { "nds", { PANGO_SCRIPT_LATIN } },
00225 { "ne", { PANGO_SCRIPT_DEVANAGARI } },
00226 { "nl", { PANGO_SCRIPT_LATIN } },
00227 { "nn", { PANGO_SCRIPT_LATIN } },
00228 { "no", { PANGO_SCRIPT_LATIN } },
00229 { "nr", { PANGO_SCRIPT_LATIN } },
00230 { "nso", { PANGO_SCRIPT_LATIN } },
00231 { "ny", { PANGO_SCRIPT_LATIN } },
00232 { "oc", { PANGO_SCRIPT_LATIN } },
00233 { "om", { PANGO_SCRIPT_LATIN } },
00234 { "or", { PANGO_SCRIPT_ORIYA } },
00235 { "os", { PANGO_SCRIPT_CYRILLIC } },
00236 { "pa", { PANGO_SCRIPT_GURMUKHI } },
00237 { "pl", { PANGO_SCRIPT_LATIN } },
00238 { "ps-af", { PANGO_SCRIPT_ARABIC } },
00239 { "ps-pk", { PANGO_SCRIPT_ARABIC } },
00240 { "pt", { PANGO_SCRIPT_LATIN } },
00241 { "rm", { PANGO_SCRIPT_LATIN } },
00242 { "ro", { PANGO_SCRIPT_LATIN } },
00243 { "ru", { PANGO_SCRIPT_CYRILLIC } },
00244 { "sah", { PANGO_SCRIPT_CYRILLIC } },
00245 { "sa", { PANGO_SCRIPT_DEVANAGARI } },
00246 { "sco", { PANGO_SCRIPT_LATIN } },
00247 { "sel", { PANGO_SCRIPT_CYRILLIC } },
00248 { "se", { PANGO_SCRIPT_LATIN } },
00249 { "sh", { PANGO_SCRIPT_CYRILLIC } },
00250 { "si", { PANGO_SCRIPT_SINHALA } },
00251 { "sk", { PANGO_SCRIPT_LATIN } },
00252 { "sl", { PANGO_SCRIPT_LATIN } },
00253 { "sma", { PANGO_SCRIPT_LATIN } },
00254 { "smj", { PANGO_SCRIPT_LATIN } },
00255 { "smn", { PANGO_SCRIPT_LATIN } },
00256 { "sms", { PANGO_SCRIPT_LATIN } },
00257 { "sm", { PANGO_SCRIPT_LATIN } },
00258 { "so", { PANGO_SCRIPT_LATIN } },
00259 { "sq", { PANGO_SCRIPT_LATIN } },
00260 { "sr", { PANGO_SCRIPT_CYRILLIC } },
00261 { "ss", { PANGO_SCRIPT_LATIN } },
00262 { "st", { PANGO_SCRIPT_LATIN } },
00263 { "sv", { PANGO_SCRIPT_LATIN } },
00264 { "sw", { PANGO_SCRIPT_LATIN } },
00265 { "syr", { PANGO_SCRIPT_SYRIAC } },
00266 { "ta", { PANGO_SCRIPT_TAMIL } },
00267 { "te", { PANGO_SCRIPT_TELUGU } },
00268 { "tg", { PANGO_SCRIPT_CYRILLIC } },
00269 { "th", { PANGO_SCRIPT_THAI } },
00270 { "ti-er", { PANGO_SCRIPT_ETHIOPIC } },
00271 { "ti-et", { PANGO_SCRIPT_ETHIOPIC } },
00272 { "tig", { PANGO_SCRIPT_ETHIOPIC } },
00273 { "tk", { PANGO_SCRIPT_CYRILLIC } },
00274 { "tl", { PANGO_SCRIPT_TAGALOG } },
00275 { "tn", { PANGO_SCRIPT_LATIN } },
00276 { "to", { PANGO_SCRIPT_LATIN } },
00277
00278 { "tr", { EncodingDetector::Turkish } },
00279 { "ts", { PANGO_SCRIPT_LATIN } },
00280 { "tt", { PANGO_SCRIPT_CYRILLIC } },
00281 { "tw", { PANGO_SCRIPT_LATIN } },
00282 { "tyv", { PANGO_SCRIPT_CYRILLIC } },
00283 { "ug", { PANGO_SCRIPT_ARABIC } },
00284 { "uk", { PANGO_SCRIPT_CYRILLIC } },
00285 { "ur", { PANGO_SCRIPT_ARABIC } },
00286 { "uz", { PANGO_SCRIPT_CYRILLIC } },
00287 { "ven", { PANGO_SCRIPT_LATIN } },
00288 { "vi", { PANGO_SCRIPT_LATIN } },
00289 { "vot", { PANGO_SCRIPT_LATIN } },
00290 { "vo", { PANGO_SCRIPT_LATIN } },
00291 { "wa", { PANGO_SCRIPT_LATIN } },
00292 { "wen", { PANGO_SCRIPT_LATIN } },
00293 { "wo", { PANGO_SCRIPT_LATIN } },
00294 { "xh", { PANGO_SCRIPT_LATIN } },
00295 { "yap", { PANGO_SCRIPT_LATIN } },
00296 { "yi", { PANGO_SCRIPT_HEBREW } },
00297 { "yo", { PANGO_SCRIPT_LATIN } },
00298
00299 { "zh-cn", { EncodingDetector::ChineseSimplified } },
00300
00301 { "zh-hk", { EncodingDetector::ChineseTraditional } },
00302
00303 { "zh-mo", { EncodingDetector::ChineseTraditional } },
00304
00305 { "zh-sg", { EncodingDetector::ChineseSimplified } },
00306
00307 { "zh-tw", { EncodingDetector::ChineseTraditional } },
00308 { "zu", { PANGO_SCRIPT_LATIN } },
00309 { "\x00", { EncodingDetector::None } }
00310 };
00311
/**
 * Codec identifiers this file compares against QTextCodec::mibEnum().
 * Listed in ascending numeric order.
 */
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
00322
00323 static bool is16Bit(QTextCodec* codec)
00324 {
00325 switch (codec->mibEnum())
00326 {
00327 case MibUtf16:
00328 case MibUtf16BE:
00329 case MibUtf16LE:
00330 case MibUcs2:
00331 return true;
00332 default:
00333 return false;
00334 }
00335 }
00336
00337 class EncodingDetectorPrivate
00338 {
00339 public:
00340 QTextCodec *m_codec;
00341 QTextDecoder *m_decoder;
00342 QTextCodec *m_defaultCodec;
00343 QCString m_storeDecoderName;
00344
00345 EncodingDetector::EncodingChoiceSource m_source;
00346 EncodingDetector::AutoDetectScript m_autoDetectLanguage;
00347
00348 bool m_visualRTL : 1;
00349 bool m_seenBody : 1;
00350 bool m_writtingHappened : 1;
00351 bool m_analyzeCalled : 1;
00352 int m_multiByte;
00353
00354 QCString m_bufferForDefferedEncDetection;
00355
00356 EncodingDetectorPrivate()
00357 : m_codec(QTextCodec::codecForMib(MibLatin1))
00358 , m_decoder(m_codec->makeDecoder())
00359 , m_defaultCodec(m_codec)
00360 , m_source(EncodingDetector::DefaultEncoding)
00361 , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
00362 , m_visualRTL(false)
00363 , m_seenBody(false)
00364 , m_writtingHappened(false)
00365 , m_analyzeCalled(false)
00366 , m_multiByte(0)
00367 {
00368 }
00369
00370 EncodingDetectorPrivate(QTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
00371 : m_codec(codec)
00372 , m_decoder(m_codec->makeDecoder())
00373 , m_defaultCodec(m_codec)
00374 , m_source(source)
00375 , m_autoDetectLanguage(script)
00376 , m_visualRTL(false)
00377 , m_seenBody(false)
00378 , m_writtingHappened(false)
00379 , m_analyzeCalled(false)
00380 , m_multiByte(0)
00381 {
00382 }
00383
00384 ~EncodingDetectorPrivate()
00385 {
00386 delete m_decoder;
00387 }
00388 };
00389
00390
00391 static QCString automaticDetectionForArabic( const unsigned char* ptr, int size )
00392 {
00393 for ( int i = 0; i < size; ++i ) {
00394 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
00395 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
00396 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
00397 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
00398 return "cp1256";
00399 }
00400 }
00401
00402 return "iso-8859-6";
00403 }
00404
00405 static QCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
00406 {
00407 for ( int i = 0; i < size; ++i ) {
00408 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
00409 return "cp1257";
00410
00411 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
00412 return "iso-8859-13";
00413 }
00414
00415 return "iso-8859-13";
00416 }
00417
00418 static QCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
00419 {
00420 QCString charset;
00421 for ( int i = 0; i < size; ++i ) {
00422 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
00423 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
00424 return "ibm852";
00425
00426 if ( i + 1 > size )
00427 return "cp1250";
00428 else {
00429 charset = "cp1250";
00430 continue;
00431 }
00432 }
00433 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
00434 if ( i + 1 > size )
00435 return "iso-8859-2";
00436 else {
00437 if ( charset.isNull() )
00438 charset = "iso-8859-2";
00439 continue;
00440 }
00441 }
00442 }
00443
00444 if ( charset.isNull() )
00445 charset = "iso-8859-3";
00446
00447 return charset.data();
00448 }
00449
00450 static QCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
00451 {
00452 #ifdef DECODE_DEBUG
00453 kWarning() << "EncodingDetector: Cyr heuristics";
00454 #endif
00455
00456
00457
00458 int utf8_mark=0;
00459 int koi_score=0;
00460 int cp1251_score=0;
00461
00462 int koi_st=0;
00463 int cp1251_st=0;
00464
00465
00466
00467
00468 int koi_o_capital=0;
00469 int koi_o=0;
00470 int cp1251_o_capital=0;
00471 int cp1251_o=0;
00472
00473 int koi_a_capital=0;
00474 int koi_a=0;
00475 int cp1251_a_capital=0;
00476 int cp1251_a=0;
00477
00478 int koi_s_capital=0;
00479 int koi_s=0;
00480 int cp1251_s_capital=0;
00481 int cp1251_s=0;
00482
00483 int koi_i_capital=0;
00484 int koi_i=0;
00485 int cp1251_i_capital=0;
00486 int cp1251_i=0;
00487
00488 int cp1251_small_range=0;
00489 int koi_small_range=0;
00490 int ibm866_small_range=0;
00491
00492 int i;
00493 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
00494 {
00495 if (ptr[i]>0xdf)
00496 {
00497 ++cp1251_small_range;
00498
00499 if (ptr[i]==0xee)
00500 ++cp1251_o;
00501 else if (ptr[i]==0xe0)
00502 ++cp1251_a;
00503 else if (ptr[i]==0xe8)
00504 ++cp1251_i;
00505 else if (ptr[i]==0xf1)
00506 ++cp1251_s;
00507 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
00508 ++cp1251_st;
00509
00510 else if (ptr[i]==0xef)
00511 ++koi_o_capital;
00512 else if (ptr[i]==0xe1)
00513 ++koi_a_capital;
00514 else if (ptr[i]==0xe9)
00515 ++koi_i_capital;
00516 else if (ptr[i]==0xf3)
00517 ++koi_s_capital;
00518
00519 }
00520 else if (ptr[i]>0xbf)
00521 {
00522 ++koi_small_range;
00523
00524 if (ptr[i]==0xd0||ptr[i]==0xd1)
00525 ++utf8_mark;
00526 else if (ptr[i]==0xcf)
00527 ++koi_o;
00528 else if (ptr[i]==0xc1)
00529 ++koi_a;
00530 else if (ptr[i]==0xc9)
00531 ++koi_i;
00532 else if (ptr[i]==0xd3)
00533 ++koi_s;
00534 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
00535 ++koi_st;
00536
00537 else if (ptr[i]==0xce)
00538 ++cp1251_o_capital;
00539 else if (ptr[i]==0xc0)
00540 ++cp1251_a_capital;
00541 else if (ptr[i]==0xc8)
00542 ++cp1251_i_capital;
00543 else if (ptr[i]==0xd1)
00544 ++cp1251_s_capital;
00545 }
00546 else if (ptr[i]>0x9f && ptr[i]<0xb0)
00547 ++ibm866_small_range;
00548
00549 }
00550
00551
00552 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
00553 {
00554 return "";
00555 }
00556
00557 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
00558 {
00559 #ifdef DECODE_DEBUG
00560 kWarning() << "Cyr Enc Detection: UTF8";
00561 #endif
00562 return "UTF-8";
00563 }
00564
00565 if (ibm866_small_range>cp1251_small_range+koi_small_range)
00566 return "ibm866";
00567
00568
00569
00570
00571 if (cp1251_st==0 && koi_st>1)
00572 koi_score+=10;
00573 else if (koi_st==0 && cp1251_st>1)
00574 cp1251_score+=10;
00575
00576 if (cp1251_st && koi_st)
00577 {
00578 if (cp1251_st/koi_st>2)
00579 cp1251_score+=20;
00580 else if (koi_st/cp1251_st>2)
00581 koi_score+=20;
00582 }
00583
00584 if (cp1251_a>koi_a)
00585 cp1251_score+=10;
00586 else if (cp1251_a || koi_a)
00587 koi_score+=10;
00588
00589 if (cp1251_o>koi_o)
00590 cp1251_score+=10;
00591 else if (cp1251_o || koi_o)
00592 koi_score+=10;
00593
00594 if (cp1251_i>koi_i)
00595 cp1251_score+=10;
00596 else if (cp1251_i || koi_i)
00597 koi_score+=10;
00598
00599 if (cp1251_s>koi_s)
00600 cp1251_score+=10;
00601 else if (cp1251_s || koi_s)
00602 koi_score+=10;
00603
00604 if (cp1251_a_capital>koi_a_capital)
00605 cp1251_score+=9;
00606 else if (cp1251_a_capital || koi_a_capital)
00607 koi_score+=9;
00608
00609 if (cp1251_o_capital>koi_o_capital)
00610 cp1251_score+=9;
00611 else if (cp1251_o_capital || koi_o_capital)
00612 koi_score+=9;
00613
00614 if (cp1251_i_capital>koi_i_capital)
00615 cp1251_score+=9;
00616 else if (cp1251_i_capital || koi_i_capital)
00617 koi_score+=9;
00618
00619 if (cp1251_s_capital>koi_s_capital)
00620 cp1251_score+=9;
00621 else if (cp1251_s_capital || koi_s_capital)
00622 koi_score+=9;
00623 #ifdef DECODE_DEBUG
00624 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
00625 #endif
00626 if (abs(koi_score-cp1251_score)<10)
00627 {
00628
00629 cp1251_score=cp1251_small_range;
00630 koi_score=koi_small_range;
00631 }
00632 if (cp1251_score>koi_score)
00633 return "cp1251";
00634 else
00635 return "koi8-u";
00636
00637
00638
00639
00640
00641
00642
00643
00644 }
00645
00646 static QCString automaticDetectionForGreek( const unsigned char* ptr, int size )
00647 {
00648 for ( int i = 0; i < size; ++i ) {
00649 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
00650 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
00651 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
00652 return "cp1253";
00653 }
00654 }
00655
00656 return "iso-8859-7";
00657 }
00658
00659 static QCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
00660 {
00661 for ( int i = 0; i < size; ++i ) {
00662 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
00663 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
00664 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
00665 return "cp1255";
00666 }
00667
00668 if ( ptr[ i ] == 0xDF )
00669 return "iso-8859-8-i";
00670 }
00671
00672 return "iso-8859-8-i";
00673 }
00674
00675 static QCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
00676 {
00677 JapaneseCode kc;
00678
00679 switch ( kc.guess_jp( (const char*)ptr, size ) ) {
00680 case JapaneseCode::JIS:
00681 return "jis7";
00682 case JapaneseCode::EUC:
00683 return "eucjp";
00684 case JapaneseCode::SJIS:
00685 return "sjis";
00686 case JapaneseCode::UTF8:
00687 return "utf8";
00688 default:
00689 break;
00690 }
00691
00692 return "";
00693 }
00694
00695 static QCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
00696 {
00697 for ( int i = 0; i < size; ++i ) {
00698 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
00699 return "cp1254";
00700 }
00701 }
00702
00703 return "iso-8859-9";
00704 }
00705
00706 static QCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
00707 {
00708 uint nonansi_count=0;
00709 for (int i=0; i<size; ++i)
00710 {
00711 if (ptr[i]>0x79)
00712 {
00713 ++nonansi_count;
00714 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
00715 {
00716 return "UTF-8";
00717 }
00718 if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
00719 {
00720 return "cp1252";
00721 }
00722 }
00723
00724 }
00725
00726 if (nonansi_count>0)
00727 return "iso-8859-15";
00728
00729 return "";
00730 }
00731
00732
00733
/**
 * Advances @p ptr past the remainder of a comment.
 *
 * If the first character is '>', only that character is skipped.
 * Otherwise the text is scanned for the first "-->" or "--!>" and @p ptr
 * is left just behind it; when no terminator is found @p ptr ends up at
 * @p pEnd.
 *
 * No byte at or beyond @p pEnd is ever read, and @p ptr never moves past
 * @p pEnd (the previous version peeked up to three bytes past the end).
 *
 * @param ptr  in/out cursor; expected to point just after the comment opener
 * @param pEnd one past the last readable byte
 */
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    // nothing readable: leave the cursor alone
    if (p >= pEnd)
        return;

    if (*p=='>')
    {
        ++p;
    }
    else
    {
        while (p < pEnd)
        {
            if (*p=='-')
            {
                // "-->" needs three readable bytes starting at p
                if (pEnd - p >= 3 && p[1]=='-' && p[2]=='>')
                {
                    p += 3;
                    break;
                }
                // "--!>" needs four
                if (pEnd - p >= 4 && p[1]=='-' && p[2]=='!' && p[3]=='>')
                {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }
    ptr = p;
}
00766
00767
00768 static int findXMLEncoding(const QCString &str, int &encodingLength)
00769 {
00770 int len = str.length();
00771 int pos = str.find("encoding");
00772 if (pos == -1)
00773 return -1;
00774 pos += 8;
00775
00776
00777 while (pos<len && str[pos]<=' ')
00778 ++pos;
00779
00780
00781
00782 if (pos>=len || str[pos] != '=')
00783 return -1;
00784 ++pos;
00785
00786
00787 while (pos<len && str[pos]<=' ')
00788 ++pos;
00789
00790
00791 if (pos >= len)
00792 return -1;
00793
00794
00795 char quoteMark = str[pos];
00796 if (quoteMark != '"' && quoteMark != '\'')
00797 return -1;
00798 ++pos;
00799
00800
00801 int end=pos;
00802 while (end<len && str[end]!=quoteMark)
00803 ++end;
00804
00805 if (end>=len)
00806 return -1;
00807
00808 encodingLength = end-pos;
00809 return pos;
00810 }
00811
00812
00813 bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
00814 {
00815 if (d->m_codec->mibEnum()!=MibUtf8)
00816 return false;
00817
00818
00819
00820
00821
00822 static const unsigned char highest1Bits = 0x80;
00823 static const unsigned char highest2Bits = 0xC0;
00824 static const unsigned char highest3Bits = 0xE0;
00825 static const unsigned char highest4Bits = 0xF0;
00826 static const unsigned char highest5Bits = 0xF8;
00827
00828 for (int i=0; i<length; ++i)
00829 {
00830 unsigned char c = data[i];
00831
00832 if (d->m_multiByte>0)
00833 {
00834 if ((c & highest2Bits) == 0x80)
00835 {
00836 --(d->m_multiByte);
00837 continue;
00838 }
00839 #ifdef DECODE_DEBUG
00840 kWarning() << "EncDetector: Broken UTF8";
00841 #endif
00842 return true;
00843 }
00844
00845
00846 if ((c & highest1Bits) == 0x00)
00847 continue;
00848
00849
00850 if ((c & highest3Bits) == 0xC0)
00851 {
00852 d->m_multiByte = 1;
00853 continue;
00854 }
00855
00856
00857 if ((c & highest4Bits) == 0xE0)
00858 {
00859 d->m_multiByte = 2;
00860 continue;
00861 }
00862
00863
00864 if ((c & highest5Bits) == 0xF0)
00865 {
00866 d->m_multiByte = 3;
00867 continue;
00868 }
00869 #ifdef DECODE_DEBUG
00870 kWarning() << "EncDetector:_Broken UTF8";
00871 #endif
00872 return true;
00873 }
00874 return false;
00875 }
00876
00877 EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
00878 {
00879 }
00880
00881 EncodingDetector::EncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
00882 d(new EncodingDetectorPrivate(codec,source,script))
00883 {
00884 }
00885
00886 EncodingDetector::~EncodingDetector()
00887 {
00888 delete d;
00889 }
00890
00891 void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
00892 {
00893 d->m_autoDetectLanguage=lang;
00894 }
00895 EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
00896 {
00897 return d->m_autoDetectLanguage;
00898 }
00899
00900 EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
00901 {
00902 return d->m_source;
00903 }
00904
00905 const char* EncodingDetector::encoding() const
00906 {
00907 d->m_storeDecoderName = d->m_codec->name();
00908 return d->m_storeDecoderName.data();
00909 }
00910
00911 bool EncodingDetector::visuallyOrdered() const
00912 {
00913 return d->m_visualRTL;
00914 }
00915
00916
00917
00918
00919
00920
00921 QTextDecoder* EncodingDetector::decoder()
00922 {
00923 return d->m_decoder;
00924 }
00925
00926 bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
00927 {
00928 QTextCodec *codec;
00929 QCString enc(_encoding);
00930 if(enc.isEmpty())
00931 {
00932 if (type==DefaultEncoding)
00933 codec=d->m_defaultCodec;
00934 else
00935 return false;
00936 }
00937 else
00938 {
00939
00940
00941 enc = enc.lower();
00942
00943 if(enc=="visual")
00944 enc="iso8859-8";
00945 bool b;
00946 codec = KGlobal::charsets()->codecForName(enc, b);
00947 if (!b)
00948 return false;
00949 }
00950
00951 if (d->m_codec->mibEnum()==codec->mibEnum())
00952 return true;
00953
00954 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
00955 {
00956
00957
00958 return false;
00959 }
00960
00961 if (codec->mibEnum() == Mib8859_8)
00962 {
00963
00964 codec = QTextCodec::codecForName("iso8859-8-i");
00965
00966
00967 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
00968 d->m_visualRTL = true;
00969 }
00970
00971 d->m_codec = codec;
00972 d->m_source = type;
00973 delete d->m_decoder;
00974 d->m_decoder = d->m_codec->makeDecoder();
00975 #ifdef DECODE_DEBUG
00976 kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
00977 #endif
00978 return true;
00979 }
00980
00981 bool EncodingDetector::analyze(const QByteArray &data)
00982 {
00983 return analyze( data.data(), data.size() );
00984 }
00985
00986 bool EncodingDetector::analyze(const char *data, int len)
00987 {
00988
00989
00990
00991 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
00992 {
00993
00994 const uchar *udata = (const uchar *)data;
00995 uchar c1 = *udata++;
00996 uchar c2 = *udata++;
00997 uchar c3 = *udata++;
00998
00999
01000 const char *autoDetectedEncoding;
01001 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
01002 {
01003 autoDetectedEncoding = "ISO-10646-UCS-2";
01004 }
01005 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
01006 {
01007 autoDetectedEncoding = "UTF-8";
01008 }
01009 else if (c1 == 0x00 || c2 == 0x00)
01010 {
01011 uchar c4 = *udata++;
01012 uchar c5 = *udata++;
01013 uchar c6 = *udata++;
01014 uchar c7 = *udata++;
01015 uchar c8 = *udata++;
01016 uchar c9 = *udata++;
01017 uchar c10 = *udata++;
01018
01019 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
01020 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
01021 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
01022 autoDetectedEncoding = "ISO-10646-UCS-2";
01023 else
01024 autoDetectedEncoding = 0;
01025 }
01026 else
01027 {
01028 autoDetectedEncoding = 0;
01029 }
01030
01031
01032 if (autoDetectedEncoding != 0)
01033 {
01034 d->m_source = BOM;
01035 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
01036 assert(d->m_codec);
01037
01038 delete d->m_decoder;
01039 d->m_decoder = d->m_codec->makeDecoder();
01040 #ifdef DECODE_DEBUG
01041 kWarning() << "Detection by BOM";
01042 #endif
01043 if (is16Bit(d->m_codec) && c2==0x00)
01044 {
01045
01046 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
01047 d->m_decoder->toUnicode(reverseUtf16, 2);
01048 }
01049 return true;
01050 }
01051 }
01052
01053
01054 if (d->m_source==UserChosenEncoding)
01055 {
01056 #ifdef DECODE_DEBUG
01057 kWarning() << "EncodingDetector: UserChosenEncoding exit ";
01058 #endif
01059
01060 if (errorsIfUtf8(data, len))
01061 setEncoding("",DefaultEncoding);
01062 return true;
01063 }
01064 #if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
01065 if (!d->m_seenBody)
01066 {
01067
01068
01069
01070 const char *ptr = data;
01071 const char *pEnd = data+len;
01072
01073 while(ptr != pEnd)
01074 {
01075 if(*ptr!='<')
01076 {
01077 ++ptr;
01078 continue;
01079 }
01080 ++ptr;
01081
01082 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
01083 {
01084 ptr += 3;
01085 skipComment(ptr, pEnd);
01086 continue;
01087 }
01088
01089
01090 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
01091 {
01092 const char *end = ptr;
01093 while (*end != '>' && end < pEnd)
01094 end++;
01095 if (*end == '\0' || end == pEnd)
01096 break;
01097 QCString str(ptr, end - ptr + 1);
01098 int length;
01099 int pos = findXMLEncoding(str, length);
01100
01101 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
01102 {
01103 return true;
01104 }
01105 }
01106
01107
01108 while (
01109 !((*ptr >= 'a') && (*ptr <= 'z') ||
01110 (*ptr >= 'A') && (*ptr <= 'Z'))
01111 && ptr < pEnd
01112 )
01113 ++ptr;
01114
01115 char tmp[5];
01116 int length=0;
01117 const char* max=ptr+4;
01118 if (pEnd<max)
01119 max=pEnd;
01120 while (
01121 ((*ptr >= 'a') && (*ptr <= 'z') ||
01122 (*ptr >= 'A') && (*ptr <= 'Z') ||
01123 (*ptr >= '0') && (*ptr <= '9'))
01124 && ptr < max
01125 )
01126 {
01127 tmp[length] = tolower( *ptr );
01128 ++ptr;
01129 ++length;
01130 }
01131 tmp[length] = 0;
01132 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
01133 {
01134
01135 const char* end = ptr;
01136 while(*end != '>' && *end != '\0' && end<pEnd)
01137 end++;
01138
01139 QCString str( ptr, (end-ptr)+1);
01140 str = str.lower();
01141 int pos=0;
01142
01143
01144 if( (pos = str.find("charset")) == -1)
01145 continue;
01146 pos+=6;
01147
01148 if( (pos = str.find('=', pos)) == -1)
01149 continue;
01150
01151
01152 while (pos < (int)str.length() && str[pos] <= ' ')
01153 ++pos;
01154 if ( pos == (int)str.length())
01155 continue;
01156
01157 int endpos = pos;
01158 while( endpos < str.length() &&
01159 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
01160 && str[endpos] != ';' && str[endpos] != '>') )
01161 ++endpos;
01162 #ifdef DECODE_DEBUG
01163 kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
01164 #endif
01165 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
01166 return true;
01167 }
01168 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
01169 {
01170 d->m_seenBody=true;
01171 break;
01172 }
01173 }
01174 }
01175
01176 if (d->m_source==EncodingFromHTTPHeader)
01177 return true;
01178 #endif
01179
01180 if (len < 1)
01181 {
01182 setEncoding("",DefaultEncoding);
01183 return false;
01184 }
01185 #ifdef DECODE_DEBUG
01186 kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")";
01187 #endif
01188
01189 switch ( d->m_autoDetectLanguage )
01190 {
01191 case EncodingDetector::Arabic:
01192 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01193
01194 case EncodingDetector::Baltic:
01195 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01196
01197 case EncodingDetector::CentralEuropean:
01198 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
01199 break;
01200 case EncodingDetector::Cyrillic:
01201 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
01202
01203 case EncodingDetector::Greek:
01204 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
01205
01206 case EncodingDetector::Hebrew:
01207 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
01208
01209 case EncodingDetector::Japanese:
01210 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
01211
01212 case EncodingDetector::Turkish:
01213 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
01214
01215 case EncodingDetector::WesternEuropean:
01216 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
01217 return true;
01218 else if (d->m_defaultCodec->mibEnum()==MibLatin1)
01219 {
01220 return setEncoding("iso-8859-15",AutoDetectedEncoding);
01221 }
01222 else
01223 {
01224 return setEncoding("",DefaultEncoding);
01225 }
01226
01227 case EncodingDetector::SemiautomaticDetection:
01228 case EncodingDetector::ChineseSimplified:
01229 case EncodingDetector::ChineseTraditional:
01230 case EncodingDetector::Korean:
01231 case EncodingDetector::Thai:
01232 case EncodingDetector::Unicode:
01233 case EncodingDetector::NorthernSaami:
01234 case EncodingDetector::SouthEasternEurope:
01235 case EncodingDetector::None:
01236
01237
01238 break;
01239 }
01240
01241 setEncoding("",DefaultEncoding);
01242 return true;
01243 }
01244
01245
01246 EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const QString& lang)
01247 {
01248 if (lang.isEmpty())
01249 return EncodingDetector::None;
01250 else if (lang==i18n("@item Text character set", "Unicode"))
01251 return EncodingDetector::Unicode;
01252 else if (lang==i18n("@item Text character set", "Cyrillic"))
01253 return EncodingDetector::Cyrillic;
01254 else if (lang==i18n("@item Text character set", "Western European"))
01255 return EncodingDetector::WesternEuropean;
01256 else if (lang==i18n("@item Text character set", "Central European"))
01257 return EncodingDetector::CentralEuropean;
01258 else if (lang==i18n("@item Text character set", "Greek"))
01259 return EncodingDetector::Greek;
01260 else if (lang==i18n("@item Text character set", "Hebrew"))
01261 return EncodingDetector::Hebrew;
01262 else if (lang==i18n("@item Text character set", "Turkish"))
01263 return EncodingDetector::Turkish;
01264 else if (lang==i18n("@item Text character set", "Japanese"))
01265 return EncodingDetector::Japanese;
01266 else if (lang==i18n("@item Text character set", "Baltic"))
01267 return EncodingDetector::Baltic;
01268 else if (lang==i18n("@item Text character set", "Arabic"))
01269 return EncodingDetector::Arabic;
01270
01271 return EncodingDetector::None;
01272 }
01273
01274 bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
01275 {
01276 switch (script)
01277 {
01278 case EncodingDetector::Arabic:
01279 return true;
01280 case EncodingDetector::Baltic:
01281 return true;
01282 case EncodingDetector::CentralEuropean:
01283 return true;
01284 case EncodingDetector::Cyrillic:
01285 return true;
01286 case EncodingDetector::Greek:
01287 return true;
01288 case EncodingDetector::Hebrew:
01289 return true;
01290 case EncodingDetector::Japanese:
01291 return true;
01292 case EncodingDetector::Turkish:
01293 return true;
01294 case EncodingDetector::WesternEuropean:
01295 return true;
01296 case EncodingDetector::ChineseTraditional:
01297 return true;
01298 case EncodingDetector::ChineseSimplified:
01299 return true;
01300 case EncodingDetector::Unicode:
01301 return true;
01302 break;
01303 default:
01304 return false;
01305 }
01306 }
01307
01308 QString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
01309 {
01310 switch (script)
01311 {
01312 case EncodingDetector::Arabic:
01313 return i18n("@item Text character set", "Arabic");
01314 break;
01315 case EncodingDetector::Baltic:
01316 return i18n("@item Text character set", "Baltic");
01317 break;
01318 case EncodingDetector::CentralEuropean:
01319 return i18n("@item Text character set", "Central European");
01320 break;
01321 case EncodingDetector::Cyrillic:
01322 return i18n("@item Text character set", "Cyrillic");
01323 break;
01324 case EncodingDetector::Greek:
01325 return i18n("@item Text character set", "Greek");
01326 break;
01327 case EncodingDetector::Hebrew:
01328 return i18n("@item Text character set", "Hebrew");
01329 break;
01330 case EncodingDetector::Japanese:
01331 return i18n("@item Text character set", "Japanese");
01332 break;
01333 case EncodingDetector::Turkish:
01334 return i18n("@item Text character set", "Turkish");
01335 break;
01336 case EncodingDetector::WesternEuropean:
01337 return i18n("@item Text character set", "Western European");
01338 break;
01339 case EncodingDetector::ChineseTraditional:
01340 return i18n("@item Text character set", "Chinese Traditional");
01341 break;
01342 case EncodingDetector::ChineseSimplified:
01343 return i18n("@item Text character set", "Chinese Simplified");
01344 break;
01345 case EncodingDetector::Korean:
01346 return i18n("@item Text character set", "Korean");
01347 break;
01348 case EncodingDetector::Thai:
01349 return i18n("@item Text character set", "Thai");
01350 break;
01351 case EncodingDetector::Unicode:
01352 return i18n("@item Text character set", "Unicode");
01353 break;
01354
01355 default:
01356 return QString();
01357
01358 }
01359 }
01360
01361 EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const QString &lc)
01362 {
01363
01364
01365 const char *langStr = pango_script_for_lang[0].lang;
01366
01367 for ( int i = 0; langStr; i++ ) {
01368 langStr = pango_script_for_lang[i].lang;
01369
01370 if ( lc.startsWith( QString::fromAscii( langStr ) ) )
01371 return pango_script_for_lang[i].scripts[0];
01372 }
01373 return None;
01374 }
01375
01376 #undef DECODE_DEBUG
01377