kmail

encodingdetector.cpp

00001 /*
00002     This file was taken from the KDE 4.x libraries and backported to Qt 3.
00003 
00004     Copyright (C) 1999 Lars Knoll (knoll@kde.org)
00005     Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
00006     Copyright (C) 2003 Apple Computer, Inc.
00007     Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
00008 
00009     This library is free software; you can redistribute it and/or
00010     modify it under the terms of the GNU Library General Public
00011     License as published by the Free Software Foundation; either
00012     version 2 of the License, or (at your option) any later version.
00013 
00014     This library is distributed in the hope that it will be useful,
00015     but WITHOUT ANY WARRANTY; without even the implied warranty of
00016     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017     Library General Public License for more details.
00018 
00019     You should have received a copy of the GNU Library General Public License
00020     along with this library; see the file COPYING.LIB.  If not, write to
00021     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00022     Boston, MA 02110-1301, USA.
00023 */
00024 //----------------------------------------------------------------------------
00025 //
00026 // decoder for input stream
00027 
00028 #include "encodingdetector.h"
00029 
00030 #undef DECODE_DEBUG
00031 //#define DECODE_DEBUG
00032 
00033 #define MAX_BUFFER 16*1024
00034 
00035 #include <assert.h>
00036 #include <stdlib.h>
00037 
00038 #include "encodingdetector_ja_p.h"
00039 
00040 #include <qregexp.h>
00041 #include <qtextcodec.h>
00042 
00043 #include <kglobal.h>
00044 #include <kcharsets.h>
00045 #include <kdebug.h>
00046 #include <klocale.h>
00047 
00048 #include <ctype.h>
00049 
00050 // The following table was taken from libpango 1.19.3 and slightly modified.
00051 // Multiple scripts per language were removed and the entries were reordered so
00052 // that simple substring matching will work. For example, bam was put before ba
00053 // so that the first match will be likely the right match. Otherwise "ba" would
00054 // match "bam" but we would have to search on to find "bam" which is what we want.
00055 // The original file is called pango-script-lang-table.h
00056 
00057 /* pango-script-lang-table.h:
00058  * 
00059  * Generated by gen-script-for-lang-new.c
00060  * Date: 2007-10-26
00061  * Source: fontconfig-2.4.91
00062  * 
00063  * Do not edit. // I did. Sue me ;)
00064  */
00065 typedef struct _PangoScriptForLang {
00066   const char lang[6];
00067   EncodingDetector::AutoDetectScript scripts[1];
00068 } PangoScriptForLang;
00069 
// The table below keeps Pango's script names; these macros translate each of
// them to an EncodingDetector::AutoDetectScript value so the table rows did
// not have to be rewritten.

// Pango scripts that have a counterpart in EncodingDetector.
#define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic
#define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic
#define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope
#define PANGO_SCRIPT_GREEK EncodingDetector::Greek
#define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew
#define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean
#define PANGO_SCRIPT_THAI EncodingDetector::Thai

// Pango scripts EncodingDetector does not know about; they all collapse to
// EncodingDetector::None. The most notable omission (many speakers/literates)
// is the family of Indian scripts. Using EncodingDetector::CentralEuropean for
// the appropriate countries might also give better results in some cases.
#define PANGO_SCRIPT_ARMENIAN EncodingDetector::None
#define PANGO_SCRIPT_BENGALI EncodingDetector::None
#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
#define PANGO_SCRIPT_CHEROKEE EncodingDetector::None
#define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None
#define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None
#define PANGO_SCRIPT_GUJARATI EncodingDetector::None
#define PANGO_SCRIPT_GURMUKHI EncodingDetector::None
#define PANGO_SCRIPT_KANNADA EncodingDetector::None
#define PANGO_SCRIPT_KHMER EncodingDetector::None
#define PANGO_SCRIPT_LAO EncodingDetector::None
#define PANGO_SCRIPT_MALAYALAM EncodingDetector::None
#define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None
#define PANGO_SCRIPT_MYANMAR EncodingDetector::None
#define PANGO_SCRIPT_ORIYA EncodingDetector::None
#define PANGO_SCRIPT_SINHALA EncodingDetector::None
#define PANGO_SCRIPT_SYRIAC EncodingDetector::None
#define PANGO_SCRIPT_TAGALOG EncodingDetector::None
#define PANGO_SCRIPT_TAMIL EncodingDetector::None
#define PANGO_SCRIPT_TIBETAN EncodingDetector::None
#define PANGO_SCRIPT_TELUGU EncodingDetector::None
00106 
00107 
00108 static const PangoScriptForLang pango_script_for_lang[] = {
00109   { "aa",    { PANGO_SCRIPT_LATIN/*62*/ } },
00110   { "ab",    { PANGO_SCRIPT_CYRILLIC/*90*/ } },
00111   { "af",    { PANGO_SCRIPT_LATIN/*69*/ } },
00112   { "am",    { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
00113   { "ar",    { PANGO_SCRIPT_ARABIC/*125*/ } },
00114   { "as",    { PANGO_SCRIPT_BENGALI/*89*/ } },
00115   { "ast",   { PANGO_SCRIPT_LATIN/*66*/ } },
00116   { "ava",   { PANGO_SCRIPT_CYRILLIC/*67*/ } },
00117   { "ay",    { PANGO_SCRIPT_LATIN/*60*/ } },
00118   { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } },
00119   { "az",    { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } },
00120   { "bam",   { PANGO_SCRIPT_LATIN/*60*/ } },
00121   { "ba",    { PANGO_SCRIPT_CYRILLIC/*82*/ } },
00122   { "be",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
00123   { "bg",    { PANGO_SCRIPT_CYRILLIC/*60*/ } },
00124   { "bh",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00125   { "bho",   { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00126   { "bi",    { PANGO_SCRIPT_LATIN/*58*/ } },
00127   { "bin",   { PANGO_SCRIPT_LATIN/*76*/ } },
00128   { "bn",    { PANGO_SCRIPT_BENGALI/*89*/ } },
00129   { "bo",    { PANGO_SCRIPT_TIBETAN/*95*/ } },
00130   { "br",    { PANGO_SCRIPT_LATIN/*64*/ } },
00131   { "bs",    { PANGO_SCRIPT_LATIN/*62*/ } },
00132   { "bua",   { PANGO_SCRIPT_CYRILLIC/*70*/ } },
00133   { "ca",    { PANGO_SCRIPT_LATIN/*74*/ } },
00134   { "ce",    { PANGO_SCRIPT_CYRILLIC/*67*/ } },
00135   { "chm",   { PANGO_SCRIPT_CYRILLIC/*76*/ } },
00136   { "chr",   { PANGO_SCRIPT_CHEROKEE/*85*/ } },
00137   { "ch",    { PANGO_SCRIPT_LATIN/*58*/ } },
00138   { "co",    { PANGO_SCRIPT_LATIN/*84*/ } },
00139   { "cs",    { PANGO_SCRIPT_LATIN/*82*/ } },
00140   { "cu",    { PANGO_SCRIPT_CYRILLIC/*103*/ } },
00141   { "cv",    { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } },
00142   { "cy",    { PANGO_SCRIPT_LATIN/*78*/ } },
00143   { "da",    { PANGO_SCRIPT_LATIN/*70*/ } },
00144   { "de",    { PANGO_SCRIPT_LATIN/*59*/ } },
00145   { "dz",    { PANGO_SCRIPT_TIBETAN/*95*/ } },
00146   { "el",    { PANGO_SCRIPT_GREEK/*69*/ } },
00147   { "en",    { PANGO_SCRIPT_LATIN/*72*/ } },
00148   { "eo",    { PANGO_SCRIPT_LATIN/*64*/ } },
00149   { "es",    { PANGO_SCRIPT_LATIN/*66*/ } },
00150 //  { "et",    { PANGO_SCRIPT_LATIN/*64*/ } },
00151   { "et",    { EncodingDetector::Baltic } },
00152   { "eu",    { PANGO_SCRIPT_LATIN/*56*/ } },
00153   { "fa",    { PANGO_SCRIPT_ARABIC/*129*/ } },
00154   { "fi",    { PANGO_SCRIPT_LATIN/*62*/ } },
00155   { "fj",    { PANGO_SCRIPT_LATIN/*52*/ } },
00156   { "fo",    { PANGO_SCRIPT_LATIN/*68*/ } },
00157   { "fr",    { PANGO_SCRIPT_LATIN/*84*/ } },
00158   { "ful",   { PANGO_SCRIPT_LATIN/*62*/ } },
00159   { "fur",   { PANGO_SCRIPT_LATIN/*66*/ } },
00160   { "fy",    { PANGO_SCRIPT_LATIN/*75*/ } },
00161   { "ga",    { PANGO_SCRIPT_LATIN/*80*/ } },
00162   { "gd",    { PANGO_SCRIPT_LATIN/*70*/ } },
00163   { "gez",   { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
00164   { "gl",    { PANGO_SCRIPT_LATIN/*66*/ } },
00165   { "gn",    { PANGO_SCRIPT_LATIN/*70*/ } },
00166   { "gu",    { PANGO_SCRIPT_GUJARATI/*78*/ } },
00167   { "gv",    { PANGO_SCRIPT_LATIN/*54*/ } },
00168   { "ha",    { PANGO_SCRIPT_LATIN/*60*/ } },
00169   { "haw",   { PANGO_SCRIPT_LATIN/*62*/ } },
00170   { "he",    { PANGO_SCRIPT_HEBREW/*27*/ } },
00171   { "hi",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00172   { "ho",    { PANGO_SCRIPT_LATIN/*52*/ } },
00173   { "hr",    { PANGO_SCRIPT_LATIN/*62*/ } },
00174   { "hu",    { PANGO_SCRIPT_LATIN/*70*/ } },
00175   { "hy",    { PANGO_SCRIPT_ARMENIAN/*77*/ } },
00176   { "ia",    { PANGO_SCRIPT_LATIN/*52*/ } },
00177   { "ibo",   { PANGO_SCRIPT_LATIN/*58*/ } },
00178   { "id",    { PANGO_SCRIPT_LATIN/*54*/ } },
00179   { "ie",    { PANGO_SCRIPT_LATIN/*52*/ } },
00180   { "ik",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
00181   { "io",    { PANGO_SCRIPT_LATIN/*52*/ } },
00182   { "is",    { PANGO_SCRIPT_LATIN/*70*/ } },
00183   { "it",    { PANGO_SCRIPT_LATIN/*72*/ } },
00184   { "iu",    { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } },
00185 //  { "ja",    { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } },
00186   { "ja",    { EncodingDetector::Japanese } },
00187   { "kaa",   { PANGO_SCRIPT_CYRILLIC/*78*/ } },
00188   { "ka",    { PANGO_SCRIPT_GEORGIAN/*33*/ } },
00189   { "ki",    { PANGO_SCRIPT_LATIN/*56*/ } },
00190   { "kk",    { PANGO_SCRIPT_CYRILLIC/*77*/ } },
00191   { "kl",    { PANGO_SCRIPT_LATIN/*81*/ } },
00192   { "km",    { PANGO_SCRIPT_KHMER/*70*/ } },
00193   { "kn",    { PANGO_SCRIPT_KANNADA/*80*/ } },
00194 //  { "ko",    { PANGO_SCRIPT_HANGUL/*2443*/ } },
00195   { "ko",    { EncodingDetector::Korean } },
00196   { "kok",   { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00197   { "ks",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00198   { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } },
00199   { "ku",    { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } },
00200   { "kum",   { PANGO_SCRIPT_CYRILLIC/*66*/ } },
00201   { "kv",    { PANGO_SCRIPT_CYRILLIC/*70*/ } },
00202   { "kw",    { PANGO_SCRIPT_LATIN/*64*/ } },
00203   { "ky",    { PANGO_SCRIPT_CYRILLIC/*70*/ } },
00204   { "la",    { PANGO_SCRIPT_LATIN/*68*/ } },
00205   { "lb",    { PANGO_SCRIPT_LATIN/*75*/ } },
00206   { "lez",   { PANGO_SCRIPT_CYRILLIC/*67*/ } },
00207   { "ln",    { PANGO_SCRIPT_LATIN/*78*/ } },
00208   { "lo",    { PANGO_SCRIPT_LAO/*65*/ } },
00209 //  { "lt",    { PANGO_SCRIPT_LATIN/*70*/ } },
00210   { "lt",    { EncodingDetector::Baltic } },
00211 //  { "lv",    { PANGO_SCRIPT_LATIN/*78*/ } },
00212   { "lv",    { EncodingDetector::Baltic } },
00213   { "mg",    { PANGO_SCRIPT_LATIN/*56*/ } },
00214   { "mh",    { PANGO_SCRIPT_LATIN/*62*/ } },
00215   { "mi",    { PANGO_SCRIPT_LATIN/*64*/ } },
00216   { "mk",    { PANGO_SCRIPT_CYRILLIC/*42*/ } },
00217   { "ml",    { PANGO_SCRIPT_MALAYALAM/*78*/ } },
00218   { "mn",    { PANGO_SCRIPT_MONGOLIAN/*130*/ } },
00219   { "mo",    { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } },
00220   { "mr",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00221   { "mt",    { PANGO_SCRIPT_LATIN/*72*/ } },
00222   { "my",    { PANGO_SCRIPT_MYANMAR/*48*/ } },
00223   { "nb",    { PANGO_SCRIPT_LATIN/*70*/ } },
00224   { "nds",   { PANGO_SCRIPT_LATIN/*59*/ } },
00225   { "ne",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00226   { "nl",    { PANGO_SCRIPT_LATIN/*82*/ } },
00227   { "nn",    { PANGO_SCRIPT_LATIN/*76*/ } },
00228   { "no",    { PANGO_SCRIPT_LATIN/*70*/ } },
00229   { "nr",    { PANGO_SCRIPT_LATIN/*52*/ } },
00230   { "nso",   { PANGO_SCRIPT_LATIN/*58*/ } },
00231   { "ny",    { PANGO_SCRIPT_LATIN/*54*/ } },
00232   { "oc",    { PANGO_SCRIPT_LATIN/*70*/ } },
00233   { "om",    { PANGO_SCRIPT_LATIN/*52*/ } },
00234   { "or",    { PANGO_SCRIPT_ORIYA/*79*/ } },
00235   { "os",    { PANGO_SCRIPT_CYRILLIC/*66*/ } },
00236   { "pa",    { PANGO_SCRIPT_GURMUKHI/*63*/ } },
00237   { "pl",    { PANGO_SCRIPT_LATIN/*70*/ } },
00238   { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } },
00239   { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } },
00240   { "pt",    { PANGO_SCRIPT_LATIN/*82*/ } },
00241   { "rm",    { PANGO_SCRIPT_LATIN/*66*/ } },
00242   { "ro",    { PANGO_SCRIPT_LATIN/*62*/ } },
00243   { "ru",    { PANGO_SCRIPT_CYRILLIC/*66*/ } },
00244   { "sah",   { PANGO_SCRIPT_CYRILLIC/*76*/ } },
00245   { "sa",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
00246   { "sco",   { PANGO_SCRIPT_LATIN/*56*/ } },
00247   { "sel",   { PANGO_SCRIPT_CYRILLIC/*66*/ } },
00248   { "se",    { PANGO_SCRIPT_LATIN/*66*/ } },
00249   { "sh",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
00250   { "si",    { PANGO_SCRIPT_SINHALA/*77*/ } },
00251   { "sk",    { PANGO_SCRIPT_LATIN/*86*/ } },
00252   { "sl",    { PANGO_SCRIPT_LATIN/*62*/ } },
00253   { "sma",   { PANGO_SCRIPT_LATIN/*60*/ } },
00254   { "smj",   { PANGO_SCRIPT_LATIN/*60*/ } },
00255   { "smn",   { PANGO_SCRIPT_LATIN/*68*/ } },
00256   { "sms",   { PANGO_SCRIPT_LATIN/*80*/ } },
00257   { "sm",    { PANGO_SCRIPT_LATIN/*52*/ } },
00258   { "so",    { PANGO_SCRIPT_LATIN/*52*/ } },
00259   { "sq",    { PANGO_SCRIPT_LATIN/*56*/ } },
00260   { "sr",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
00261   { "ss",    { PANGO_SCRIPT_LATIN/*52*/ } },
00262   { "st",    { PANGO_SCRIPT_LATIN/*52*/ } },
00263   { "sv",    { PANGO_SCRIPT_LATIN/*68*/ } },
00264   { "sw",    { PANGO_SCRIPT_LATIN/*52*/ } },
00265   { "syr",   { PANGO_SCRIPT_SYRIAC/*45*/ } },
00266   { "ta",    { PANGO_SCRIPT_TAMIL/*48*/ } },
00267   { "te",    { PANGO_SCRIPT_TELUGU/*80*/ } },
00268   { "tg",    { PANGO_SCRIPT_CYRILLIC/*78*/ } },
00269   { "th",    { PANGO_SCRIPT_THAI/*86*/ } },
00270   { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
00271   { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
00272   { "tig",   { PANGO_SCRIPT_ETHIOPIC/*221*/ } },
00273   { "tk",    { PANGO_SCRIPT_CYRILLIC/*74*/ } },
00274   { "tl",    { PANGO_SCRIPT_TAGALOG/*19*/ } },
00275   { "tn",    { PANGO_SCRIPT_LATIN/*58*/ } },
00276   { "to",    { PANGO_SCRIPT_LATIN/*52*/ } },
00277 //  { "tr",    { PANGO_SCRIPT_LATIN/*70*/ } },
00278   { "tr",    { EncodingDetector::Turkish } },
00279   { "ts",    { PANGO_SCRIPT_LATIN/*52*/ } },
00280   { "tt",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
00281   { "tw",    { PANGO_SCRIPT_LATIN/*70*/ } },
00282   { "tyv",   { PANGO_SCRIPT_CYRILLIC/*70*/ } },
00283   { "ug",    { PANGO_SCRIPT_ARABIC/*125*/ } },
00284   { "uk",    { PANGO_SCRIPT_CYRILLIC/*72*/ } },
00285   { "ur",    { PANGO_SCRIPT_ARABIC/*145*/ } },
00286   { "uz",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
00287   { "ven",   { PANGO_SCRIPT_LATIN/*62*/ } },
00288   { "vi",    { PANGO_SCRIPT_LATIN/*186*/ } },
00289   { "vot",   { PANGO_SCRIPT_LATIN/*62*/ } },
00290   { "vo",    { PANGO_SCRIPT_LATIN/*54*/ } },
00291   { "wa",    { PANGO_SCRIPT_LATIN/*70*/ } },
00292   { "wen",   { PANGO_SCRIPT_LATIN/*76*/ } },
00293   { "wo",    { PANGO_SCRIPT_LATIN/*66*/ } },
00294   { "xh",    { PANGO_SCRIPT_LATIN/*52*/ } },
00295   { "yap",   { PANGO_SCRIPT_LATIN/*58*/ } },
00296   { "yi",    { PANGO_SCRIPT_HEBREW/*27*/ } },
00297   { "yo",    { PANGO_SCRIPT_LATIN/*114*/ } },
00298 //  { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } },
00299   { "zh-cn", { EncodingDetector::ChineseSimplified } },
00300 //  { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } },
00301   { "zh-hk", { EncodingDetector::ChineseTraditional } },
00302 //  { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } },
00303   { "zh-mo", { EncodingDetector::ChineseTraditional } },
00304 //  { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } },
00305   { "zh-sg", { EncodingDetector::ChineseSimplified } },
00306 //  { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } },
00307   { "zh-tw", { EncodingDetector::ChineseTraditional } },
00308   { "zu",    { PANGO_SCRIPT_LATIN/*52*/ } },
00309   { "\x00",    { EncodingDetector::None } }      //end mark
00310 };
00311 
// MIBenum numbers of the codecs this file needs to recognise; they are
// compared against QTextCodec::mibEnum() and passed to
// QTextCodec::codecForMib(). Listed in ascending numeric order.
enum MIB
{
    MibLatin1  = 4,     // ISO-8859-1
    Mib8859_8  = 85,    // ISO-8859-8
    MibUtf8    = 106,   // UTF-8
    MibUcs2    = 1000,  // UCS-2
    MibUtf16BE = 1013,  // UTF-16, big endian
    MibUtf16LE = 1014,  // UTF-16, little endian
    MibUtf16   = 1015   // UTF-16
};
00322 
00323 static bool is16Bit(QTextCodec* codec)
00324 {
00325     switch (codec->mibEnum())
00326     {
00327     case MibUtf16:
00328     case MibUtf16BE:
00329     case MibUtf16LE:
00330     case MibUcs2:
00331         return true;
00332     default:
00333         return false;
00334     }
00335 }
00336 
00337 class EncodingDetectorPrivate
00338 {
00339 public:
00340     QTextCodec *m_codec;
00341     QTextDecoder *m_decoder; // utf16
00342     QTextCodec *m_defaultCodec;
00343     QCString  m_storeDecoderName;
00344 
00345     EncodingDetector::EncodingChoiceSource m_source;
00346     EncodingDetector::AutoDetectScript m_autoDetectLanguage;
00347 
00348     bool m_visualRTL : 1;
00349     bool m_seenBody : 1;
00350     bool m_writtingHappened : 1;
00351     bool m_analyzeCalled : 1; //for decode()
00352     int m_multiByte;
00353 
00354     QCString m_bufferForDefferedEncDetection;
00355 
00356     EncodingDetectorPrivate()
00357             : m_codec(QTextCodec::codecForMib(MibLatin1))
00358             , m_decoder(m_codec->makeDecoder())
00359             , m_defaultCodec(m_codec)
00360             , m_source(EncodingDetector::DefaultEncoding)
00361             , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
00362             , m_visualRTL(false)
00363             , m_seenBody(false)
00364             , m_writtingHappened(false)
00365             , m_analyzeCalled(false)
00366             , m_multiByte(0)
00367     {
00368     }
00369 
00370     EncodingDetectorPrivate(QTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
00371             : m_codec(codec)
00372             , m_decoder(m_codec->makeDecoder())
00373             , m_defaultCodec(m_codec)
00374             , m_source(source)
00375             , m_autoDetectLanguage(script)
00376             , m_visualRTL(false)
00377             , m_seenBody(false)
00378             , m_writtingHappened(false)
00379             , m_analyzeCalled(false)
00380             , m_multiByte(0)
00381     {
00382     }
00383 
00384     ~EncodingDetectorPrivate()
00385     {
00386         delete m_decoder;
00387     }
00388 };
00389 
00390 
00391 static QCString automaticDetectionForArabic( const unsigned char* ptr, int size )
00392 {
00393     for ( int i = 0; i < size; ++i ) {
00394         if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
00395              || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
00396              || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
00397              || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
00398             return "cp1256";
00399         }
00400     }
00401 
00402     return "iso-8859-6";
00403 }
00404 
00405 static QCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
00406 {
00407     for ( int i = 0; i < size; ++i ) {
00408         if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
00409              return "cp1257";
00410 
00411         if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
00412             return "iso-8859-13";
00413     }
00414 
00415     return "iso-8859-13";
00416 }
00417 
00418 static QCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
00419 {
00420     QCString charset;
00421     for ( int i = 0; i < size; ++i ) {
00422         if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
00423             if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
00424                 return "ibm852";
00425 
00426             if ( i + 1 > size )
00427                 return "cp1250";
00428             else { // maybe ibm852 ?
00429                 charset = "cp1250";
00430                 continue;
00431             }
00432         }
00433         if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
00434             if ( i + 1 > size )
00435                 return "iso-8859-2";
00436             else {  // maybe ibm852 ?
00437                 if ( charset.isNull() )
00438                     charset = "iso-8859-2";
00439                 continue;
00440             }
00441         }
00442     }
00443 
00444     if ( charset.isNull() )
00445         charset = "iso-8859-3";
00446 
00447     return charset.data();
00448 }
00449 
00450 static QCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
00451 {
00452 #ifdef DECODE_DEBUG
00453         kWarning() << "EncodingDetector: Cyr heuristics";
00454 #endif
00455 
00456 //     if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
00457 //         return "utf8";
00458     int utf8_mark=0;
00459     int koi_score=0;
00460     int cp1251_score=0;
00461 
00462     int koi_st=0;
00463     int cp1251_st=0;
00464 
00465 //     int koi_na=0;
00466 //     int cp1251_na=0;
00467 
00468     int koi_o_capital=0;
00469     int koi_o=0;
00470     int cp1251_o_capital=0;
00471     int cp1251_o=0;
00472 
00473     int koi_a_capital=0;
00474     int koi_a=0;
00475     int cp1251_a_capital=0;
00476     int cp1251_a=0;
00477 
00478     int koi_s_capital=0;
00479     int koi_s=0;
00480     int cp1251_s_capital=0;
00481     int cp1251_s=0;
00482 
00483     int koi_i_capital=0;
00484     int koi_i=0;
00485     int cp1251_i_capital=0;
00486     int cp1251_i=0;
00487 
00488     int cp1251_small_range=0;
00489     int koi_small_range=0;
00490     int ibm866_small_range=0;
00491 
00492     int i;
00493     for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
00494     {
00495         if (ptr[i]>0xdf)
00496         {
00497             ++cp1251_small_range;
00498 
00499             if (ptr[i]==0xee)//small o
00500                 ++cp1251_o;
00501             else if (ptr[i]==0xe0)//small a
00502                 ++cp1251_a;
00503             else if (ptr[i]==0xe8)//small i
00504                 ++cp1251_i;
00505             else if (ptr[i]==0xf1)//small s
00506                 ++cp1251_s;
00507             else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
00508                 ++cp1251_st;
00509 
00510             else if (ptr[i]==0xef)
00511                 ++koi_o_capital;
00512             else if (ptr[i]==0xe1)
00513                 ++koi_a_capital;
00514             else if (ptr[i]==0xe9)
00515                 ++koi_i_capital;
00516             else if (ptr[i]==0xf3)
00517                 ++koi_s_capital;
00518 
00519         }
00520         else if (ptr[i]>0xbf)
00521         {
00522             ++koi_small_range;
00523 
00524             if (ptr[i]==0xd0||ptr[i]==0xd1)//small o
00525                 ++utf8_mark;
00526             else if (ptr[i]==0xcf)//small o
00527                 ++koi_o;
00528             else if (ptr[i]==0xc1)//small a
00529                 ++koi_a;
00530             else if (ptr[i]==0xc9)//small i
00531                 ++koi_i;
00532             else if (ptr[i]==0xd3)//small s
00533                 ++koi_s;
00534             else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
00535                 ++koi_st;
00536 
00537             else if (ptr[i]==0xce)
00538                 ++cp1251_o_capital;
00539             else if (ptr[i]==0xc0)
00540                 ++cp1251_a_capital;
00541             else if (ptr[i]==0xc8)
00542                 ++cp1251_i_capital;
00543             else if (ptr[i]==0xd1)
00544                 ++cp1251_s_capital;
00545         }
00546         else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60%
00547             ++ibm866_small_range;
00548 
00549     }
00550 
00551     //cannot decide?
00552     if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
00553     {
00554         return "";
00555     }
00556 
00557     if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
00558     {
00559 #ifdef DECODE_DEBUG
00560         kWarning() << "Cyr Enc Detection: UTF8";
00561 #endif
00562         return "UTF-8";
00563     }
00564 
00565     if (ibm866_small_range>cp1251_small_range+koi_small_range)
00566         return "ibm866";
00567 
00568 //     QCString koi_string = "koi8-u";
00569 //     QCString cp1251_string = "cp1251";
00570 
00571     if (cp1251_st==0 && koi_st>1)
00572         koi_score+=10;
00573     else if (koi_st==0 && cp1251_st>1)
00574         cp1251_score+=10;
00575 
00576     if (cp1251_st && koi_st)
00577     {
00578         if (cp1251_st/koi_st>2)
00579             cp1251_score+=20;
00580         else if (koi_st/cp1251_st>2)
00581             koi_score+=20;
00582     }
00583 
00584     if (cp1251_a>koi_a)
00585         cp1251_score+=10;
00586     else if (cp1251_a || koi_a)
00587         koi_score+=10;
00588 
00589     if (cp1251_o>koi_o)
00590         cp1251_score+=10;
00591     else if (cp1251_o || koi_o)
00592         koi_score+=10;
00593 
00594     if (cp1251_i>koi_i)
00595         cp1251_score+=10;
00596     else if (cp1251_i || koi_i)
00597         koi_score+=10;
00598 
00599     if (cp1251_s>koi_s)
00600         cp1251_score+=10;
00601     else if (cp1251_s || koi_s)
00602         koi_score+=10;
00603 
00604     if (cp1251_a_capital>koi_a_capital)
00605         cp1251_score+=9;
00606     else if (cp1251_a_capital || koi_a_capital)
00607         koi_score+=9;
00608 
00609     if (cp1251_o_capital>koi_o_capital)
00610         cp1251_score+=9;
00611     else if (cp1251_o_capital || koi_o_capital)
00612         koi_score+=9;
00613 
00614     if (cp1251_i_capital>koi_i_capital)
00615         cp1251_score+=9;
00616     else if (cp1251_i_capital || koi_i_capital)
00617         koi_score+=9;
00618 
00619     if (cp1251_s_capital>koi_s_capital)
00620         cp1251_score+=9;
00621     else if (cp1251_s_capital || koi_s_capital)
00622         koi_score+=9;
00623 #ifdef DECODE_DEBUG
00624     kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
00625 #endif
00626     if (abs(koi_score-cp1251_score)<10)
00627     {
00628         //fallback...
00629         cp1251_score=cp1251_small_range;
00630         koi_score=koi_small_range;
00631     }
00632     if (cp1251_score>koi_score)
00633         return "cp1251";
00634     else
00635         return "koi8-u";
00636 
00637 
00638 //     if (cp1251_score>koi_score)
00639 //         setEncoding("cp1251",AutoDetectedEncoding);
00640 //     else
00641 //         setEncoding("koi8-u",AutoDetectedEncoding);
00642 //     return true;
00643 
00644 }
00645 
00646 static QCString automaticDetectionForGreek( const unsigned char* ptr, int size )
00647 {
00648     for ( int i = 0; i < size; ++i ) {
00649         if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
00650              || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
00651              || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
00652             return "cp1253";
00653         }
00654     }
00655 
00656     return "iso-8859-7";
00657 }
00658 
00659 static QCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
00660 {
00661     for ( int i = 0; i < size; ++i ) {
00662         if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
00663              || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
00664              || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
00665             return "cp1255";
00666         }
00667 
00668         if ( ptr[ i ] == 0xDF )
00669             return "iso-8859-8-i";
00670     }
00671 
00672     return "iso-8859-8-i";
00673 }
00674 
00675 static QCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
00676 {
00677     JapaneseCode kc;
00678 
00679     switch ( kc.guess_jp( (const char*)ptr, size ) ) {
00680     case JapaneseCode::JIS:
00681         return "jis7";
00682     case JapaneseCode::EUC:
00683         return "eucjp";
00684     case JapaneseCode::SJIS:
00685         return "sjis";
00686      case JapaneseCode::UTF8:
00687         return "utf8";
00688     default:
00689         break;
00690     }
00691 
00692     return "";
00693 }
00694 
00695 static QCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
00696 {
00697     for ( int i = 0; i < size; ++i ) {
00698         if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
00699             return "cp1254";
00700         }
00701     }
00702 
00703     return "iso-8859-9";
00704 }
00705 
00706 static QCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
00707 {
00708     uint nonansi_count=0;
00709     for (int i=0; i<size; ++i)
00710     {
00711         if (ptr[i]>0x79)
00712         {
00713              ++nonansi_count;
00714             if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
00715             {
00716                 return "UTF-8";
00717             }
00718             if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
00719             {
00720                 return "cp1252";
00721             }
00722         }
00723 
00724     }
00725 
00726     if (nonansi_count>0)
00727         return "iso-8859-15";
00728 
00729     return "";
00730 }
00731 
00732 // Other browsers allow comments in the head section, so we need to also.
00733 // It's important not to look for tags inside the comments.
// Advances `ptr` past an HTML comment.
//
// Other browsers allow comments in the head section, so we need to also, and
// it is important not to look for tags inside the comments.
//
//  ptr  : in/out. On entry points at the first character of the comment body
//         (or at '>' for the degenerate "<!-->" form). On return it points
//         just after the terminator, or equals pEnd when the comment is not
//         terminated inside the buffer.
//  pEnd : one past the last readable character.
//
// Accepted terminators are "-->" and, for compatibility with other browsers,
// "--!>". No byte at or beyond pEnd is ever read, and ptr is never moved
// beyond pEnd; an empty range leaves ptr untouched.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    // Nothing readable: do not even look at *p.
    if (p >= pEnd)
        return;

    // Allow <!-->; other browsers do.
    if (*p=='>')
    {
        p++;
    }
    else
    {
        while (p < pEnd)
        {
            if (*p=='-')
            {
                // This is the real end of comment, "-->".
                if (pEnd - p >= 3 && p[1]=='-' && p[2]=='>')
                {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (pEnd - p >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>')
                {
                    p += 4;
                    break;
                }
            }
            p++;
        }
    }
    ptr=p;
}
00766 
00767 // Returns the position of the encoding string.
00768 static int findXMLEncoding(const QCString &str, int &encodingLength)
00769 {
00770     int len = str.length();
00771     int pos = str.find("encoding");
00772     if (pos == -1)
00773         return -1;
00774     pos += 8;
00775 
00776     // Skip spaces and stray control characters.
00777     while (pos<len && str[pos]<=' ')
00778         ++pos;
00779 
00780     //Bail out if nothing after
00781     // Skip equals sign.
00782     if (pos>=len || str[pos] != '=')
00783         return -1;
00784     ++pos;
00785 
00786     // Skip spaces and stray control characters.
00787     while (pos<len && str[pos]<=' ')
00788         ++pos;
00789 
00790     //Bail out if nothing after
00791     if (pos >= len)
00792         return -1;
00793 
00794     // Skip quotation mark.
00795     char quoteMark = str[pos];
00796     if (quoteMark != '"' && quoteMark != '\'')
00797         return -1;
00798     ++pos;
00799 
00800     // Find the trailing quotation mark.
00801     int end=pos;
00802     while (end<len && str[end]!=quoteMark)
00803         ++end;
00804 
00805     if (end>=len)
00806         return -1;
00807 
00808     encodingLength = end-pos;
00809     return pos;
00810 }
00811 
00812 
00813 bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
00814 {
00815     if (d->m_codec->mibEnum()!=MibUtf8)
00816         return false; //means no errors
00817 // #define highest1Bits (unsigned char)0x80
00818 // #define highest2Bits (unsigned char)0xC0
00819 // #define highest3Bits (unsigned char)0xE0
00820 // #define highest4Bits (unsigned char)0xF0
00821 // #define highest5Bits (unsigned char)0xF8
00822 static const unsigned char highest1Bits = 0x80;
00823 static const unsigned char highest2Bits = 0xC0;
00824 static const unsigned char highest3Bits = 0xE0;
00825 static const unsigned char highest4Bits = 0xF0;
00826 static const unsigned char highest5Bits = 0xF8;
00827 
00828     for (int i=0; i<length; ++i)
00829     {
00830         unsigned char c = data[i];
00831 
00832         if (d->m_multiByte>0)
00833         {
00834             if ((c & highest2Bits) == 0x80)
00835             {
00836                 --(d->m_multiByte);
00837                 continue;
00838             }
00839 #ifdef DECODE_DEBUG
00840             kWarning() << "EncDetector: Broken UTF8";
00841 #endif
00842             return true;
00843         }
00844 
00845         // most significant bit zero, single char
00846         if ((c & highest1Bits) == 0x00)
00847             continue;
00848 
00849         // 110xxxxx => init 1 following bytes
00850         if ((c & highest3Bits) == 0xC0)
00851         {
00852             d->m_multiByte = 1;
00853             continue;
00854         }
00855 
00856         // 1110xxxx => init 2 following bytes
00857         if ((c & highest4Bits) == 0xE0)
00858         {
00859             d->m_multiByte = 2;
00860             continue;
00861         }
00862 
00863         // 11110xxx => init 3 following bytes
00864         if ((c & highest5Bits) == 0xF0)
00865         {
00866             d->m_multiByte = 3;
00867             continue;
00868         }
00869 #ifdef DECODE_DEBUG
00870         kWarning() << "EncDetector:_Broken UTF8";
00871 #endif
00872         return true;
00873     }
00874     return false;
00875 }
00876 
00877 EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
00878 {
00879 }
00880 
00881 EncodingDetector::EncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
00882     d(new EncodingDetectorPrivate(codec,source,script))
00883 {
00884 }
00885 
00886 EncodingDetector::~EncodingDetector()
00887 {
00888     delete d;
00889 }
00890 
00891 void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
00892 {
00893     d->m_autoDetectLanguage=lang;
00894 }
00895 EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
00896 {
00897     return d->m_autoDetectLanguage;
00898 }
00899 
00900 EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
00901 {
00902     return d->m_source;
00903 }
00904 
00905 const char* EncodingDetector::encoding() const
00906 {
00907     d->m_storeDecoderName = d->m_codec->name();
00908     return d->m_storeDecoderName.data();
00909 }
00910 
00911 bool EncodingDetector::visuallyOrdered() const
00912 {
00913     return d->m_visualRTL;
00914 }
00915 
00916 // const QTextCodec* EncodingDetector::codec() const
00917 // {
00918 //     return d->m_codec;
00919 // }
00920 
00921 QTextDecoder* EncodingDetector::decoder()
00922 {
00923     return d->m_decoder;
00924 }
00925 
00926 bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
00927 {
00928     QTextCodec *codec;
00929     QCString enc(_encoding);
00930     if(/*enc.isNull() || */enc.isEmpty())
00931     {
00932         if (type==DefaultEncoding)
00933             codec=d->m_defaultCodec;
00934         else
00935             return false;
00936     }
00937     else
00938     {
00939         //QString->QTextCodec
00940 
00941         enc = enc.lower();
00942          // hebrew visually ordered
00943         if(enc=="visual")
00944             enc="iso8859-8";
00945         bool b;
00946         codec = KGlobal::charsets()->codecForName(enc, b);
00947         if (!b)
00948         return false;
00949     }
00950 
00951     if (d->m_codec->mibEnum()==codec->mibEnum())
00952         return true;
00953 
00954     if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
00955     {
00956         //Sometimes the codec specified is absurd, i.e. UTF-16 despite
00957         //us decoding a meta tag as ASCII. In that case, ignore it.
00958         return false;
00959     }
00960 
00961     if (codec->mibEnum() == Mib8859_8)
00962     {
00963         //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
00964         codec = QTextCodec::codecForName("iso8859-8-i");
00965 
00966         // visually ordered unless one of the following
00967         if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
00968             d->m_visualRTL = true;
00969     }
00970 
00971     d->m_codec = codec;
00972     d->m_source = type;
00973     delete d->m_decoder;
00974     d->m_decoder = d->m_codec->makeDecoder();
00975 #ifdef DECODE_DEBUG
00976     kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
00977 #endif
00978     return true;
00979 }
00980 
00981 bool EncodingDetector::analyze(const QByteArray &data)
00982 {
00983     return analyze( data.data(), data.size() );
00984 }
00985 
00986 bool EncodingDetector::analyze(const char *data, int len)
00987 {
00988     // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
00989     // maximumBOMLength = 10
00990     // Even if the user has chosen utf16 we still need to auto-detect the endianness
00991     if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
00992     {
00993         // Extract the first three bytes.
00994         const uchar *udata = (const uchar *)data;
00995         uchar c1 = *udata++;
00996         uchar c2 = *udata++;
00997         uchar c3 = *udata++;
00998 
00999         // Check for the BOM
01000         const char *autoDetectedEncoding;
01001         if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
01002         {
01003             autoDetectedEncoding = "ISO-10646-UCS-2";
01004         }
01005         else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
01006         {
01007             autoDetectedEncoding = "UTF-8";
01008         }
01009         else if (c1 == 0x00 || c2 == 0x00)
01010         {
01011             uchar c4 = *udata++;
01012             uchar c5 = *udata++;
01013             uchar c6 = *udata++;
01014             uchar c7 = *udata++;
01015             uchar c8 = *udata++;
01016             uchar c9 = *udata++;
01017             uchar c10 = *udata++;
01018 
01019             int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
01020             int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
01021             if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
01022                 autoDetectedEncoding = "ISO-10646-UCS-2";
01023             else
01024                 autoDetectedEncoding = 0;
01025         }
01026         else
01027         {
01028             autoDetectedEncoding = 0;
01029         }
01030 
01031         // If we found a BOM, use the encoding it implies.
01032         if (autoDetectedEncoding != 0)
01033         {
01034             d->m_source = BOM;
01035             d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
01036             assert(d->m_codec);
01037             //enc = d->m_codec->name();
01038             delete d->m_decoder;
01039             d->m_decoder = d->m_codec->makeDecoder();
01040 #ifdef DECODE_DEBUG
01041             kWarning() << "Detection by BOM";
01042 #endif
01043             if (is16Bit(d->m_codec) && c2==0x00)
01044             {
01045                 // utf16LE, we need to put the decoder in LE mode
01046                 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
01047                 d->m_decoder->toUnicode(reverseUtf16, 2);
01048             }
01049             return true;
01050         }
01051     }
01052 
01053     //exit from routine in case it was called to only detect byte order for utf-16
01054     if (d->m_source==UserChosenEncoding)
01055     {
01056 #ifdef DECODE_DEBUG
01057         kWarning() << "EncodingDetector: UserChosenEncoding exit ";
01058 #endif
01059 
01060         if (errorsIfUtf8(data, len))
01061             setEncoding("",DefaultEncoding);
01062         return true;
01063     }
01064 #if 0  //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
01065     if (!d->m_seenBody)
01066     {
01067         // we still don't have an encoding, and are in the head
01068         // the following tags are allowed in <head>:
01069         // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
01070         const char *ptr = data;
01071         const char *pEnd = data+len;
01072 
01073         while(ptr != pEnd)
01074         {
01075             if(*ptr!='<')
01076             {
01077                 ++ptr;
01078                 continue;
01079             }
01080             ++ptr;
01081             // Handle comments.
01082             if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
01083             {
01084                 ptr += 3;
01085                 skipComment(ptr, pEnd);
01086                 continue;
01087             }
01088 
01089             // Handle XML header, which can have encoding in it.
01090             if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
01091             {
01092                 const char *end = ptr;
01093                 while (*end != '>' && end < pEnd)
01094                     end++;
01095                 if (*end == '\0' || end == pEnd)
01096                     break;
01097                 QCString str(ptr, end - ptr + 1);
01098                 int length;
01099                 int pos = findXMLEncoding(str, length);
01100                 // also handles the case when specified encoding aint correct
01101                 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
01102                 {
01103                     return true;
01104                 }
01105             }
01106 
01107             //look for <meta>, stop if we reach <body>
01108             while (
01109                         !((*ptr >= 'a') && (*ptr <= 'z') ||
01110                         (*ptr >= 'A') && (*ptr <= 'Z'))
01111                         && ptr < pEnd
01112                 )
01113                 ++ptr;
01114 
01115             char tmp[5];
01116             int length=0;
01117             const char* max=ptr+4;
01118             if (pEnd<max)
01119                 max=pEnd;
01120             while (
01121                         ((*ptr >= 'a') && (*ptr <= 'z') ||
01122                         (*ptr >= 'A') && (*ptr <= 'Z') ||
01123                         (*ptr >= '0') && (*ptr <= '9'))
01124                         && ptr < max
01125                 )
01126             {
01127                 tmp[length] = tolower( *ptr );
01128                 ++ptr;
01129                 ++length;
01130             }
01131             tmp[length] = 0;
01132             if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
01133             {
01134                 // found a meta tag...
01135                 const char* end = ptr;
01136                 while(*end != '>' && *end != '\0' && end<pEnd)
01137                     end++;
01138                 //if ( *end == '\0' ) break;
01139                 QCString str( ptr, (end-ptr)+1);
01140                 str = str.lower();
01141                 int pos=0;
01142                         //if( (pos = str.find("http-equiv", pos)) == -1) break;
01143                         //if( (pos = str.find("content-type", pos)) == -1) break;
01144                 if( (pos = str.find("charset")) == -1)
01145                     continue;
01146                 pos+=6;
01147                 // skip to '='
01148                 if( (pos = str.find('=', pos)) == -1)
01149                     continue;
01150 
01151                 // skip whitespace before encoding itself
01152                 while (pos < (int)str.length() && str[pos] <= ' ')
01153                     ++pos;
01154                 if ( pos == (int)str.length())
01155                     continue;
01156 
01157                 int endpos = pos;
01158                 while( endpos < str.length() &&
01159                         (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
01160                                     && str[endpos] != ';' && str[endpos] != '>') )
01161                     ++endpos;
01162     #ifdef DECODE_DEBUG
01163                 kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
01164     #endif
01165                 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
01166                     return true;
01167             }
01168             else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
01169             {
01170                 d->m_seenBody=true;
01171                 break;
01172             }
01173         }
01174     }
01175 
01176     if (d->m_source==EncodingFromHTTPHeader)
01177         return true;
01178 #endif
01179     //if (len<20)     //make a guess even if the file is short -- ahartmetz
01180     if (len < 1)
01181     {
01182         setEncoding("",DefaultEncoding);
01183         return false;
01184     }
01185 #ifdef DECODE_DEBUG
01186     kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")";
01187 #endif
01188 
01189     switch ( d->m_autoDetectLanguage )
01190     {
01191         case EncodingDetector::Arabic:
01192             return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01193 //             break;
01194         case EncodingDetector::Baltic:
01195             return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01196 //             break;
01197         case EncodingDetector::CentralEuropean:
01198             return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
01199             break;
01200         case EncodingDetector::Cyrillic:
01201             return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
01202 //             break;
01203         case EncodingDetector::Greek:
01204             return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
01205 //             break;
01206         case EncodingDetector::Hebrew:
01207             return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
01208 //             break;
01209         case EncodingDetector::Japanese:
01210             return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
01211 //             break;
01212         case EncodingDetector::Turkish:
01213             return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
01214 //             break;
01215         case EncodingDetector::WesternEuropean:
01216             if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
01217                 return true;
01218             else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml
01219             {
01220                 return setEncoding("iso-8859-15",AutoDetectedEncoding);
01221             }
01222             else //use default provided by eg katepart
01223             {
01224                 return setEncoding("",DefaultEncoding);
01225             }
01226 //             break;
01227         case EncodingDetector::SemiautomaticDetection:
01228         case EncodingDetector::ChineseSimplified:
01229         case EncodingDetector::ChineseTraditional:
01230         case EncodingDetector::Korean:
01231         case EncodingDetector::Thai:
01232         case EncodingDetector::Unicode:
01233         case EncodingDetector::NorthernSaami:
01234         case EncodingDetector::SouthEasternEurope:
01235         case EncodingDetector::None:
01236             // huh. somethings broken in this code ### FIXME
01237             //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
01238             break;
01239         }
01240 
01241         setEncoding("",DefaultEncoding);
01242         return true;
01243 }
01244 
01245 
01246 EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const QString& lang)
01247 {
01248     if (lang.isEmpty())
01249         return EncodingDetector::None;
01250     else if (lang==i18n("@item Text character set", "Unicode"))
01251         return EncodingDetector::Unicode;
01252     else if (lang==i18n("@item Text character set", "Cyrillic"))
01253         return EncodingDetector::Cyrillic;
01254     else if (lang==i18n("@item Text character set", "Western European"))
01255         return EncodingDetector::WesternEuropean;
01256     else if (lang==i18n("@item Text character set", "Central European"))
01257         return EncodingDetector::CentralEuropean;
01258     else if (lang==i18n("@item Text character set", "Greek"))
01259         return EncodingDetector::Greek;
01260     else if (lang==i18n("@item Text character set", "Hebrew"))
01261         return EncodingDetector::Hebrew;
01262     else if (lang==i18n("@item Text character set", "Turkish"))
01263         return EncodingDetector::Turkish;
01264     else if (lang==i18n("@item Text character set", "Japanese"))
01265         return EncodingDetector::Japanese;
01266     else if (lang==i18n("@item Text character set", "Baltic"))
01267         return EncodingDetector::Baltic;
01268     else if (lang==i18n("@item Text character set", "Arabic"))
01269         return EncodingDetector::Arabic;
01270 
01271     return EncodingDetector::None;
01272 }
01273 
01274 bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
01275 {
01276     switch (script)
01277     {
01278         case EncodingDetector::Arabic:
01279             return true;
01280         case EncodingDetector::Baltic:
01281             return true;
01282         case EncodingDetector::CentralEuropean:
01283             return true;
01284         case EncodingDetector::Cyrillic:
01285             return true;
01286         case EncodingDetector::Greek:
01287             return true;
01288         case EncodingDetector::Hebrew:
01289             return true;
01290         case EncodingDetector::Japanese:
01291             return true;
01292         case EncodingDetector::Turkish:
01293             return true;
01294         case EncodingDetector::WesternEuropean:
01295             return true;
01296         case EncodingDetector::ChineseTraditional:
01297             return true;
01298         case EncodingDetector::ChineseSimplified:
01299             return true;
01300         case EncodingDetector::Unicode:
01301             return true;
01302             break;
01303         default:
01304             return false;
01305     }
01306 }
01307 
01308 QString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
01309 {
01310     switch (script)
01311     {
01312         case EncodingDetector::Arabic:
01313             return i18n("@item Text character set", "Arabic");
01314             break;
01315         case EncodingDetector::Baltic:
01316             return i18n("@item Text character set", "Baltic");
01317             break;
01318         case EncodingDetector::CentralEuropean:
01319             return i18n("@item Text character set", "Central European");
01320             break;
01321         case EncodingDetector::Cyrillic:
01322             return i18n("@item Text character set", "Cyrillic");
01323             break;
01324         case EncodingDetector::Greek:
01325             return i18n("@item Text character set", "Greek");
01326             break;
01327         case EncodingDetector::Hebrew:
01328             return i18n("@item Text character set", "Hebrew");
01329             break;
01330         case EncodingDetector::Japanese:
01331             return i18n("@item Text character set", "Japanese");
01332             break;
01333         case EncodingDetector::Turkish:
01334             return i18n("@item Text character set", "Turkish");
01335             break;
01336         case EncodingDetector::WesternEuropean:
01337             return i18n("@item Text character set", "Western European");
01338             break;
01339         case EncodingDetector::ChineseTraditional:
01340             return i18n("@item Text character set", "Chinese Traditional");
01341             break;
01342         case EncodingDetector::ChineseSimplified:
01343             return i18n("@item Text character set", "Chinese Simplified");
01344             break;
01345         case EncodingDetector::Korean:
01346             return i18n("@item Text character set", "Korean");
01347             break;
01348         case EncodingDetector::Thai:
01349             return i18n("@item Text character set", "Thai");
01350             break;
01351         case EncodingDetector::Unicode:
01352             return i18n("@item Text character set", "Unicode");
01353             break;
01354         //case EncodingDetector::SemiautomaticDetection:
01355         default:
01356             return QString();
01357 
01358         }
01359 }
01360 
01361 EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const QString &lc)
01362 {
01363   // It might make sense to do something special if the locale ends with
01364   // ".UTF-8" or "@utf8"
01365   const char *langStr = pango_script_for_lang[0].lang;
01366   // There is obvious optimization potential...
01367   for ( int i = 0; langStr; i++ ) {
01368      langStr = pango_script_for_lang[i].lang;
01369      // startsWith() works for empty strings: every string "starts with" an empty string.
01370      if ( lc.startsWith( QString::fromAscii( langStr ) ) )
01371        return pango_script_for_lang[i].scripts[0];
01372   }
01373   return None;
01374 }
01375 
01376 #undef DECODE_DEBUG
01377
kmail

encodingdetector.cpp

kmail

API Reference