00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include "kcharsets.h"
00020
00021 #include "kqiodevicegzip_p.h"
00022 #include "kentities.c"
00023
00024 #include <kapplication.h>
00025 #include <kglobal.h>
00026 #include <klocale.h>
00027 #include <kconfig.h>
00028
00029 #include <qfontinfo.h>
00030 #include <qstrlist.h>
00031 #include <qfontdatabase.h>
00032 #include <kdebug.h>
00033
00034 #include <qtextcodec.h>
00035 #include <qmap.h>
00036 #include <qcstring.h>
00037 #include <qdir.h>
00038 #include <qregexp.h>
00039
00040 #include <assert.h>
00041
00042 static const char * const language_names[] = {
00043 I18N_NOOP( "Other" ),
00044 I18N_NOOP( "Arabic" ),
00045 I18N_NOOP( "Baltic" ),
00046 I18N_NOOP( "Central European" ),
00047 I18N_NOOP( "Chinese Simplified" ),
00048 I18N_NOOP( "Chinese Traditional" ),
00049 I18N_NOOP( "Cyrillic" ),
00050 I18N_NOOP( "Greek" ),
00051 I18N_NOOP( "Hebrew" ),
00052 I18N_NOOP( "Japanese" ),
00053 I18N_NOOP( "Korean" ),
00054 I18N_NOOP( "Thai" ),
00055 I18N_NOOP( "Turkish" ),
00056 I18N_NOOP( "Western European" ),
00057 I18N_NOOP( "Tamil" ),
00058 I18N_NOOP( "Unicode" ),
00059 I18N_NOOP( "Northern Saami" ),
00060 I18N_NOOP( "Vietnamese" ),
00061 I18N_NOOP( "South-Eastern Europe" )
00062 };
00063
00064
00065
// Encoding names returned by KCharsets::availableEncodingNames().
// The list is terminated by a null pointer.
static const char* const charsets_for_encoding[] = {
    // KOI8
    "koi8-r", "koi8-u",
    // ISO 8859 family
    "iso 8859-1", "iso 8859-2", "iso 8859-3", "iso 8859-4",
    "iso 8859-5", "iso 8859-6", "iso 8859-7", "iso 8859-8",
    "iso 8859-8-i", "iso 8859-9", "iso 8859-11", "iso 8859-13",
    "iso 8859-14", "iso 8859-15", "iso 8859-16",
    // Unicode
    "utf8", "utf16", "iso-10646-ucs-2",
    // "cp 125x" code pages
    "cp 1250", "cp 1251", "cp 1252", "cp 1253", "cp 1254",
    "cp 1255", "cp 1256", "cp 1257", "cp 1258",
    // "ibm" code pages
    "ibm850", "ibm852", "ibm866",
    "tis620",
    "eucjp", "sjis", "jis7",
    "big5", "big5-hkscs",
    "gbk", "gb18030", "gb2312",
    "euckr",
    "tscii",
    "winsami2",
    "cp 874",
    0   // end-of-list sentinel
};
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
// One row of the encoding -> language-group table.
struct LanguageForEncoding
{
    const char* index;  // encoding name; a null pointer marks the end of the table
    int data;           // position of the language group in language_names[]
};

// Maps each encoding name to the language group it is listed under.
// The trailing comment on each row is the language_names[] entry the
// number selects.  Terminated by a { 0, 0 } row.
static const struct LanguageForEncoding language_for_encoding[] = {
    { "iso 8859-1",      13 },  // Western European
    { "iso 8859-15",     13 },  // Western European
    { "iso 8859-14",     13 },  // Western European
    { "cp 1252",         13 },  // Western European
    { "ibm850",          13 },  // Western European
    { "iso 8859-2",       3 },  // Central European
    { "iso 8859-3",       3 },  // Central European
    { "iso 8859-4",       2 },  // Baltic
    { "iso 8859-13",      2 },  // Baltic
    { "iso 8859-16",     18 },  // South-Eastern Europe
    { "cp 1250",          3 },  // Central European
    { "cp 1254",         12 },  // Turkish
    { "cp 1257",          2 },  // Baltic
    { "ibm852",           3 },  // Central European
    { "koi8-r",           6 },  // Cyrillic
    { "iso 8859-5",       6 },  // Cyrillic
    { "cp 1251",          6 },  // Cyrillic
    { "koi8-u",           6 },  // Cyrillic
    { "ibm866",           6 },  // Cyrillic
    { "big5",             5 },  // Chinese Traditional
    { "big5-hkscs",       5 },  // Chinese Traditional
    { "gb18030",          4 },  // Chinese Simplified
    { "gbk",              4 },  // Chinese Simplified
    { "gb2312",           4 },  // Chinese Simplified
    { "euckr",           10 },  // Korean
    { "sjis",             9 },  // Japanese
    { "jis7",             9 },  // Japanese
    { "eucjp",            9 },  // Japanese
    { "iso 8859-7",       7 },  // Greek
    { "cp 1253",          7 },  // Greek
    { "iso 8859-6",       1 },  // Arabic
    { "cp 1256",          1 },  // Arabic
    { "iso 8859-8",       8 },  // Hebrew
    { "iso 8859-8-i",     8 },  // Hebrew
    { "cp 1255",          8 },  // Hebrew
    { "iso 8859-9",      12 },  // Turkish
    { "tis620",          11 },  // Thai
    { "iso 8859-11",     11 },  // Thai
    { "cp 874",          11 },  // Thai
    { "cp 1258",         17 },  // Vietnamese
    { "tscii",           14 },  // Tamil
    { "utf8",            15 },  // Unicode
    { "utf16",           15 },  // Unicode
    { "utf7",            15 },  // Unicode
    { "ucs2",            15 },  // Unicode
    { "iso-10646-ucs-2", 15 },  // Unicode
    { "winsami2",        16 },  // Northern Saami
    { 0, 0 }                    // end-of-table sentinel
};
00189
00190
// One row of the built-in alias table.
struct Builtin
{
    const char* index;  // name as requested (compared after lower-casing in codecForName)
    const char* data;   // codec name handed to QTextCodec::codecForName() instead
};

// Alternative spellings that KCharsets::codecForName() retries when
// QTextCodec does not recognise the requested name directly.
// Terminated by a { 0, 0 } row.
static const struct Builtin builtin[] = {
    // KOI8 variants
    { "iso-ir-111",        "koi8-r" },
    { "koi8-ru",           "koi8-u" },
    { "koi unified",       "koi8-r" },

    // ASCII is served by ISO 8859-1
    { "us-ascii",          "iso 8859-1" },
    { "usascii",           "iso 8859-1" },
    { "ascii",             "iso 8859-1" },

    // Unicode spellings
    { "x-utf-8",           "utf-8" },
    { "x-utf-7",           "utf-7" },
    { "unicode-1-1-utf-7", "utf-7" },
    { "utf-16",            "iso-10646-ucs-2" },
    { "utf16",             "iso-10646-ucs-2" },
    { "ucs2",              "iso-10646-ucs-2" },
    { "iso10646-1",        "iso-10646-ucs-2" },

    // Chinese
    { "gb18030.2000-1",    "gb18030" },
    { "gb18030.2000-0",    "gb18030" },
    { "gbk-0",             "gbk" },
    { "gb2312.1980-0",     "gbk" },
    { "gb_2312-80",        "gbk" },

    // Korean
    { "x-euc-kr",          "euckr" },

    // Japanese
    { "jisx0201.1976-0",   "eucjp" },
    { "jisx0208.1983-0",   "eucjp" },
    { "jisx0208.1990-0",   "eucjp" },
    { "jisx0208.1997-0",   "eucjp" },
    { "jisx0212.1990-0",   "eucjp" },
    { "jisx0213.2000-1",   "eucjp" },
    { "jisx0213.2000-2",   "eucjp" },

    // "windowsNNN" spellings
    { "windows850",        "ibm850" },
    { "windows866",        "ibm866" },
    { "windows1251",       "cp 1251" },
    { "windows1252",       "cp 1252" },
    { "windows1253",       "cp 1253" },
    { "windows1254",       "cp 1254" },
    { "windows1255",       "cp 1255" },
    { "windows1256",       "cp 1256" },
    { "windows1257",       "cp 1257" },
    { "windows1258",       "cp 1258" },
    { "windows-850",       "ibm850" },
    { "windows-866",       "ibm866" },

    // "x-windows-NNN" spellings
    { "x-windows-850",     "ibm850" },
    { "x-windows-866",     "ibm866" },
    { "x-windows-1250",    "cp 1250" },
    { "x-windows-1251",    "cp 1251" },
    { "x-windows-1252",    "cp 1252" },
    { "x-windows-1253",    "cp 1253" },
    { "x-windows-1254",    "cp 1254" },
    { "x-windows-1255",    "cp 1255" },
    { "x-windows-1256",    "cp 1256" },
    { "x-windows-1257",    "cp 1257" },
    { "x-windows-1258",    "cp 1258" },

    // "cpNNN" / "cp-NNN" spellings
    { "cp819",             "iso 8859-1" },
    { "cp850",             "ibm850" },
    { "cp866",             "ibm866" },
    { "cp-819",            "iso 8859-1" },
    { "cp-850",            "ibm850" },
    { "cp-866",            "ibm866" },
    { "cp-1250",           "cp 1250" },
    { "cp-1251",           "cp 1251" },
    { "cp-1252",           "cp 1252" },
    { "cp-1253",           "cp 1253" },
    { "cp-1254",           "cp 1254" },
    { "cp-1255",           "cp 1255" },
    { "cp-1256",           "cp 1256" },
    { "cp-1257",           "cp 1257" },
    { "cp-1258",           "cp 1258" },
    { "cp-10000",          "apple roman" },

    // "x-cp-NNN" spellings
    { "x-cp-850",          "ibm850" },
    { "x-cp-866",          "ibm866" },
    { "x-cp-1250",         "cp 1250" },
    { "x-cp-1251",         "cp 1251" },
    { "x-cp-1252",         "cp 1252" },
    { "x-cp-1253",         "cp 1253" },
    { "x-cp-1254",         "cp 1254" },
    { "x-cp-1255",         "cp 1255" },
    { "x-cp-1256",         "cp 1256" },
    { "x-cp-1257",         "cp 1257" },
    { "x-cp-1258",         "cp 1258" },
    { "x-cp-10000",        "apple roman" },

    { "ibm819",            "iso 8859-1" },

    // Thai
    { "thai-tis620",       "iso 8859-11" },
    { "windows-874",       "cp 874" },
    { "windows874",        "cp 874" },
    { "x-windows-874",     "cp 874" },
    { "x-cp-874",          "cp 874" },
    { "ibm 874",           "cp 874" },
    { "ibm874",            "cp 874" },
    { "x-ibm874",          "cp 874" },

    // Miscellaneous
    { "ksc5601.1987-0",    "euckr" },
    { "x-winsami2",        "winsami2" },
    { "x-mac-roman",       "apple roman" },
    { "macintosh",         "apple roman" },
    { "mac",               "apple roman" },
    { "csiso2022jp",       "jis7" },
    { "big5-eten",         "big5-hkscs" },
    { "cp950",             "big5-hkscs" },

    { 0, 0 }               // end-of-table sentinel
};
00292
00293
00294
// One row of the charmap-file alias table.
struct Aliases
{
    const char* index;  // requested (lower-case) name
    const char* data;   // name to look for instead
};

// Names that KCharsets::codecForName() substitutes before it searches the
// charmap directory for a matching file.  Terminated by a { 0, 0 } row.
static const struct Aliases aliases[] = {
    { "cp852",         "ibm852" },
    { "cp-852",        "ibm852" },
    { "x-cp-852",      "ibm852" },
    { "windows852",    "ibm852" },
    { "windows-852",   "ibm852" },
    { "x-windows-852", "ibm852" },
    { 0, 0 }           // end-of-table sentinel
};
00307
00308
00309
00310
// One row of the last-resort substitution table.
struct ConversionHints
{
    const char* index;  // requested (lower-case) name
    const char* data;   // codec name to try in its place
};

// Substitute codecs that KCharsets::codecForName() tries as its final step,
// after every other lookup has failed.  Terminated by a { 0, 0 } row.
static const struct ConversionHints conversion_hints[] = {
    { "cp1250",       "iso-8859-2" },
    { "koi8-r",       "iso-8859-5" },
    { "koi8-u",       "koi8-r" },

    { "pt 154",       "cp 1251" },
    { "paratype-154", "cp 1251" },
    { "pt-154",       "cp 1251" },
    { 0, 0 }          // end-of-table sentinel
};
00324
00325
00326
00327
/**
 * Linear search through one of the sentinel-terminated lookup tables.
 *
 * T must provide an @c index member (a C string, null in the terminating
 * row) and a @c data member convertible to Data.
 *
 * @param start first row of the table
 * @param entry key to look for, compared with qstrcmp()
 * @return the @c data of the first row whose @c index equals @p entry,
 *         or 0 converted to Data when the sentinel row is reached first.
 */
template< typename T, typename Data >
static Data kcharsets_array_search( const T* start, const char* entry )
{
    const T* row = start;
    while( row->index != 0 ) {
        if( qstrcmp( row->index, entry ) == 0 )
            return row->data;
        ++row;
    }
    return 0;
}
00338
00339
00340 class KCharsetsPrivate
00341 {
00342 public:
00343 KCharsetsPrivate(KCharsets* _kc)
00344 : codecForNameDict(43, false)
00345 {
00346 db = 0;
00347 kc = _kc;
00348 }
00349 ~KCharsetsPrivate()
00350 {
00351 delete db;
00352 }
00353 QFontDatabase *db;
00354 QAsciiDict<QTextCodec> codecForNameDict;
00355 KCharsets* kc;
00356 };
00357
00358
00359
00360 KCharsets::KCharsets()
00361 {
00362 d = new KCharsetsPrivate(this);
00363 }
00364
00365 KCharsets::~KCharsets()
00366 {
00367 delete d;
00368 }
00369
00370 QChar KCharsets::fromEntity(const QString &str)
00371 {
00372 QChar res = QChar::null;
00373
00374 int pos = 0;
00375 if(str[pos] == '&') pos++;
00376
00377
00378 if (str[pos] == '#' && str.length()-pos > 1) {
00379 bool ok;
00380 pos++;
00381 if (str[pos] == 'x' || str[pos] == 'X') {
00382 pos++;
00383
00384 QString tmp(str.unicode()+pos, str.length()-pos);
00385 res = tmp.toInt(&ok, 16);
00386 } else {
00387
00388 QString tmp(str.unicode()+pos, str.length()-pos);
00389 res = tmp.toInt(&ok, 10);
00390 }
00391 return res;
00392 }
00393
00394 const entity *e = kde_findEntity(str.ascii(), str.length());
00395
00396 if(!e)
00397 {
00398
00399 return QChar::null;
00400 }
00401
00402
00403 return QChar(e->code);
00404 }
00405
00406 QChar KCharsets::fromEntity(const QString &str, int &len)
00407 {
00408
00409
00410 len = 8;
00411 while(len > 0)
00412 {
00413 QString tmp = str.left(len);
00414 QChar res = fromEntity(tmp);
00415 if( res != QChar::null ) return res;
00416 len--;
00417 }
00418 return QChar::null;
00419 }
00420
00421
00422 QString KCharsets::toEntity(const QChar &ch)
00423 {
00424 QString ent;
00425 ent.sprintf("�x%x;", ch.unicode());
00426 return ent;
00427 }
00428
00429 QString KCharsets::resolveEntities( const QString &input )
00430 {
00431 QString text = input;
00432 const QChar *p = text.unicode();
00433 const QChar *end = p + text.length();
00434 const QChar *ampersand = 0;
00435 bool scanForSemicolon = false;
00436
00437 for ( ; p < end; ++p ) {
00438 const QChar ch = *p;
00439
00440 if ( ch == '&' ) {
00441 ampersand = p;
00442 scanForSemicolon = true;
00443 continue;
00444 }
00445
00446 if ( ch != ';' || scanForSemicolon == false )
00447 continue;
00448
00449 assert( ampersand );
00450
00451 scanForSemicolon = false;
00452
00453 const QChar *entityBegin = ampersand + 1;
00454
00455 const uint entityLength = p - entityBegin;
00456 if ( entityLength == 0 )
00457 continue;
00458
00459 const QChar entityValue = KCharsets::fromEntity( QConstString( entityBegin, entityLength ).string() );
00460 if ( entityValue.isNull() )
00461 continue;
00462
00463 const uint ampersandPos = ampersand - text.unicode();
00464
00465 text[ (int)ampersandPos ] = entityValue;
00466 text.remove( ampersandPos + 1, entityLength + 1 );
00467 p = text.unicode() + ampersandPos;
00468 end = text.unicode() + text.length();
00469 ampersand = 0;
00470 }
00471
00472 return text;
00473 }
00474
00475 QStringList KCharsets::availableEncodingNames()
00476 {
00477 QStringList available;
00478 for ( const char* const* pos = charsets_for_encoding; *pos; ++pos ) {
00479
00480 available.append( QString::fromLatin1( *pos ));
00481 }
00482 return available;
00483 }
00484
00485 QString KCharsets::languageForEncoding( const QString &encoding )
00486 {
00487 int lang = kcharsets_array_search< LanguageForEncoding, int >
00488 ( language_for_encoding, encoding.latin1());
00489 return i18n( language_names[lang] );
00490 }
00491
00492 QString KCharsets::encodingForName( const QString &descriptiveName )
00493 {
00494 const int left = descriptiveName.findRev( '(' );
00495
00496 if (left<0)
00497 return descriptiveName.stripWhiteSpace();
00498
00499 QString name(descriptiveName.mid(left+1));
00500
00501 const int right = name.findRev( ')' );
00502
00503 if (right<0)
00504 return name;
00505
00506 return name.left(right).stripWhiteSpace();
00507 }
00508
00509 QStringList KCharsets::descriptiveEncodingNames()
00510 {
00511
00512 QStringList encodings;
00513 for ( const LanguageForEncoding* pos = language_for_encoding; pos->index; ++pos ) {
00514 const QString name = QString::fromLatin1( pos->index );
00515 const QString description = i18n( language_names[ pos->data ] );
00516 encodings.append( i18n("Descriptive Encoding Name", "%1 ( %2 )"). arg ( description ). arg( name ) );
00517 }
00518 encodings.sort();
00519 return encodings;
00520 }
00521
00522 QTextCodec *KCharsets::codecForName(const QString &n) const
00523 {
00524 bool b;
00525 return codecForName( n, b );
00526 }
00527
00528 QTextCodec *KCharsets::codecForName(const QString &n, bool &ok) const
00529 {
00530 ok = true;
00531
00532 QTextCodec* codec = 0;
00533
00534 if((codec = d->codecForNameDict[n.isEmpty() ? "->locale<-" : n.latin1()]))
00535 return codec;
00536
00537 if (n.isEmpty()) {
00538 codec = KGlobal::locale()->codecForEncoding();
00539 d->codecForNameDict.replace("->locale<-", codec);
00540 return codec;
00541 }
00542
00543 QCString name = n.lower().latin1();
00544 QCString key = name;
00545 if (name.right(8) == "_charset")
00546 name.truncate(name.length()-8);
00547
00548 if (name.isEmpty()) {
00549 ok = false;
00550 return QTextCodec::codecForName("iso8859-1");
00551 }
00552
00553 codec = QTextCodec::codecForName(name);
00554
00555 if(codec) {
00556 d->codecForNameDict.replace(key, codec);
00557 return codec;
00558 }
00559
00560
00561
00562 QCString cname = kcharsets_array_search< Builtin, const char* >( builtin, name.data());
00563
00564 if(!cname.isEmpty())
00565 codec = QTextCodec::codecForName(cname);
00566
00567 if(codec)
00568 {
00569 d->codecForNameDict.replace(key, codec);
00570 return codec;
00571 }
00572
00573 QString dir;
00574 {
00575 KConfigGroupSaver cfgsav( KGlobal::config(), "i18n" );
00576 dir = KGlobal::config()->readPathEntry("i18ndir", QString::fromLatin1("/usr/share/i18n/charmaps"));
00577 }
00578
00579
00580
00581 cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());
00582
00583 if(cname.isEmpty())
00584 cname = name;
00585 cname = cname.upper();
00586
00587 const QString basicName = QString::fromLatin1(cname);
00588 kdDebug() << k_funcinfo << endl << " Trying to find " << cname << " in " << dir << endl;
00589
00590 QString charMapFileName;
00591 bool gzipped = false;
00592 QDir qdir(dir);
00593 if (!qdir.exists()) {
00594
00595 }
00596 else if (qdir.exists(basicName, false)) {
00597 charMapFileName = basicName;
00598 }
00599 else if (qdir.exists(basicName+".gz", false)) {
00600 charMapFileName = basicName + ".gz";
00601 gzipped = true;
00602 }
00603 else {
00604
00605
00606
00607 QRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+");
00608 if ( regexp.search(basicName) != -1) {
00609 const QString num = regexp.cap(4);
00610 if (num.isEmpty()) {
00611
00612 }
00613 else if (qdir.exists("IBM"+num)) {
00614 charMapFileName = "IBM"+num;
00615 }
00616 else if (qdir.exists("IBM"+num+".gz")) {
00617 charMapFileName = "IBM"+num+".gz";
00618 gzipped = true;
00619 }
00620 else if (qdir.exists("CP"+num)) {
00621 charMapFileName = "CP"+num;
00622 }
00623 else if (qdir.exists("CP"+num+".gz")) {
00624 charMapFileName = "CP"+num+".gz";
00625 gzipped = true;
00626 }
00627 }
00628 }
00629
00630 if (gzipped && !charMapFileName.isEmpty()) {
00631 KQIODeviceGZip gzip(dir + "/" + charMapFileName);
00632 if (gzip.open(IO_ReadOnly)) {
00633 kdDebug() << "Loading gzipped charset..." << endl;
00634 codec = QTextCodec::loadCharmap(&gzip);
00635 gzip.close();
00636 }
00637 else
00638 kdWarning() << "Could not open gzipped charset!" << endl;
00639 }
00640 else if (!charMapFileName.isEmpty()) {
00641 codec = QTextCodec::loadCharmapFile(dir + "/" + charMapFileName);
00642 }
00643
00644 if(codec) {
00645 d->codecForNameDict.replace(key, codec);
00646 return codec;
00647 }
00648
00649
00650
00651 cname = kcharsets_array_search< ConversionHints, const char* >( conversion_hints, (const char*)name.data() );
00652
00653 if(!cname.isEmpty())
00654 codec = QTextCodec::codecForName(cname);
00655
00656 if(codec) {
00657 d->codecForNameDict.replace(key, codec);
00658 return codec;
00659 }
00660
00661
00662 ok = false;
00663 return QTextCodec::codecForName("iso8859-1");
00664 }