KCodecs

kcharsets.cpp
1 /*
2  This file is part of the KDE libraries
3 
4  SPDX-FileCopyrightText: 1999 Lars Knoll <[email protected]>
5  SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <[email protected]>
6  SPDX-FileCopyrightText: 2007 Nick Shaforostoff <[email protected]>
7 
8  SPDX-License-Identifier: LGPL-2.0-or-later
9 */
10 #include "kcharsets.h"
11 #include "kcharsets_p.h"
12 #include "kcodecs_debug.h"
13 
14 #include "kusasciitextcodec.h"
15 #include <kentities.h>
16 
17 #include <QHash>
18 #include <QTextCodec>
19 
20 #include <algorithm>
21 #include <assert.h>
22 
23 /*
24  * The encoding names (like "ISO 8859-1") in this list are user-visible,
25  * and should be mostly uppercase.
26  * Generate with generate_string_table.pl (located in kde-dev-scripts),
27  * input data:
28 ISO 8859-1
29 i18n:Western European
30 ISO 8859-15
31 i18n:Western European
32 ISO 8859-14
33 i18n:Western European
34 cp 1252
35 i18n:Western European
36 IBM850
37 i18n:Western European
38 ISO 8859-2
39 i18n:Central European
40 ISO 8859-3
41 i18n:Central European
42 ISO 8859-4
43 i18n:Baltic
44 ISO 8859-13
45 i18n:Baltic
46 ISO 8859-16
47 i18n:South-Eastern Europe
48 cp 1250
49 i18n:Central European
50 cp 1254
51 i18n:Turkish
52 cp 1257
53 i18n:Baltic
54 KOI8-R
55 i18n:Cyrillic
56 ISO 8859-5
57 i18n:Cyrillic
58 cp 1251
59 i18n:Cyrillic
60 KOI8-U
61 i18n:Cyrillic
62 IBM866
63 i18n:Cyrillic
64 Big5
65 i18n:Chinese Traditional
66 Big5-HKSCS
67 i18n:Chinese Traditional
68 GB18030
69 i18n:Chinese Simplified
70 GBK
71 i18n:Chinese Simplified
72 GB2312
73 i18n:Chinese Simplified
74 EUC-KR
75 i18n:Korean
76 windows-949
77 i18n:Korean
78 sjis
79 i18n:Japanese
80 ISO-2022-JP
81 i18n:Japanese
82 EUC-JP
83 i18n:Japanese
84 ISO 8859-7
85 i18n:Greek
86 cp 1253
87 i18n:Greek
88 ISO 8859-6
89 i18n:Arabic
90 cp 1256
91 i18n:Arabic
92 ISO 8859-8
93 i18n:Hebrew
94 ISO 8859-8-I
95 i18n:Hebrew
96 cp 1255
97 i18n:Hebrew
98 ISO 8859-9
99 i18n:Turkish
100 TIS620
101 i18n:Thai
102 ISO 8859-11
103 i18n:Thai
104 UTF-8
105 i18n:Unicode
106 UTF-16
107 i18n:Unicode
108 utf7
109 i18n:Unicode
110 ucs2
111 i18n:Unicode
112 ISO 10646-UCS-2
113 i18n:Unicode
114 windows-1258
115 i18n:Other
116 IBM874
117 i18n:Other
118 TSCII
119 i18n:Other
120  */
121 /*
122  * Notes about the table:
123  *
124  * - The following entries were disabled and removed from the table:
125 ibm852
126 i18n:Central European
127 pt 154
128 i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt
129  *
130  * - ISO 8859-11 is the deprecated name of TIS-620
131  * - utf7 is not in Qt
132  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
133  * - windows-1258: TODO
134  * - IBM874: TODO
135  * - TSCII: TODO
136  */
137 
138 /*
139  * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that
140  * statically initialised text should be translated so that it expands to just
141  * the string that should be translated, making it possible to use it in the
142  * single string construct below.
143  */
144 #undef QT_TRANSLATE_NOOP3
145 #define QT_TRANSLATE_NOOP3(a, b, c) b
146 
147 /*
148  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
149  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
150  * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP.
151  */
152 
153 static const char language_for_encoding_string[] =
154  "ISO 8859-1\0"
155  QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0"
156  "ISO 8859-15\0"
157  "ISO 8859-14\0"
158  "cp 1252\0"
159  "IBM850\0"
160  "ISO 8859-2\0"
161  QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0"
162  "ISO 8859-3\0"
163  "ISO 8859-4\0"
164  QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0"
165  "ISO 8859-13\0"
166  "ISO 8859-16\0"
167  QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0"
168  "cp 1250\0"
169  "cp 1254\0"
170  QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0"
171  "cp 1257\0"
172  "KOI8-R\0"
173  QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0"
174  "ISO 8859-5\0"
175  "cp 1251\0"
176  "KOI8-U\0"
177  "IBM866\0"
178  "Big5\0"
179  QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0"
180  "Big5-HKSCS\0"
181  "GB18030\0"
182  QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0"
183  "GBK\0"
184  "GB2312\0"
185  "EUC-KR\0"
186  QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0"
187  "windows-949\0"
188  "sjis\0"
189  QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0"
190  "ISO-2022-JP\0"
191  "EUC-JP\0"
192  "ISO 8859-7\0"
193  QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0"
194  "cp 1253\0"
195  "ISO 8859-6\0"
196  QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0"
197  "cp 1256\0"
198  "ISO 8859-8\0"
199  QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0"
200  "ISO 8859-8-I\0"
201  "cp 1255\0"
202  "ISO 8859-9\0"
203  "TIS620\0"
204  QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0"
205  "ISO 8859-11\0"
206  "UTF-8\0"
207  QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item Text character set")"\0"
208  "UTF-16\0"
209  "utf7\0"
210  "ucs2\0"
211  "ISO 10646-UCS-2\0"
212  "windows-1258\0"
213  QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0"
214  "IBM874\0"
215  "TSCII\0"
216  "\0";
217 
218 static const int language_for_encoding_indices[] = {
219  0, 11, 28, 11, 40, 11, 52, 11, 60, 11, 67, 78, 95, 78, 106, 117, 124, 117, 136, 148, 169, 78, 177, 185, 193, 117, 201, 208, 217, 208, 228,
220  208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419,
221  426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1};
222 
223 /*
224  * GENERATED CODE ENDS HERE
225  */
226 
227 /*
228  * defines some different names for codecs that are built into Qt.
229  * The names in this list must be lower-case.
230  * input data for generate_string_table.pl:
231 iso-ir-111
232 koi8-r
233 koi unified
234 koi8-r
235 us-ascii
236 iso 8859-1
237 usascii
238 iso 8859-1
239 ascii
240 iso 8859-1
241 unicode-1-1-utf-7
242 utf-7
243 ucs2
244 iso-10646-ucs-2
245 iso10646-1
246 iso-10646-ucs-2
247 gb18030.2000-1
248 gb18030
249 gb18030.2000-0
250 gb18030
251 gbk-0
252 gbk
253 gb2312
254 gbk
255 gb2312.1980-0
256 gbk
257 big5-0
258 big5
259 euc-kr
260 euckr
261 cp 949
262 windows-949
263 euc-jp
264 eucjp
265 jisx0201.1976-0
266 eucjp
267 jisx0208.1983-0
268 eucjp
269 jisx0208.1990-0
270 eucjp
271 jisx0208.1997-0
272 eucjp
273 jisx0212.1990-0
274 eucjp
275 jisx0213.2000-1
276 eucjp
277 jisx0213.2000-2
278 eucjp
279 shift_jis
280 sjis
281 shift-jis
282 sjis
283 sjis
284 sjis
285 iso-2022-jp
286 jis7
287 windows850
288 ibm850
289 windows866
290 ibm866
291 windows-850
292 ibm850
293 windows-866
294 ibm866
295 cp-10000
296 apple roman
297 thai-tis620
298 iso 8859-11
299 windows-874
300 ibm874
301 windows874
302 ibm874
303 cp-874
304 ibm874
305 ksc5601.1987-0
306 euckr
307 ks_c_5601-1987
308 euckr
309 mac-roman
310 apple roman
311 macintosh
312 apple roman
313 mac
314 apple roman
315 csiso2022jp
316 iso-2022-jp
317 */
318 /*
319  * Notes about the table:
320  * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
321  * - utf7 is not in Qt
322  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
323  * - sjis: appears on the table for x-sjis
324  * - jis7: ISO-2022-JP is now the default name in Qt4
325  * - cp-874: is it really needed?
326  * - mac-roman: appears on the table for x-mac-roman
327  * - csiso2022jp: See bug #77243
328  */
329 
330 /*
331  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
332  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
333  */
334 
335 static const char builtin_string[] =
336  "iso-ir-111\0"
337  "koi8-r\0"
338  "koi unified\0"
339  "us-ascii\0"
340  "iso 8859-1\0"
341  "usascii\0"
342  "ascii\0"
343  "unicode-1-1-utf-7\0"
344  "utf-7\0"
345  "ucs2\0"
346  "iso-10646-ucs-2\0"
347  "iso10646-1\0"
348  "gb18030.2000-1\0"
349  "gb18030\0"
350  "gb18030.2000-0\0"
351  "gbk-0\0"
352  "gbk\0"
353  "gb2312\0"
354  "gb2312.1980-0\0"
355  "big5-0\0"
356  "big5\0"
357  "euc-kr\0"
358  "euckr\0"
359  "cp 949\0"
360  "windows-949\0"
361  "euc-jp\0"
362  "eucjp\0"
363  "jisx0201.1976-0\0"
364  "jisx0208.1983-0\0"
365  "jisx0208.1990-0\0"
366  "jisx0208.1997-0\0"
367  "jisx0212.1990-0\0"
368  "jisx0213.2000-1\0"
369  "jisx0213.2000-2\0"
370  "shift_jis\0"
371  "sjis\0"
372  "shift-jis\0"
373  "iso-2022-jp\0"
374  "jis7\0"
375  "windows850\0"
376  "ibm850\0"
377  "windows866\0"
378  "ibm866\0"
379  "windows-850\0"
380  "windows-866\0"
381  "cp-10000\0"
382  "apple roman\0"
383  "thai-tis620\0"
384  "iso 8859-11\0"
385  "windows-874\0"
386  "ibm874\0"
387  "windows874\0"
388  "cp-874\0"
389  "ksc5601.1987-0\0"
390  "ks_c_5601-1987\0"
391  "mac-roman\0"
392  "macintosh\0"
393  "mac\0"
394  "csiso2022jp\0"
395  "\0";
396 
397 static const int builtin_indices[] = {0, 11, 18, 11, 30, 39, 50, 39, 58, 39, 64, 82, 88, 93, 109, 93, 120, 135, 143, 135, 158, 164,
398  168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240,
399  326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469,
400  481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1};
401 
402 /*
403  * GENERATED CODE ENDS HERE
404  */
405 
406 /*
407  * some last resort hints in case the charmap file couldn't be found.
408  * This gives at least a partial conversion and helps making things readable.
409  *
410  * the name used as input here is already converted to the more canonical
411  * name as defined in the aliases array.
412  *
413  * Input data:
414 cp1250
415 iso-8859-2
416 koi8-r
417 iso-8859-5
418 koi8-u
419 koi8-r
420 pt 154
421 windows-1251
422 paratype-154
423 windows-1251
424 pt-154
425 windows-1251
426  */
427 /* Notes:
428  * - KDE has always used "CP 1251" as the best fallback for PT 154. As Qt no longer offers PT 154, code page 1251 (windows-1251) is used as the fallback.
429  */
430 
431 /*
432  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
433  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
434  */
435 
436 static const char conversion_hints_string[] =
437  "cp1250\0"
438  "iso-8859-2\0"
439  "koi8-r\0"
440  "iso-8859-5\0"
441  "koi8-u\0"
442  "pt 154\0"
443  "windows-1251\0"
444  "paratype-154\0"
445  "pt-154\0"
446  "\0";
447 
448 static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1};
449 
450 /*
451  * GENERATED CODE ENDS HERE
452  */
453 
// Wrapper struct whose only member is a KCharsets object; it exists so that
// the object can be held by the globalCharsets global static.
454 struct KCharsetsSingletonPrivate {
455  KCharsets instance;
456 };
457 
458 Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets)
459 
460 // search an array of items index/data, find first matching index
461 // and return data, or return 0
462 static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
463 {
464  for (int i = 0; indices[i] != -1; i += 2) {
465  if (qstrcmp(start + indices[i], entry) == 0) {
466  return start + indices[i + 1];
467  }
468  }
469  return nullptr;
470 }
471 
472 bool KCharsetsPrivate::isUsAsciiTextCodecRequest(const QByteArray &name) const
473 {
474  if (usAsciiTextCodec->name().compare(name, Qt::CaseInsensitive) == 0) {
475  return true;
476  }
477  const QList<QByteArray> aliases = usAsciiTextCodec->aliases();
478  return std::any_of(aliases.constBegin(), aliases.constEnd(), [name](const QByteArray &aliasName) {
479  return (aliasName.compare(name, Qt::CaseInsensitive) == 0);
480  });
481 }
482 
483 // --------------------------------------------------------------------------
484 
486  : d(new KCharsetsPrivate(this))
487 {
488 }
489 
// The destructor needs no hand-written logic; the compiler-generated one is used.
490 KCharsets::~KCharsets() = default;
491 
493 {
494  QChar res = QChar::Null;
495 
496  if (str.isEmpty()) {
497  return QChar::Null;
498  }
499 
500  int pos = 0;
501  if (str[pos] == QLatin1Char('&')) {
502  pos++;
503  }
504 
505  // Check for '&#000' or '&#x0000' sequence
506  if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) {
507  bool ok;
508  pos++;
509  if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
510  pos++;
511  // '&#x0000', hexadecimal character reference
512  const QString tmp(str.mid(pos));
513  res = QChar(tmp.toInt(&ok, 16));
514  } else {
515  // '&#0000', decimal character reference
516  const QString tmp(str.mid(pos));
517  res = QChar(tmp.toInt(&ok, 10));
518  }
519  if (ok) {
520  return res;
521  } else {
522  return QChar::Null;
523  }
524  }
525 
526  const QByteArray raw(str.toLatin1());
527  const entity *e = KCodecsEntities::kde_findEntity(raw.data(), raw.length());
528 
529  if (!e) {
530  // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length();
531  return QChar::Null;
532  }
533  // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code;
534 
535  return QChar(e->code);
536 }
537 
538 QChar KCharsets::fromEntity(const QString &str, int &len)
539 {
540  // entities are never longer than 8 chars... we start from
541  // that length and work backwards...
542  len = 8;
543  while (len > 0) {
544  QString tmp = str.left(len);
545  QChar res = fromEntity(tmp);
546  if (res != QChar::Null) {
547  return res;
548  }
549  len--;
550  }
551  return QChar::Null;
552 }
553 
555 {
556  return QString::asprintf("&#0x%x;", ch.unicode());
557 }
558 
560 {
 // Replaces every "&...;" span that KCharsets::fromEntity() recognises with
 // the character it denotes, working on a copy of the input; unrecognised
 // spans are left as they are.
561  QString text = input;
562  const QChar *p = text.unicode();
563  const QChar *end = p + text.length();
 // Position of the most recently seen '&', if any.
564  const QChar *ampersand = nullptr;
 // True while an '&' has been seen and its closing ';' has not.
565  bool scanForSemicolon = false;
566
567  for (; p < end; ++p) {
568  const QChar ch = *p;
569
 // A new '&' always restarts the search, discarding any earlier unmatched one.
570  if (ch == QLatin1Char('&')) {
571  ampersand = p;
572  scanForSemicolon = true;
573  continue;
574  }
575
 // Only a ';' that follows a pending '&' is of interest.
576  if (ch != QLatin1Char(';') || scanForSemicolon == false) {
577  continue;
578  }
579
580  assert(ampersand);
581
582  scanForSemicolon = false;
583
584  const QChar *entityBegin = ampersand + 1;
585
 // Number of characters strictly between the '&' and the ';'.
586  const uint entityLength = p - entityBegin;
587  if (entityLength == 0) {
588  continue;
589  }
590
591  const QChar entityValue = KCharsets::fromEntity(QString(entityBegin, entityLength));
592  if (entityValue.isNull()) {
593  continue;
594  }
595
596  const uint ampersandPos = ampersand - text.unicode();
597
 // Overwrite the '&' with the resolved character, then drop the entity
 // text and its ';' (entityLength + 1 characters).
598  text[(int)ampersandPos] = entityValue;
599  text.remove(ampersandPos + 1, entityLength + 1);
 // The string was just modified, so the raw pointers are re-derived from
 // it. p is set to the replacement character; the loop's ++p then moves on
 // to the character after it.
600  p = text.unicode() + ampersandPos;
601  end = text.unicode() + text.length();
602  ampersand = nullptr;
603  }
604
605  return text;
606 }
607 
609 {
610  QStringList available;
611  for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
612  available.append(QString::fromUtf8(language_for_encoding_string + *p));
613  }
614  available.sort();
615  return available;
616 }
617 
619 {
620  const char *lang = kcharsets_array_search(language_for_encoding_string, language_for_encoding_indices, encoding.toUtf8().data());
621  if (lang) {
622  return tr("%1 ( %2 )", "@item %1 character set, %2 encoding").arg(tr(lang, "@item Text character set"), encoding);
623  } else {
624  return tr("Other encoding (%1)", "@item").arg(encoding);
625  }
626 }
627 
628 QString KCharsets::encodingForName(const QString &descriptiveName) const
629 {
630  const int left = descriptiveName.lastIndexOf(QLatin1Char('('));
631 
632  if (left < 0) { // No parenthesis, so assume it is a normal encoding name
633  return descriptiveName.trimmed();
634  }
635 
636  QString name(descriptiveName.mid(left + 1));
637 
638  const int right = name.lastIndexOf(QLatin1Char(')'));
639 
640  if (right < 0) {
641  return name;
642  }
643 
644  return name.left(right).trimmed();
645 }
646 
648 {
649  QStringList encodings;
650  for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
651  const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
652  const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
653  encodings.append(tr("%1 ( %2 )", "@item Text encoding: %1 character set, %2 encoding").arg(description, name));
654  }
655  encodings.sort();
656  return encodings;
657 }
658 
660 {
661  if (!d->encodingsByScript.isEmpty()) {
662  return d->encodingsByScript;
663  }
664  int i;
665  for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
666  const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
667  const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
668 
669  for (i = 0; i < d->encodingsByScript.size(); ++i) {
670  if (d->encodingsByScript.at(i).at(0) == description) {
671  d->encodingsByScript[i].append(name);
672  break;
673  }
674  }
675 
676  if (i == d->encodingsByScript.size()) {
677  d->encodingsByScript.append(QStringList() << description << name);
678  }
679  }
680  return d->encodingsByScript;
681 }
682 
683 #if KCODECS_BUILD_DEPRECATED_SINCE(5, 101)
685 {
686  return d->codecForName(n);
687 }
688 #endif
689 
690 QTextCodec *KCharsetsPrivate::codecForName(const QString &n)
691 {
692  if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) {
693  return QTextCodec::codecForName("gb18030");
694  }
695  const QByteArray name(n.toLatin1());
696  QTextCodec *codec = codecForNameOrNull(name);
697  if (codec) {
698  return codec;
699  } else {
700  return QTextCodec::codecForName("iso-8859-1");
701  }
702 }
703 
704 #if KCODECS_BUILD_DEPRECATED_SINCE(5, 101)
705 QTextCodec *KCharsets::codecForName(const QString &n, bool &ok) const
706 {
707  return d->codecForName(n, ok);
708 };
709 #endif
710 
711 QTextCodec *KCharsetsPrivate::codecForName(const QString &n, bool &ok)
712 {
713  if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) {
714  ok = true;
715  return QTextCodec::codecForName("gb18030");
716  }
717  const QByteArray name(n.toLatin1());
718  QTextCodec *codec = codecForNameOrNull(name);
719  if (codec) {
720  ok = true;
721  return codec;
722  } else {
723  ok = false;
724  return QTextCodec::codecForName("iso-8859-1");
725  }
726 }
727 
728 #if KCODECS_BUILD_DEPRECATED_SINCE(5, 101)
729 QTextCodec *KCharsets::codecForNameOrNull(const QByteArray &n) const
730 {
731  return d->codecForNameOrNull(n);
732 }
733 #endif
734 
735 QTextCodec *KCharsetsPrivate::codecForNameOrNull(const QByteArray &n)
736 {
737  QTextCodec *codec = nullptr;
738 
739  if (n.isEmpty()) {
740  // TODO: Any better ideas ?
741  // No name, assume system locale
742  const QByteArray locale = "->locale<-";
743  if (codecForNameDict.contains(locale)) {
744  return codecForNameDict.value(locale);
745  }
746  codec = QTextCodec::codecForLocale();
747  codecForNameDict.insert("->locale<-", codec);
748  return codec;
749  }
750  // For a non-empty name, lookup the "dictionary", in a case-sensitive way.
751  else if (codecForNameDict.contains(n)) {
752  return codecForNameDict.value(n);
753  }
754 
755  // If the name is not in the hash table,
756  // first check ourselves if our fixed variant of a US-ASCII codec should be returned:
757  // API docs of QTextCodec do not specify the handling of custom codec instances
758  // on look-up when there are multiple codecs supporting the same name.
759  // The code of Qt 5.15 prepends custom instances to the internal list,
760  // so they would be preferred initially.
761  // But the code also has a look-up cache which does not get updated on new instances,
762  // so if somewhere a US-ASCII codec was requested by some other code before
763  // our KUsAsciiTextCodec instance gets created, the Qt-built-in will be always
764  // picked instead, at least for the used name.
765  // So we cannot rely on the internal mechanisms, but have to prefer our codec ourselves.
766  if (isUsAsciiTextCodecRequest(n)) {
767  codec = usAsciiTextCodec;
768  } else {
769  // call directly QTextCodec::codecForName.
770  // We assume that QTextCodec is smarter and more maintained than this code.
771  codec = QTextCodec::codecForName(n);
772  }
773 
774  if (codec) {
775  codecForNameDict.insert(n, codec);
776  return codec;
777  }
778 
779  // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it.
780 
781  QByteArray name = n.toLower();
782  bool changed = false;
783  if (name.endsWith("_charset")) { // krazy:exclude=strings
784  name.chop(8);
785  changed = true;
786  }
787  if (name.startsWith("x-")) { // krazy:exclude=strings
788  name.remove(0, 2); // remove x- at start
789  changed = true;
790  }
791 
792  if (name.isEmpty()) {
793  // We have no name anymore, therefore the name is invalid.
794  return nullptr;
795  }
796 
797  // We only need to check changed names.
798  if (changed) {
799  codec = QTextCodec::codecForName(name);
800  if (codec) {
801  codecForNameDict.insert(n, codec);
802  return codec;
803  }
804  }
805 
806  // these codecs are built into Qt, but the name given for the codec is different,
807  // so QTextCodec did not recognize it.
808  QByteArray cname = kcharsets_array_search(builtin_string, builtin_indices, name.data());
809 
810  if (!cname.isEmpty()) {
811  codec = QTextCodec::codecForName(cname);
812  }
813 
814  if (codec) {
815  codecForNameDict.insert(n, codec);
816  return codec;
817  }
818 
819  // this also failed, the last resort is now to take some compatibility charmap
820  // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write.
821  cname = kcharsets_array_search(conversion_hints_string, conversion_hints_indices, name.data());
822 
823  if (!cname.isEmpty()) {
824  codec = QTextCodec::codecForName(cname);
825  if (codec) {
826  codecForNameDict.insert(n, codec);
827  return codec;
828  }
829  }
830 
831  // we could not assign a codec, therefore return NULL
832  return nullptr;
833 }
834 
836 {
837  return &globalCharsets()->instance;
838 }
void append(const T &value)
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QByteArray toLower() const const
QString encodingForName(const QString &descriptiveName) const
Returns the encoding for a string obtained with descriptiveEncodingNames().
Definition: kcharsets.cpp:628
QString fromUtf8(const char *str, int size)
CaseInsensitive
KCharsets()
Protected constructor.
Definition: kcharsets.cpp:485
QString trimmed() const const
Q_SCRIPTABLE Q_NOREPLY void start()
static QString toEntity(const QChar &ch)
Converts a QChar to an entity.
Definition: kcharsets.cpp:554
void chop(int n)
QByteArray toLatin1() const const
QList::const_iterator constBegin() const const
QList< QStringList > encodingsByScript() const
Lists the available encoding names grouped by script (or language that uses them).
Definition: kcharsets.cpp:659
const QChar * unicode() const const
QTextCodec * codecForName(const QString &name) const
Provided for compatibility.
Definition: kcharsets.cpp:684
int lastIndexOf(QChar ch, int from, Qt::CaseSensitivity cs) const const
Q_GLOBAL_STATIC(Internal::StaticControl, s_instance) class ControlPrivate
QTextCodec * codecForLocale()
virtual ~KCharsets()
Destructor.
bool isEmpty() const const
QByteArray toUtf8() const const
int length() const const
QTextCodec * codecForName(const QByteArray &name)
int toInt(bool *ok, int base) const const
static QString resolveEntities(const QString &text)
Scans the given string for entities (like &amp;) and resolves them using fromEntity.
Definition: kcharsets.cpp:559
QStringList availableEncodingNames() const
Lists all available encodings as names.
Definition: kcharsets.cpp:608
QString & remove(int position, int n)
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
LocaleWrapper locale()
QString descriptionForEncoding(const QString &encoding) const
Returns a long description for an encoding name.
Definition: kcharsets.cpp:618
bool isEmpty() const const
QList::const_iterator constEnd() const const
static KCharsets * charsets()
The global charset manager.
Definition: kcharsets.cpp:835
QString left(int n) const const
QString name(StandardShortcut id)
QChar * data()
int length() const const
static QChar fromEntity(const QString &str)
Converts an entity to a character.
Definition: kcharsets.cpp:492
QString asprintf(const char *cformat,...)
QString mid(int position, int n) const const
QStringList descriptiveEncodingNames() const
Lists the available encoding names together with a more descriptive language.
Definition: kcharsets.cpp:647
bool isNull() const const
char * data()
ushort unicode() const const
void sort(Qt::CaseSensitivity cs)
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Tue Feb 7 2023 04:06:47 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.