KCodecs

kcharsets.cpp
1 /*
2  This file is part of the KDE libraries
3 
4  SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
5  SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
6  SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net>
7 
8  SPDX-License-Identifier: LGPL-2.0-or-later
9 */
10 #include "kcharsets.h"
11 #include "kcharsets_p.h"
12 #include "kcodecs_debug.h"
13 
14 #include <kentities.h>
15 
16 #include <QHash>
17 
18 #include <algorithm>
19 #include <assert.h>
20 
21 /*
22  * The encoding names (like "ISO 8859-1") in this list are user-visible,
23  * and should be mostly uppercase.
24  * Generate with generate_string_table.pl (located in kde-dev-scripts),
25  * input data:
26 ISO 8859-1
27 i18n:Western European
28 ISO 8859-15
29 i18n:Western European
30 ISO 8859-14
31 i18n:Western European
32 cp 1252
33 i18n:Western European
34 IBM850
35 i18n:Western European
36 ISO 8859-2
37 i18n:Central European
38 ISO 8859-3
39 i18n:Central European
40 ISO 8859-4
41 i18n:Baltic
42 ISO 8859-13
43 i18n:Baltic
44 ISO 8859-16
45 i18n:South-Eastern Europe
46 cp 1250
47 i18n:Central European
48 cp 1254
49 i18n:Turkish
50 cp 1257
51 i18n:Baltic
52 KOI8-R
53 i18n:Cyrillic
54 ISO 8859-5
55 i18n:Cyrillic
56 cp 1251
57 i18n:Cyrillic
58 KOI8-U
59 i18n:Cyrillic
60 IBM866
61 i18n:Cyrillic
62 Big5
63 i18n:Chinese Traditional
64 Big5-HKSCS
65 i18n:Chinese Traditional
66 GB18030
67 i18n:Chinese Simplified
68 GBK
69 i18n:Chinese Simplified
70 GB2312
71 i18n:Chinese Simplified
72 EUC-KR
73 i18n:Korean
74 windows-949
75 i18n:Korean
76 sjis
77 i18n:Japanese
78 ISO-2022-JP
79 i18n:Japanese
80 EUC-JP
81 i18n:Japanese
82 ISO 8859-7
83 i18n:Greek
84 cp 1253
85 i18n:Greek
86 ISO 8859-6
87 i18n:Arabic
88 cp 1256
89 i18n:Arabic
90 ISO 8859-8
91 i18n:Hebrew
92 ISO 8859-8-I
93 i18n:Hebrew
94 cp 1255
95 i18n:Hebrew
96 ISO 8859-9
97 i18n:Turkish
98 TIS620
99 i18n:Thai
100 ISO 8859-11
101 i18n:Thai
102 UTF-8
103 i18n:Unicode
104 UTF-16
105 i18n:Unicode
106 utf7
107 i18n:Unicode
108 ucs2
109 i18n:Unicode
110 ISO 10646-UCS-2
111 i18n:Unicode
112 windows-1258
113 i18n:Other
114 IBM874
115 i18n:Other
116 TSCII
117 i18n:Other
118  */
119 /*
120  * Notes about the table:
121  *
122  * - The following entries were disabled and removed from the table:
123 ibm852
124 i18n:Central European
125 pt 154
126 i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt
127  *
128  * - ISO 8859-11 is the deprecated name of TIS-620
129  * - utf7 is not in Qt
130  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
131  * - windows-1258: TODO
132  * - IBM874: TODO
133  * - TSCII: TODO
134  */
135 
136 /*
137  * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that
138  * statically initialised text should be translated so that it expands to just
139  * the string that should be translated, making it possible to use it in the
140  * single string construct below.
141  */
142 #undef QT_TRANSLATE_NOOP3
143 #define QT_TRANSLATE_NOOP3(a, b, c) b
144 
145 /*
146  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
147  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
148  * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP.
149  */
150 
151 static const char language_for_encoding_string[] =
152  "ISO 8859-1\0"
153  QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0"
154  "ISO 8859-15\0"
155  "ISO 8859-14\0"
156  "cp 1252\0"
157  "IBM850\0"
158  "ISO 8859-2\0"
159  QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0"
160  "ISO 8859-3\0"
161  "ISO 8859-4\0"
162  QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0"
163  "ISO 8859-13\0"
164  "ISO 8859-16\0"
165  QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0"
166  "cp 1250\0"
167  "cp 1254\0"
168  QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0"
169  "cp 1257\0"
170  "KOI8-R\0"
171  QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0"
172  "ISO 8859-5\0"
173  "cp 1251\0"
174  "KOI8-U\0"
175  "IBM866\0"
176  "Big5\0"
177  QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0"
178  "Big5-HKSCS\0"
179  "GB18030\0"
180  QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0"
181  "GBK\0"
182  "GB2312\0"
183  "EUC-KR\0"
184  QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0"
185  "windows-949\0"
186  "sjis\0"
187  QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0"
188  "ISO-2022-JP\0"
189  "EUC-JP\0"
190  "ISO 8859-7\0"
191  QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0"
192  "cp 1253\0"
193  "ISO 8859-6\0"
194  QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0"
195  "cp 1256\0"
196  "ISO 8859-8\0"
197  QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0"
198  "ISO 8859-8-I\0"
199  "cp 1255\0"
200  "ISO 8859-9\0"
201  "TIS620\0"
202  QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0"
203  "ISO 8859-11\0"
204  "UTF-8\0"
205  QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item Text character set")"\0"
206  "UTF-16\0"
207  "utf7\0"
208  "ucs2\0"
209  "ISO 10646-UCS-2\0"
210  "windows-1258\0"
211  QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0"
212  "IBM874\0"
213  "TSCII\0"
214  "\0";
215 
216 static const int language_for_encoding_indices[] = {
217  0, 11, 28, 11, 40, 11, 52, 11, 60, 11, 67, 78, 95, 78, 106, 117, 124, 117, 136, 148, 169, 78, 177, 185, 193, 117, 201, 208, 217, 208, 228,
218  208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419,
219  426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1};
220 
221 /*
222  * GENERATED CODE ENDS HERE
223  */
224 
225 /*
226  * defines some different names for codecs that are built into Qt.
227  * The names in this list must be lower-case.
228  * input data for generate_string_table.pl:
229 iso-ir-111
230 koi8-r
231 koi unified
232 koi8-r
233 us-ascii
234 iso 8859-1
235 usascii
236 iso 8859-1
237 ascii
238 iso 8859-1
239 unicode-1-1-utf-7
240 utf-7
241 ucs2
242 iso-10646-ucs-2
243 iso10646-1
244 iso-10646-ucs-2
245 gb18030.2000-1
246 gb18030
247 gb18030.2000-0
248 gb18030
249 gbk-0
250 gbk
251 gb2312
252 gbk
253 gb2312.1980-0
254 gbk
255 big5-0
256 big5
257 euc-kr
258 euckr
259 cp 949
260 windows-949
261 euc-jp
262 eucjp
263 jisx0201.1976-0
264 eucjp
265 jisx0208.1983-0
266 eucjp
267 jisx0208.1990-0
268 eucjp
269 jisx0208.1997-0
270 eucjp
271 jisx0212.1990-0
272 eucjp
273 jisx0213.2000-1
274 eucjp
275 jisx0213.2000-2
276 eucjp
277 shift_jis
278 sjis
279 shift-jis
280 sjis
281 sjis
282 sjis
283 iso-2022-jp
284 jis7
285 windows850
286 ibm850
287 windows866
288 ibm866
289 windows-850
290 ibm850
291 windows-866
292 ibm866
293 cp-10000
294 apple roman
295 thai-tis620
296 iso 8859-11
297 windows-874
298 ibm874
299 windows874
300 ibm874
301 cp-874
302 ibm874
303 ksc5601.1987-0
304 euckr
305 ks_c_5601-1987
306 euckr
307 mac-roman
308 apple roman
309 macintosh
310 apple roman
311 mac
312 apple roman
313 csiso2022jp
314 iso-2022-jp
315 */
316 /*
317  * Notes about the table:
318  * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
319  * - utf7 is not in Qt
320  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
321  * - sjis: appears on the table for x-sjis
322  * - jis7: ISO-2022-JP is now the default name in Qt4
323  * - cp-874: is it really needed?
324  * - mac-roman: appears on the table for x-mac-roman
325  * - csiso2022jp: See bug #77243
326  */
327 
328 /*
329  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
330  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
331  */
332 
333 static const char builtin_string[] =
334  "iso-ir-111\0"
335  "koi8-r\0"
336  "koi unified\0"
337  "us-ascii\0"
338  "iso 8859-1\0"
339  "usascii\0"
340  "ascii\0"
341  "unicode-1-1-utf-7\0"
342  "utf-7\0"
343  "ucs2\0"
344  "iso-10646-ucs-2\0"
345  "iso10646-1\0"
346  "gb18030.2000-1\0"
347  "gb18030\0"
348  "gb18030.2000-0\0"
349  "gbk-0\0"
350  "gbk\0"
351  "gb2312\0"
352  "gb2312.1980-0\0"
353  "big5-0\0"
354  "big5\0"
355  "euc-kr\0"
356  "euckr\0"
357  "cp 949\0"
358  "windows-949\0"
359  "euc-jp\0"
360  "eucjp\0"
361  "jisx0201.1976-0\0"
362  "jisx0208.1983-0\0"
363  "jisx0208.1990-0\0"
364  "jisx0208.1997-0\0"
365  "jisx0212.1990-0\0"
366  "jisx0213.2000-1\0"
367  "jisx0213.2000-2\0"
368  "shift_jis\0"
369  "sjis\0"
370  "shift-jis\0"
371  "iso-2022-jp\0"
372  "jis7\0"
373  "windows850\0"
374  "ibm850\0"
375  "windows866\0"
376  "ibm866\0"
377  "windows-850\0"
378  "windows-866\0"
379  "cp-10000\0"
380  "apple roman\0"
381  "thai-tis620\0"
382  "iso 8859-11\0"
383  "windows-874\0"
384  "ibm874\0"
385  "windows874\0"
386  "cp-874\0"
387  "ksc5601.1987-0\0"
388  "ks_c_5601-1987\0"
389  "mac-roman\0"
390  "macintosh\0"
391  "mac\0"
392  "csiso2022jp\0"
393  "\0";
394 
395 static const int builtin_indices[] = {0, 11, 18, 11, 30, 39, 50, 39, 58, 39, 64, 82, 88, 93, 109, 93, 120, 135, 143, 135, 158, 164,
396  168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240,
397  326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469,
398  481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1};
399 
400 /*
401  * GENERATED CODE ENDS HERE
402  */
403 
404 /*
405  * some last resort hints in case the charmap file couldn't be found.
406  * This gives at least a partial conversion and helps make things readable.
407  *
408  * the name used as input here is already converted to the more canonical
409  * name as defined in the aliases array.
410  *
411  * Input data:
412 cp1250
413 iso-8859-2
414 koi8-r
415 iso-8859-5
416 koi8-u
417 koi8-r
418 pt 154
419 windows-1251
420 paratype-154
421 windows-1251
422 pt-154
423 windows-1251
424  */
425 /* Notes:
426  * - KDE has always used "CP 1251" as the best fallback for PT 154. As Qt no longer offers the PT 154 encoding, code page 1251 is used as the fallback.
427  */
428 
429 /*
430  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
431  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
432  */
433 
434 static const char conversion_hints_string[] =
435  "cp1250\0"
436  "iso-8859-2\0"
437  "koi8-r\0"
438  "iso-8859-5\0"
439  "koi8-u\0"
440  "pt 154\0"
441  "windows-1251\0"
442  "paratype-154\0"
443  "pt-154\0"
444  "\0";
445 
446 static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1};
447 
448 /*
449  * GENERATED CODE ENDS HERE
450  */
451 
// Holder for the one shared KCharsets object; KCharsets::charsets() returns
// the address of its `instance` member.
452 struct KCharsetsSingletonPrivate {
453  KCharsets instance;
454 };
455 
// Declares the globalCharsets() accessor that KCharsets::charsets() goes through.
456 Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets)
457 
458 // search an array of items index/data, find first matching index
459 // and return data, or return 0
460 static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
461 {
462  for (int i = 0; indices[i] != -1; i += 2) {
463  if (qstrcmp(start + indices[i], entry) == 0) {
464  return start + indices[i + 1];
465  }
466  }
467  return nullptr;
468 }
469 
470 // --------------------------------------------------------------------------
471 
473  : d(new KCharsetsPrivate)
474 {
475 }
476 
477 KCharsets::~KCharsets() = default;
478 
480 {
481  QChar res = QChar::Null;
482 
483  if (str.isEmpty()) {
484  return QChar::Null;
485  }
486 
487  int pos = 0;
488  if (str[pos] == QLatin1Char('&')) {
489  pos++;
490  }
491 
492  // Check for '&#000' or '&#x0000' sequence
493  if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) {
494  bool ok;
495  pos++;
496  if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
497  pos++;
498  // '&#x0000', hexadecimal character reference
499  const auto tmp = str.mid(pos);
500  res = QChar(tmp.toInt(&ok, 16));
501  } else {
502  // '&#0000', decimal character reference
503  const auto tmp = str.mid(pos);
504  res = QChar(tmp.toInt(&ok, 10));
505  }
506  if (ok) {
507  return res;
508  } else {
509  return QChar::Null;
510  }
511  }
512 
513  const QByteArray raw(str.toLatin1());
514  const entity *e = KCodecsEntities::kde_findEntity(raw.data(), raw.length());
515 
516  if (!e) {
517  // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length();
518  return QChar::Null;
519  }
520  // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code;
521 
522  return QChar(e->code);
523 }
524 
526 {
527  // entities are never longer than 8 chars... we start from
528  // that length and work backwards...
529  len = 8;
530  while (len > 0) {
531  const auto tmp = str.left(len);
532  QChar res = fromEntity(tmp);
533  if (res != QChar::Null) {
534  return res;
535  }
536  len--;
537  }
538  return QChar::Null;
539 }
540 
542 {
543  return QString::asprintf("&#0x%x;", ch.unicode());
544 }
545 
547 {
548  QString text = input;
549  const QChar *p = text.unicode();
550  const QChar *end = p + text.length();
551  const QChar *ampersand = nullptr;
552  bool scanForSemicolon = false;
553 
554  for (; p < end; ++p) {
555  const QChar ch = *p;
556 
557  if (ch == QLatin1Char('&')) {
558  ampersand = p;
559  scanForSemicolon = true;
560  continue;
561  }
562 
563  if (ch != QLatin1Char(';') || scanForSemicolon == false) {
564  continue;
565  }
566 
567  assert(ampersand);
568 
569  scanForSemicolon = false;
570 
571  const QChar *entityBegin = ampersand + 1;
572 
573  const uint entityLength = p - entityBegin;
574  if (entityLength == 0) {
575  continue;
576  }
577 
578  const QChar entityValue = KCharsets::fromEntity(QStringView(entityBegin, entityLength));
579  if (entityValue.isNull()) {
580  continue;
581  }
582 
583  const uint ampersandPos = ampersand - text.unicode();
584 
585  text[(int)ampersandPos] = entityValue;
586  text.remove(ampersandPos + 1, entityLength + 1);
587  p = text.unicode() + ampersandPos;
588  end = text.unicode() + text.length();
589  ampersand = nullptr;
590  }
591 
592  return text;
593 }
594 
596 {
597  QStringList available;
598  for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
599  available.append(QString::fromUtf8(language_for_encoding_string + *p));
600  }
601  available.sort();
602  return available;
603 }
604 
606 {
607  const char *lang = kcharsets_array_search(language_for_encoding_string, language_for_encoding_indices, encoding.toUtf8().data());
608  if (lang) {
609  return tr("%1 ( %2 )", "@item %1 character set, %2 encoding").arg(tr(lang, "@item Text character set"), encoding);
610  } else {
611  return tr("Other encoding (%1)", "@item").arg(encoding);
612  }
613 }
614 
615 QString KCharsets::encodingForName(const QString &descriptiveName) const
616 {
617  const int left = descriptiveName.lastIndexOf(QLatin1Char('('));
618 
619  if (left < 0) { // No parenthesis, so assume it is a normal encoding name
620  return descriptiveName.trimmed();
621  }
622 
623  QString name(descriptiveName.mid(left + 1));
624 
625  const int right = name.lastIndexOf(QLatin1Char(')'));
626 
627  if (right < 0) {
628  return name;
629  }
630 
631  return name.left(right).trimmed();
632 }
633 
635 {
636  QStringList encodings;
637  for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
638  const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
639  const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
640  encodings.append(tr("%1 ( %2 )", "@item Text encoding: %1 character set, %2 encoding").arg(description, name));
641  }
642  encodings.sort();
643  return encodings;
644 }
645 
647 {
648  if (!d->encodingsByScript.isEmpty()) {
649  return d->encodingsByScript;
650  }
651  int i;
652  for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
653  const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
654  const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
655 
656  for (i = 0; i < d->encodingsByScript.size(); ++i) {
657  if (d->encodingsByScript.at(i).at(0) == description) {
658  d->encodingsByScript[i].append(name);
659  break;
660  }
661  }
662 
663  if (i == d->encodingsByScript.size()) {
664  d->encodingsByScript.append(QStringList() << description << name);
665  }
666  }
667  return d->encodingsByScript;
668 }
669 
671 {
672  return &globalCharsets()->instance;
673 }
void append(const T &value)
QString encodingForName(const QString &descriptiveName) const
Returns the encoding for a string obtained with descriptiveEncodingNames().
Definition: kcharsets.cpp:615
QByteArray toLatin1() const const
QString fromUtf8(const char *str, int size)
KCharsets()
Protected constructor.
Definition: kcharsets.cpp:472
QString trimmed() const const
QStringView mid(qsizetype start) const const
Q_SCRIPTABLE Q_NOREPLY void start()
static QString toEntity(const QChar &ch)
Converts a QChar to an entity.
Definition: kcharsets.cpp:541
QList< QStringList > encodingsByScript() const
Lists the available encoding names grouped by script (or language that uses them).
Definition: kcharsets.cpp:646
const QChar * unicode() const const
int lastIndexOf(QChar ch, int from, Qt::CaseSensitivity cs) const const
QStringView left(qsizetype length) const const
QByteArray toUtf8() const const
Q_GLOBAL_STATIC(Internal::StaticControl, s_instance) class ControlPrivate
static QChar fromEntity(QStringView str)
Converts an entity to a character.
Definition: kcharsets.cpp:479
~KCharsets()
Destructor.
int length() const const
static QString resolveEntities(const QString &text)
Scans the given string for entities (like &amp;) and resolves them using fromEntity.
Definition: kcharsets.cpp:546
QStringList availableEncodingNames() const
Lists all available encodings as names.
Definition: kcharsets.cpp:595
QString & remove(int position, int n)
bool isEmpty() const const
static KCharsets * charsets()
The global charset manager.
Definition: kcharsets.cpp:670
QString left(int n) const const
int length() const const
int length() const const
QString asprintf(const char *cformat,...)
QString mid(int position, int n) const const
QStringList descriptiveEncodingNames() const
Lists the available encoding names together with a more descriptive language.
Definition: kcharsets.cpp:634
bool isNull() const const
char * data()
ushort unicode() const const
QString descriptionForEncoding(QStringView encoding) const
Returns a long description for an encoding name.
Definition: kcharsets.cpp:605
void sort(Qt::CaseSensitivity cs)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Thu Feb 15 2024 04:06:58 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.