KCodecs

kcharsets.cpp
1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
5 SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
6 SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net>
7
8 SPDX-License-Identifier: LGPL-2.0-or-later
9*/
10#include "kcharsets.h"
11#include "kcharsets_p.h"
12#include "kcodecs_debug.h"
13
14#include <kentities.h>
15
16#include <QHash>
17
18#include <algorithm>
19#include <assert.h>
20
21/*
22 * The encoding names (like "ISO 8859-1") in this list are user-visible,
23 * and should be mostly uppercase.
24 * Generate with generate_string_table.pl (located in kde-dev-scripts),
25 * input data:
26ISO 8859-1
27i18n:Western European
28ISO 8859-15
29i18n:Western European
30ISO 8859-14
31i18n:Western European
32cp 1252
33i18n:Western European
34IBM850
35i18n:Western European
36ISO 8859-2
37i18n:Central European
38ISO 8859-3
39i18n:Central European
40ISO 8859-4
41i18n:Baltic
42ISO 8859-13
43i18n:Baltic
44ISO 8859-16
45i18n:South-Eastern Europe
46cp 1250
47i18n:Central European
48cp 1254
49i18n:Turkish
50cp 1257
51i18n:Baltic
52KOI8-R
53i18n:Cyrillic
54ISO 8859-5
55i18n:Cyrillic
56cp 1251
57i18n:Cyrillic
58KOI8-U
59i18n:Cyrillic
60IBM866
61i18n:Cyrillic
62Big5
63i18n:Chinese Traditional
64Big5-HKSCS
65i18n:Chinese Traditional
66GB18030
67i18n:Chinese Simplified
68GBK
69i18n:Chinese Simplified
70GB2312
71i18n:Chinese Simplified
72EUC-KR
73i18n:Korean
74windows-949
75i18n:Korean
76sjis
77i18n:Japanese
78ISO-2022-JP
79i18n:Japanese
80EUC-JP
81i18n:Japanese
82ISO 8859-7
83i18n:Greek
84cp 1253
85i18n:Greek
86ISO 8859-6
87i18n:Arabic
88cp 1256
89i18n:Arabic
90ISO 8859-8
91i18n:Hebrew
92ISO 8859-8-I
93i18n:Hebrew
94cp 1255
95i18n:Hebrew
96ISO 8859-9
97i18n:Turkish
98TIS620
99i18n:Thai
100ISO 8859-11
101i18n:Thai
102UTF-8
103i18n:Unicode
104UTF-16
105i18n:Unicode
106utf7
107i18n:Unicode
108ucs2
109i18n:Unicode
110ISO 10646-UCS-2
111i18n:Unicode
112windows-1258
113i18n:Other
114IBM874
115i18n:Other
116TSCII
117i18n:Other
118 */
/*
 * Notes about the table:
 *
 * - The following entries were disabled and removed from the table:
ibm852
i18n:Central European
pt 154
i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt
 *
 * - ISO 8859-11 is the deprecated name of TIS-620
 * - utf7 is not in Qt
 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
 * - windows-1258: TODO
 * - IBM874: TODO
 * - TSCII: TODO
 */
135
/*
 * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt (which marks
 * statically initialised text for translation) so that it expands to just
 * the string that should be translated. That makes it usable inside the
 * single concatenated string literal below.
 */
#undef QT_TRANSLATE_NOOP3
#define QT_TRANSLATE_NOOP3(a, b, c) b

/*
 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
 * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
 * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP.
 *
 * Layout: one NUL-separated pool holding every encoding name and every
 * language description exactly once. Entries are addressed by byte offset
 * through language_for_encoding_indices.
 */

static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0"
    "cp 1250\0"
    "cp 1254\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0"
    "windows-949\0"
    "sjis\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0"
    "ISO-2022-JP\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item Text character set")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "windows-1258\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";
215
// Byte offsets into language_for_encoding_string, stored as consecutive pairs:
// [2*k] is the offset of an encoding name, [2*k + 1] the offset of the language
// description for that encoding. The list is terminated by a single -1.
static const int language_for_encoding_indices[] = {
    0,   11,  28,  11,  40,  11,  52,  11,  60,  11,  67,  78,  95,  78,  106, 117, 124, 117, 136, 148, 169, 78,  177, 185, 193, 117, 201, 208, 217, 208, 228,
    208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419,
    426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1};
220
221/*
222 * GENERATED CODE ENDS HERE
223 */
224
225struct KCharsetsSingletonPrivate {
226 KCharsets instance;
227};
228
229Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets)
230
231// search an array of items index/data, find first matching index
232// and return data, or return 0
233static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
234{
235 for (int i = 0; indices[i] != -1; i += 2) {
236 if (qstrcmp(start + indices[i], entry) == 0) {
237 return start + indices[i + 1];
238 }
239 }
240 return nullptr;
241}
242
243// --------------------------------------------------------------------------
244
246 : d(new KCharsetsPrivate)
247{
248}
249
250KCharsets::~KCharsets() = default;
251
253{
254 QChar res = QChar::Null;
255
256 if (str.isEmpty()) {
257 return QChar::Null;
258 }
259
260 int pos = 0;
261 if (str[pos] == QLatin1Char('&')) {
262 pos++;
263 }
264
265 // Check for '&#000' or '&#x0000' sequence
266 if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) {
267 bool ok;
268 pos++;
269 if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
270 pos++;
271 // '&#x0000', hexadecimal character reference
272 const auto tmp = str.mid(pos);
273 res = QChar(tmp.toInt(&ok, 16));
274 } else {
275 // '&#0000', decimal character reference
276 const auto tmp = str.mid(pos);
277 res = QChar(tmp.toInt(&ok, 10));
278 }
279 if (ok) {
280 return res;
281 } else {
282 return QChar::Null;
283 }
284 }
285
286 const QByteArray raw(str.toLatin1());
287 const entity *e = KCodecsEntities::kde_findEntity(raw.data(), raw.length());
288
289 if (!e) {
290 // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length();
291 return QChar::Null;
292 }
293 // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code;
294
295 return QChar(e->code);
296}
297
299{
300 // entities are never longer than 8 chars... we start from
301 // that length and work backwards...
302 len = 8;
303 while (len > 0) {
304 const auto tmp = str.left(len);
305 QChar res = fromEntity(tmp);
306 if (res != QChar::Null) {
307 return res;
308 }
309 len--;
310 }
311 return QChar::Null;
312}
313
315{
316 return QString::asprintf("&#0x%x;", ch.unicode());
317}
318
320{
321 QString text = input;
322 const QChar *p = text.unicode();
323 const QChar *end = p + text.length();
324 const QChar *ampersand = nullptr;
325 bool scanForSemicolon = false;
326
327 for (; p < end; ++p) {
328 const QChar ch = *p;
329
330 if (ch == QLatin1Char('&')) {
331 ampersand = p;
332 scanForSemicolon = true;
333 continue;
334 }
335
336 if (ch != QLatin1Char(';') || scanForSemicolon == false) {
337 continue;
338 }
339
340 assert(ampersand);
341
342 scanForSemicolon = false;
343
344 const QChar *entityBegin = ampersand + 1;
345
346 const uint entityLength = p - entityBegin;
347 if (entityLength == 0) {
348 continue;
349 }
350
351 const QChar entityValue = KCharsets::fromEntity(QStringView(entityBegin, entityLength));
352 if (entityValue.isNull()) {
353 continue;
354 }
355
356 const uint ampersandPos = ampersand - text.unicode();
357
358 text[(int)ampersandPos] = entityValue;
359 text.remove(ampersandPos + 1, entityLength + 1);
360 p = text.unicode() + ampersandPos;
361 end = text.unicode() + text.length();
362 ampersand = nullptr;
363 }
364
365 return text;
366}
367
369{
370 QStringList available;
371 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
372 available.append(QString::fromUtf8(language_for_encoding_string + *p));
373 }
374 available.sort();
375 return available;
376}
377
379{
380 const char *lang = kcharsets_array_search(language_for_encoding_string, language_for_encoding_indices, encoding.toUtf8().data());
381 if (lang) {
382 return tr("%1 ( %2 )", "@item %1 character set, %2 encoding").arg(tr(lang, "@item Text character set"), encoding);
383 } else {
384 return tr("Other encoding (%1)", "@item").arg(encoding);
385 }
386}
387
388QString KCharsets::encodingForName(const QString &descriptiveName) const
389{
390 const int left = descriptiveName.lastIndexOf(QLatin1Char('('));
391
392 if (left < 0) { // No parenthesis, so assume it is a normal encoding name
393 return descriptiveName.trimmed();
394 }
395
396 QString name(descriptiveName.mid(left + 1));
397
398 const int right = name.lastIndexOf(QLatin1Char(')'));
399
400 if (right < 0) {
401 return name;
402 }
403
404 return name.left(right).trimmed();
405}
406
408{
409 QStringList encodings;
410 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
411 const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
412 const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
413 encodings.append(tr("%1 ( %2 )", "@item Text encoding: %1 character set, %2 encoding").arg(description, name));
414 }
415 encodings.sort();
416 return encodings;
417}
418
420{
421 if (!d->encodingsByScript.isEmpty()) {
422 return d->encodingsByScript;
423 }
424 int i;
425 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
426 const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
427 const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
428
429 for (i = 0; i < d->encodingsByScript.size(); ++i) {
430 if (d->encodingsByScript.at(i).at(0) == description) {
431 d->encodingsByScript[i].append(name);
432 break;
433 }
434 }
435
436 if (i == d->encodingsByScript.size()) {
437 d->encodingsByScript.append(QStringList() << description << name);
438 }
439 }
440 return d->encodingsByScript;
441}
442
444{
445 return &globalCharsets()->instance;
446}
Charset font and encoder/decoder handling.
Definition kcharsets.h:34
QStringList descriptiveEncodingNames() const
Lists the available encoding names together with a more descriptive language.
QList< QStringList > encodingsByScript() const
Lists the available encoding names grouped by script (or language that uses them).
~KCharsets()
Destructor.
QString encodingForName(const QString &descriptiveName) const
Returns the encoding for a string obtained with descriptiveEncodingNames().
QString descriptionForEncoding(QStringView encoding) const
Returns a long description for an encoding name.
KCharsets()
Protected constructor.
static QChar fromEntity(QStringView str)
Converts an entity to a character.
QStringList availableEncodingNames() const
Lists all available encodings as names.
static QString resolveEntities(const QString &text)
Scans the given string for entities (like &amp;) and resolves them using fromEntity.
static QString toEntity(const QChar &ch)
Converts a QChar to an entity.
static KCharsets * charsets()
The global charset manager.
Q_SCRIPTABLE Q_NOREPLY void start()
char * data()
qsizetype length() const const
bool isNull() const const
char16_t & unicode()
void append(QList< T > &&value)
QString asprintf(const char *cformat,...)
QString fromUtf8(QByteArrayView str)
qsizetype lastIndexOf(QChar ch, Qt::CaseSensitivity cs) const const
QString left(qsizetype n) const const
qsizetype length() const const
QString mid(qsizetype position, qsizetype n) const const
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QString trimmed() const const
const QChar * unicode() const const
void sort(Qt::CaseSensitivity cs)
QStringView left(qsizetype length) const const
QStringView mid(qsizetype start, qsizetype length) const const
bool isEmpty() const const
qsizetype length() const const
QByteArray toLatin1() const const
QByteArray toUtf8() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri Dec 13 2024 11:48:48 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.