KWidgetsAddons

kcharselectdata.cpp
1/*
2 This file is part of the KDE libraries
3 SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
4
5 SPDX-License-Identifier: LGPL-2.0-or-later
6*/
7
8#include "kcharselectdata_p.h"
9
10#include <QCoreApplication>
11#include <QFile>
12#include <QFutureInterface>
13#include <QRegularExpression>
14#include <QRunnable>
15#include <QStringList>
16#include <QThreadPool>
17#include <qendian.h>
18
19#include <../test-config.h>
20#include <qstandardpaths.h>
21#include <string.h>
22
23/* constants for hangul (de)composition, see UAX #15 */
24#define SBase 0xAC00 // subtracted from a code point to obtain its syllable index
25#define LBase 0x1100 // added to the leading-consonant (Choseong) index
26#define VBase 0x1161 // added to the vowel (Jungseong) index
27#define TBase 0x11A7 // added to the trailing-consonant (Jongsung) index; TBase itself means "no trailing jamo"
28#define LCount 19 // number of leading consonants (rows of JAMO_L_TABLE)
29#define VCount 21 // number of vowels (rows of JAMO_V_TABLE)
30#define TCount 28 // number of trailing consonants incl. the empty one (rows of JAMO_T_TABLE)
31#define NCount (VCount * TCount) // syllables sharing one leading consonant
32#define SCount (LCount * NCount) // total number of syllables handled by the index arithmetic
33
34class RunIndexCreation : public QFutureInterface<Index>, public QRunnable
35{
36public:
37 RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile)
38 : m_data(data)
39 , m_dataFile(dataFile)
40 {
41 }
42
43 QFuture<Index> start()
44 {
45 setRunnable(this);
46 reportStarted();
47 QFuture<Index> f = this->future();
49 return f;
50 }
51
52 void run() override
53 {
54 Index index = m_data->createIndex(m_dataFile);
55 reportResult(index);
56 reportFinished(nullptr);
57 }
58
59private:
60 KCharSelectData *const m_data;
61 const QByteArray m_dataFile;
62};
63
// Short romanised jamo names used to assemble Hangul syllable names.
// Every row holds at most three letters plus the terminating NUL.
// clang-format off

// Leading consonants, indexed by SIndex / NCount.
static const char JAMO_L_TABLE[][4] = {
    "G", "GG", "N", "D", "DD",
    "R", "M", "B", "BB", "S",
    "SS", "", "J", "JJ", "C",
    "K", "T", "P", "H"
};

// Vowels, indexed by (SIndex % NCount) / TCount.
static const char JAMO_V_TABLE[][4] = {
    "A", "AE", "YA", "YAE", "EO", "E", "YEO",
    "YE", "O", "WA", "WAE", "OE", "YO", "U",
    "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};

// Trailing consonants, indexed by SIndex % TCount; row 0 is "no trailing consonant".
static const char JAMO_T_TABLE[][4] = {
    "", "G", "GG", "GS", "N", "NJ", "NH",
    "D", "L", "LG", "LM", "LB", "LS", "LT",
    "LP", "LH", "M", "B", "BS", "S", "SS",
    "NG", "J", "C", "K", "T", "P", "H"
};
// clang-format on
82
83bool KCharSelectData::openDataFile()
84{
85 if (!dataFile.isEmpty()) {
86 return true;
87 } else {
88 QFile file(QStringLiteral(":/kf6/kcharselect/kcharselect-data"));
89 file.open(QIODevice::ReadOnly);
90 dataFile = file.readAll();
91 file.close();
92 if (dataFile.size() < 40) {
93 dataFile.clear();
94 return false;
95 }
96 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
97 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
98 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
99 uint blocks = (offsetEnd - offsetBegin) / 4;
100 if (blocks <= 167) { // maximum possible number of blocks in BMP
101 // no remapping
102 remapType = -1;
103 } else if (blocks >= 174 && blocks <= 180) {
104 // remapping introduced in 5.25
105 remapType = 0;
106 } else {
107 // unknown remapping, abort
108 dataFile.clear();
109 return false;
110 }
111 futureIndex = (new RunIndexCreation(this, dataFile))->start();
112 return true;
113 }
114}
115
116// Temporary remapping code points <-> 16 bit database codes
117// See kcharselect-generate-datafile.py for details
118
119quint16 KCharSelectData::mapCodePointToDataBase(uint code) const
120{
121 if (remapType == 0) {
122 if (code >= 0xE000 && code <= 0xEFFF) {
123 return 0xFFFF;
124 }
125 if (code >= 0xF000 && code <= 0xFFFF) {
126 return code - 0x1000;
127 }
128 if (code >= 0x1F000 && code <= 0x1FFFF) {
129 return code - 0x10000;
130 }
131 }
132 if (code >= 0x10000) {
133 return 0xFFFF;
134 }
135 return code;
136}
137
138uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const
139{
140 if (remapType == 0) {
141 if (code >= 0xE000 && code <= 0xEFFF) {
142 return code + 0x1000;
143 }
144 if (code >= 0xF000) {
145 return code + 0x10000;
146 }
147 }
148 return code;
149}
150
151quint32 KCharSelectData::getDetailIndex(uint c) const
152{
153 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
154 // Convert from little-endian, so that this code works on PPC too.
155 // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286
156 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 12);
157 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 16);
158
159 int min = 0;
160 int mid;
161 int max = ((offsetEnd - offsetBegin) / 27) - 1;
162
163 quint16 unicode = mapCodePointToDataBase(c);
164 if (unicode == 0xFFFF) {
165 return 0;
166 }
167
168 static quint16 most_recent_searched;
169 static quint32 most_recent_result;
170
171 if (unicode == most_recent_searched) {
172 return most_recent_result;
173 }
174
175 most_recent_searched = unicode;
176
177 while (max >= min) {
178 mid = (min + max) / 2;
179 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 27);
180 if (unicode > midUnicode) {
181 min = mid + 1;
182 } else if (unicode < midUnicode) {
183 max = mid - 1;
184 } else {
185 most_recent_result = offsetBegin + mid * 27;
186
187 return most_recent_result;
188 }
189 }
190
191 most_recent_result = 0;
192 return 0;
193}
194
195QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base)
196{
197 QString s = QString::number(code, base).toUpper();
198 while (s.size() < length) {
199 s.prepend(QLatin1Char('0'));
200 }
201 s.prepend(prefix);
202 return s;
203}
204
205QList<uint> KCharSelectData::blockContents(int block)
206{
207 if (!openDataFile()) {
208 return QList<uint>();
209 }
210
211 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
212 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
213 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
214
215 int max = ((offsetEnd - offsetBegin) / 4) - 1;
216
217 QList<uint> res;
218
219 if (block > max) {
220 return res;
221 }
222
223 quint16 unicodeBegin = qFromLittleEndian<quint16>(data + offsetBegin + block * 4);
224 quint16 unicodeEnd = qFromLittleEndian<quint16>(data + offsetBegin + block * 4 + 2);
225
226 while (unicodeBegin < unicodeEnd) {
227 res.append(mapDataBaseToCodePoint(unicodeBegin));
228 unicodeBegin++;
229 }
230 res.append(mapDataBaseToCodePoint(unicodeBegin)); // Be careful when unicodeEnd==0xffff
231
232 return res;
233}
234
235QList<int> KCharSelectData::sectionContents(int section)
236{
237 section -= 1;
238 if (!openDataFile()) {
239 return QList<int>();
240 }
241
242 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
243 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
244 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
245
246 int max = ((offsetEnd - offsetBegin) / 4) - 1;
247
248 QList<int> res;
249
250 if (section > max) {
251 return res;
252 }
253
254 for (int i = 0; i <= max; i++) {
255 const quint16 currSection = qFromLittleEndian<quint16>(data + offsetBegin + i * 4);
256 if (currSection == section || section < 0) {
257 res.append(qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2));
258 }
259 }
260
261 return res;
262}
263
264QStringList KCharSelectData::sectionList()
265{
266 if (!openDataFile()) {
267 return QStringList();
268 }
269
270 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
271 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
272 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
273
274 const char *data = dataFile.constData();
276 quint32 i = stringBegin;
277 list.append(QCoreApplication::translate("KCharSelectData", "All", "KCharSelect section name"));
278 while (i < stringEnd) {
279 list.append(QCoreApplication::translate("KCharSelectData", data + i, "KCharSelect section name"));
280 i += qstrlen(data + i) + 1;
281 }
282
283 return list;
284}
285
286QString KCharSelectData::block(uint c)
287{
288 return blockName(blockIndex(c));
289}
290
291QString KCharSelectData::section(uint c)
292{
293 return sectionName(sectionIndex(blockIndex(c)));
294}
295
296QString KCharSelectData::name(uint c)
297{
298 if (!openDataFile()) {
299 return QString();
300 }
301
302 if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) {
303 return QCoreApplication::translate("KCharSelectData", "<noncharacter>");
304 } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) {
305 return QLatin1String("CJK UNIFIED IDEOGRAPH-") + formatCode(c, 4, QString());
306 } else if (c >= 0xAC00 && c <= 0xD7AF) {
307 /* compute hangul syllable name as per UAX #15 */
308 int SIndex = c - SBase;
309 int LIndex;
310 int VIndex;
311 int TIndex;
312
313 if (SIndex < 0 || SIndex >= SCount) {
314 return QString();
315 }
316
317 LIndex = SIndex / NCount;
318 VIndex = (SIndex % NCount) / TCount;
319 TIndex = SIndex % TCount;
320
321 return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex])
322 + QLatin1String(JAMO_T_TABLE[TIndex]);
323 } else if (c >= 0xD800 && c <= 0xDB7F) {
324 return QCoreApplication::translate("KCharSelectData", "<Non Private Use High Surrogate>");
325 } else if (c >= 0xDB80 && c <= 0xDBFF) {
326 return QCoreApplication::translate("KCharSelectData", "<Private Use High Surrogate>");
327 } else if (c >= 0xDC00 && c <= 0xDFFF) {
328 return QCoreApplication::translate("KCharSelectData", "<Low Surrogate>");
329 } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
330 return QCoreApplication::translate("KCharSelectData", "<Private Use>");
331 } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) {
332 return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(c, 4, QString());
333 }
334 quint16 unicode = mapCodePointToDataBase(c);
335 if (unicode == 0xFFFF) {
336 return QLatin1String("NON-BMP-CHARACTER-") + formatCode(c, 4, QString());
337 } else {
338 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
339 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
340 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
341
342 int min = 0;
343 int mid;
344 int max = ((offsetEnd - offsetBegin) / 6) - 1;
345 QString s;
346
347 while (max >= min) {
348 mid = (min + max) / 2;
349 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
350 if (unicode > midUnicode) {
351 min = mid + 1;
352 } else if (unicode < midUnicode) {
353 max = mid - 1;
354 } else {
355 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
356 s = QString::fromUtf8(dataFile.constData() + offset + 1);
357 break;
358 }
359 }
360
361 if (s.isNull()) {
362 return QCoreApplication::translate("KCharSelectData", "<not assigned>");
363 } else {
364 return s;
365 }
366 }
367}
368
369int KCharSelectData::blockIndex(uint c)
370{
371 if (!openDataFile()) {
372 return 0;
373 }
374
375 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
376 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
377 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
378 const quint16 unicode = mapCodePointToDataBase(c);
379 if (unicode == 0xFFFF) {
380 return 0;
381 }
382
383 int max = ((offsetEnd - offsetBegin) / 4) - 1;
384
385 int i = 0;
386
387 while (unicode > qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) && i < max) {
388 i++;
389 }
390
391 return i;
392}
393
394int KCharSelectData::sectionIndex(int block)
395{
396 if (!openDataFile()) {
397 return 0;
398 }
399
400 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
401 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
402 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
403
404 int max = ((offsetEnd - offsetBegin) / 4) - 1;
405
406 for (int i = 0; i <= max; i++) {
407 if (qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) == block) {
408 return qFromLittleEndian<quint16>(data + offsetBegin + i * 4) + 1;
409 }
410 }
411
412 return 0;
413}
414
415QString KCharSelectData::blockName(int index)
416{
417 if (!openDataFile()) {
418 return QString();
419 }
420
421 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
422 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 16);
423 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 20);
424
425 quint32 i = stringBegin;
426 int currIndex = 0;
427
428 const char *data = dataFile.constData();
429 while (i < stringEnd && currIndex < index) {
430 i += qstrlen(data + i) + 1;
431 currIndex++;
432 }
433
434 return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode block name");
435}
436
437QString KCharSelectData::sectionName(int index)
438{
439 if (index == 0) {
440 return QCoreApplication::translate("KCharSelectData", "All", "KCharselect unicode section name");
441 }
442 if (!openDataFile()) {
443 return QString();
444 }
445
446 index -= 1;
447
448 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
449 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
450 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
451
452 quint32 i = stringBegin;
453 int currIndex = 0;
454
455 const char *data = dataFile.constData();
456 while (i < stringEnd && currIndex < index) {
457 i += qstrlen(data + i) + 1;
458 currIndex++;
459 }
460
461 return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode section name");
462}
463
464QStringList KCharSelectData::aliases(uint c)
465{
466 if (!openDataFile()) {
467 return QStringList();
468 }
469 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
470 const int detailIndex = getDetailIndex(c);
471 if (detailIndex == 0) {
472 return QStringList();
473 }
474
475 const quint8 count = *(quint8 *)(udata + detailIndex + 6);
476 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 2);
477
478 QStringList aliases;
479 aliases.reserve(count);
480
481 const char *data = dataFile.constData();
482 for (int i = 0; i < count; i++) {
483 aliases.append(QString::fromUtf8(data + offset));
484 offset += qstrlen(data + offset) + 1;
485 }
486 return aliases;
487}
488
489QStringList KCharSelectData::notes(uint c)
490{
491 if (!openDataFile()) {
492 return QStringList();
493 }
494 const int detailIndex = getDetailIndex(c);
495 if (detailIndex == 0) {
496 return QStringList();
497 }
498
499 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
500 const quint8 count = *(quint8 *)(udata + detailIndex + 11);
501 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 7);
502
503 QStringList notes;
504 notes.reserve(count);
505
506 const char *data = dataFile.constData();
507 for (int i = 0; i < count; i++) {
508 notes.append(QString::fromUtf8(data + offset));
509 offset += qstrlen(data + offset) + 1;
510 }
511
512 return notes;
513}
514
515QList<uint> KCharSelectData::seeAlso(uint c)
516{
517 if (!openDataFile()) {
518 return QList<uint>();
519 }
520 const int detailIndex = getDetailIndex(c);
521 if (detailIndex == 0) {
522 return QList<uint>();
523 }
524
525 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
526 const quint8 count = *(quint8 *)(udata + detailIndex + 26);
527 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 22);
528
529 QList<uint> seeAlso;
530 seeAlso.reserve(count);
531
532 for (int i = 0; i < count; i++) {
533 seeAlso.append(mapDataBaseToCodePoint(qFromLittleEndian<quint16>(udata + offset)));
534 offset += 2;
535 }
536
537 return seeAlso;
538}
539
540QStringList KCharSelectData::equivalents(uint c)
541{
542 if (!openDataFile()) {
543 return QStringList();
544 }
545 const int detailIndex = getDetailIndex(c);
546 if (detailIndex == 0) {
547 return QStringList();
548 }
549
550 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
551 const quint8 count = *(quint8 *)(udata + detailIndex + 21);
552 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 17);
553
554 QStringList equivalents;
555 equivalents.reserve(count);
556
557 const char *data = dataFile.constData();
558 for (int i = 0; i < count; i++) {
559 equivalents.append(QString::fromUtf8(data + offset));
560 offset += qstrlen(data + offset) + 1;
561 }
562
563 return equivalents;
564}
565
566QStringList KCharSelectData::approximateEquivalents(uint c)
567{
568 if (!openDataFile()) {
569 return QStringList();
570 }
571 const int detailIndex = getDetailIndex(c);
572 if (detailIndex == 0) {
573 return QStringList();
574 }
575
576 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
577 const quint8 count = *(quint8 *)(udata + detailIndex + 16);
578 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 12);
579
580 QStringList approxEquivalents;
581 approxEquivalents.reserve(count);
582
583 const char *data = dataFile.constData();
584 for (int i = 0; i < count; i++) {
585 approxEquivalents.append(QString::fromUtf8(data + offset));
586 offset += qstrlen(data + offset) + 1;
587 }
588
589 return approxEquivalents;
590}
591
592QList<uint> KCharSelectData::decomposition(uint c)
593{
594 // for now, only decompose Hangul Syllable into Hangul Jamo
595 uint SIndex = c - SBase;
596 if (SIndex >= SCount) {
597 return QList<uint>();
598 }
599
600 uint L = LBase + SIndex / NCount; // Choseong
601 uint V = VBase + (SIndex % NCount) / TCount; // Jungseong
602 uint T = TBase + SIndex % TCount; // Jongsung
603 QList<uint> jamoList;
604 jamoList.append(L);
605 jamoList.append(V);
606 if (T != TBase) {
607 jamoList.append(T);
608 }
609 return jamoList;
610}
611
612QStringList KCharSelectData::unihanInfo(uint c)
613{
614 if (!openDataFile()) {
615 return QStringList();
616 }
617
618 quint16 unicode = mapCodePointToDataBase(c);
619 if (unicode == 0xFFFF) {
620 return QStringList();
621 }
622
623 const char *data = dataFile.constData();
624 const uchar *udata = reinterpret_cast<const uchar *>(data);
625 const quint32 offsetBegin = qFromLittleEndian<quint32>(udata + 36);
626 const quint32 offsetEnd = dataFile.size();
627
628 int min = 0;
629 int mid;
630 int max = ((offsetEnd - offsetBegin) / 30) - 1;
631
632 while (max >= min) {
633 mid = (min + max) / 2;
634 const quint16 midUnicode = qFromLittleEndian<quint16>(udata + offsetBegin + mid * 30);
635 if (unicode > midUnicode) {
636 min = mid + 1;
637 } else if (unicode < midUnicode) {
638 max = mid - 1;
639 } else {
640 QStringList res;
641 res.reserve(7);
642 for (int i = 0; i < 7; i++) {
643 quint32 offset = qFromLittleEndian<quint32>(udata + offsetBegin + mid * 30 + 2 + i * 4);
644 if (offset != 0) {
645 res.append(QString::fromUtf8(data + offset));
646 } else {
647 res.append(QString());
648 }
649 }
650 return res;
651 }
652 }
653
654 return QStringList();
655}
656
657QChar::Category KCharSelectData::category(uint c)
658{
659 if (!openDataFile()) {
660 return QChar::category(c);
661 }
662
663 ushort unicode = mapCodePointToDataBase(c);
664 if (unicode == 0xFFFF) {
665 return QChar::category(c);
666 }
667
668 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
669 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
670 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
671
672 int min = 0;
673 int mid;
674 int max = ((offsetEnd - offsetBegin) / 6) - 1;
675
676 while (max >= min) {
677 mid = (min + max) / 2;
678 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
679 if (unicode > midUnicode) {
680 min = mid + 1;
681 } else if (unicode < midUnicode) {
682 max = mid - 1;
683 } else {
684 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
685 uchar categoryCode = *(data + offset);
686 Q_ASSERT(categoryCode > 0);
687 categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1
688 See QtBase commit d17c76feee9eece4 */
689 return QChar::Category(categoryCode);
690 }
691 }
692
693 return QChar::category(c);
694}
695
696bool KCharSelectData::isPrint(uint c)
697{
698 QChar::Category cat = category(c);
699 return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned);
700}
701
702bool KCharSelectData::isDisplayable(uint c)
703{
704 // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames.
705 // They should be seen as non-printable characters, as trying to display them leads
706 // to a crash caused by a Qt "noBlockInString" assertion.
707 if (c == 0xFDD0 || c == 0xFDD1) {
708 return false;
709 }
710
711 return !isIgnorable(c) && isPrint(c);
712}
713
714bool KCharSelectData::isIgnorable(uint c)
715{
716 /*
717 * According to the Unicode standard, Default Ignorable Code Points
718 * should be ignored unless explicitly supported. For example, U+202E
719 * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying
720 * it gives the undesired effect of all text being turned RTL. We do not
721 * have a way to "explicitly" support it, so we will treat it as
722 * non-printable.
723 *
724 * There is a list of these on
725 * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the
726 * property Default_Ignorable_Code_Point.
727 */
728
729 // NOTE: not very nice to hardcode these here; is it worth it to modify
730 // the binary data file to hold them?
731 // clang-format off
732 return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 ||
733 c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) ||
734 (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) ||
735 (c >= 0x2060 && c <= 0x206F) || c == 0x3164 ||
736 (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 ||
737 (c >= 0xFFF0 && c <= 0xFFF8);
738 // clang-format on
739}
740
741bool KCharSelectData::isCombining(uint c)
742{
743 return section(c) == QCoreApplication::translate("KCharSelectData", "Combining Diacritics", "KCharSelect section name");
744 // FIXME: this is an imperfect test. There are many combining characters
745 // that are outside of this section. See Grapheme_Extend in
746 // http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
747}
748
749QString KCharSelectData::display(uint c, const QFont &font)
750{
751 if (!isDisplayable(c)) {
752 return QLatin1String("<b>") + QCoreApplication::translate("KCharSelectData", "Non-printable") + QLatin1String("</b>");
753 } else {
754 QString s = QLatin1String("<font size=\"+4\" face=\"") + font.family() + QLatin1String("\">");
755 if (isCombining(c)) {
756 s += displayCombining(c);
757 } else {
758 s += QLatin1String("&#") + QString::number(c) + QLatin1Char(';');
759 }
760 s += QLatin1String("</font>");
761 return s;
762 }
763}
764
765QString KCharSelectData::displayCombining(uint c)
766{
767 /*
768 * The purpose of this is to make it easier to see how a combining
769 * character affects the text around it.
770 * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose,
771 * as seen in pdfs from Unicode, but there seem to be a lot of alignment
772 * problems with that.
773 *
774 * Eventually, it would be nice to determine whether the character
775 * combines to the left or to the right, etc.
776 */
777 QString s = QLatin1String("&nbsp;&#") + QString::number(c) + QLatin1String(";&nbsp;") + QLatin1String(" (ab&#") + QString::number(c) + QLatin1String(";c)");
778 return s;
779}
780
781QString KCharSelectData::categoryText(QChar::Category category)
782{
783 switch (category) {
785 return QCoreApplication::translate("KCharSelectData", "Other, Control");
787 return QCoreApplication::translate("KCharSelectData", "Other, Format");
789 return QCoreApplication::translate("KCharSelectData", "Other, Not Assigned");
791 return QCoreApplication::translate("KCharSelectData", "Other, Private Use");
793 return QCoreApplication::translate("KCharSelectData", "Other, Surrogate");
795 return QCoreApplication::translate("KCharSelectData", "Letter, Lowercase");
797 return QCoreApplication::translate("KCharSelectData", "Letter, Modifier");
799 return QCoreApplication::translate("KCharSelectData", "Letter, Other");
801 return QCoreApplication::translate("KCharSelectData", "Letter, Titlecase");
803 return QCoreApplication::translate("KCharSelectData", "Letter, Uppercase");
805 return QCoreApplication::translate("KCharSelectData", "Mark, Spacing Combining");
807 return QCoreApplication::translate("KCharSelectData", "Mark, Enclosing");
809 return QCoreApplication::translate("KCharSelectData", "Mark, Non-Spacing");
811 return QCoreApplication::translate("KCharSelectData", "Number, Decimal Digit");
813 return QCoreApplication::translate("KCharSelectData", "Number, Letter");
815 return QCoreApplication::translate("KCharSelectData", "Number, Other");
817 return QCoreApplication::translate("KCharSelectData", "Punctuation, Connector");
819 return QCoreApplication::translate("KCharSelectData", "Punctuation, Dash");
821 return QCoreApplication::translate("KCharSelectData", "Punctuation, Close");
823 return QCoreApplication::translate("KCharSelectData", "Punctuation, Final Quote");
825 return QCoreApplication::translate("KCharSelectData", "Punctuation, Initial Quote");
827 return QCoreApplication::translate("KCharSelectData", "Punctuation, Other");
829 return QCoreApplication::translate("KCharSelectData", "Punctuation, Open");
831 return QCoreApplication::translate("KCharSelectData", "Symbol, Currency");
833 return QCoreApplication::translate("KCharSelectData", "Symbol, Modifier");
835 return QCoreApplication::translate("KCharSelectData", "Symbol, Math");
837 return QCoreApplication::translate("KCharSelectData", "Symbol, Other");
839 return QCoreApplication::translate("KCharSelectData", "Separator, Line");
841 return QCoreApplication::translate("KCharSelectData", "Separator, Paragraph");
843 return QCoreApplication::translate("KCharSelectData", "Separator, Space");
844 default:
845 return QCoreApplication::translate("KCharSelectData", "Unknown");
846 }
847}
848
849QList<uint> KCharSelectData::find(const QString &needle)
850{
851 QSet<uint> result;
852
853 QList<uint> returnRes;
854 QString simplified = needle.length() > 1 ? needle.simplified() : needle;
855 QStringList searchStrings;
856
857 static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$"));
858 if (octalExp.match(simplified).hasMatch()) {
859 // search for C octal escaped UTF-8
860 QByteArray utf8;
861 int byte = -1;
862 for (int i = 0; i <= simplified.length(); ++i) {
863 int c = simplified.at(i).unicode();
864 if (c >= '0' && c <= '7') {
865 byte = 8 * byte + c - '0';
866 } else if (byte == -1) {
867 byte = 0;
868 } else if (byte >= 0x00 && byte <= 0xFF) {
869 utf8.append((char)byte);
870 byte = 0;
871 }
872 }
873 simplified = QString::fromUtf8(utf8);
874 }
875
876 if (simplified.length() <= 2) {
877 QList<uint> ucs4 = simplified.toUcs4();
878 if (ucs4.size() == 1) {
879 // search for hex representation of the character
880 searchStrings = QStringList(formatCode(ucs4.at(0)));
881 } else {
882 searchStrings = splitString(simplified);
883 }
884 } else {
885 searchStrings = splitString(simplified);
886 }
887
888 if (searchStrings.isEmpty()) {
889 return returnRes;
890 }
891
892 static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$"));
893 for (const QString &s : std::as_const(searchStrings)) {
894 const QRegularExpressionMatch match = hexExp.match(s);
895 if (match.hasMatch()) {
896 const QString cap = match.captured(1);
897 returnRes.append(cap.toInt(nullptr, 16));
898 // search for "1234" instead of "0x1234"
899 if (s.length() == 6 || s.length() == 7) {
900 searchStrings[searchStrings.indexOf(s)] = cap;
901 }
902 }
903 // try to parse string as decimal number
904 bool ok;
905 int unicode = s.toInt(&ok);
906 if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) {
907 returnRes.append(unicode);
908 }
909 }
910
911 bool firstSubString = true;
912 for (const QString &s : std::as_const(searchStrings)) {
913 QSet<uint> partResult = getMatchingChars(s.toLower());
914 if (firstSubString) {
915 result = partResult;
916 firstSubString = false;
917 } else {
918 result = result.intersect(partResult);
919 }
920 }
921
922 // remove results found by matching the code point to prevent duplicate results
923 // while letting these characters stay at the beginning
924 for (uint c : std::as_const(returnRes)) {
925 result.remove(c);
926 }
927
928 QList<uint> sortedResult;
929 sortedResult.reserve(result.count());
930 for (auto c : std::as_const(result)) {
931 sortedResult.append(c);
932 }
933 std::sort(sortedResult.begin(), sortedResult.end());
934
935 returnRes += sortedResult;
936 return returnRes;
937}
938
939QSet<uint> KCharSelectData::getMatchingChars(const QString &s)
940{
941 if (dataFile.isEmpty()) {
942 return QSet<uint>();
943 }
944 futureIndex.waitForFinished();
945 const Index index = futureIndex.result();
946 Index::const_iterator pos = index.lowerBound(s);
947 QSet<uint> result;
948
949 while (pos != index.constEnd() && pos.key().startsWith(s)) {
950 for (quint16 c : pos.value()) {
951 result.insert(mapDataBaseToCodePoint(c));
952 }
953 ++pos;
954 }
955
956 return result;
957}
958
959QStringList KCharSelectData::splitString(const QString &s)
960{
961 QStringList result;
962 int start = 0;
963 int end = 0;
964 int length = s.length();
965 while (end < length) {
966 while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
967 end++;
968 }
969 if (start != end) {
970 result.append(s.mid(start, end - start));
971 }
972 start = end;
973 while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
974 end++;
975 start++;
976 }
977 }
978 return result;
979}
980
981void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s)
982{
983 const QStringList strings = splitString(s);
984 for (const QString &s : strings) {
985 (*index)[s.toLower()].append(unicode);
986 }
987}
988
989Index KCharSelectData::createIndex(const QByteArray &dataFile)
990{
991 Index i;
992
993 // character names
994 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
995 const char *data = dataFile.constData();
996 const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(udata + 4);
997 const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(udata + 8);
998
999 int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1;
1000
1001 for (int pos = 0; pos <= max; pos++) {
1002 const quint16 unicode = qFromLittleEndian<quint16>(udata + nameOffsetBegin + pos * 6);
1003 quint32 offset = qFromLittleEndian<quint32>(udata + nameOffsetBegin + pos * 6 + 2);
1004 appendToIndex(&i, unicode, QString::fromUtf8(data + offset + 1));
1005 }
1006
1007 // details
1008 const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(udata + 12);
1009 const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(udata + 16);
1010
1011 max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1;
1012
1013 for (int pos = 0; pos <= max; pos++) {
1014 const quint16 unicode = qFromLittleEndian<quint16>(udata + detailsOffsetBegin + pos * 27);
1015
1016 // aliases
1017 const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6);
1018 quint32 aliasOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 2);
1019
1020 for (int j = 0; j < aliasCount; j++) {
1021 appendToIndex(&i, unicode, QString::fromUtf8(data + aliasOffset));
1022 aliasOffset += qstrlen(data + aliasOffset) + 1;
1023 }
1024
1025 // notes
1026 const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11);
1027 quint32 notesOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 7);
1028
1029 for (int j = 0; j < notesCount; j++) {
1030 appendToIndex(&i, unicode, QString::fromUtf8(data + notesOffset));
1031 notesOffset += qstrlen(data + notesOffset) + 1;
1032 }
1033
1034 // approximate equivalents
1035 const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16);
1036 quint32 apprOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 12);
1037
1038 for (int j = 0; j < apprCount; j++) {
1039 appendToIndex(&i, unicode, QString::fromUtf8(data + apprOffset));
1040 apprOffset += qstrlen(data + apprOffset) + 1;
1041 }
1042
1043 // equivalents
1044 const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21);
1045 quint32 equivOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 17);
1046
1047 for (int j = 0; j < equivCount; j++) {
1048 appendToIndex(&i, unicode, QString::fromUtf8(data + equivOffset));
1049 equivOffset += qstrlen(data + equivOffset) + 1;
1050 }
1051
1052 // see also - convert to string (hex)
1053 const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26);
1054 quint32 seeAlsoOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 22);
1055
1056 for (int j = 0; j < seeAlsoCount; j++) {
1057 quint16 seeAlso = qFromLittleEndian<quint16>(udata + seeAlsoOffset);
1058 appendToIndex(&i, unicode, formatCode(seeAlso, 4, QString()));
1059 equivOffset += qstrlen(data + equivOffset) + 1;
1060 }
1061 }
1062
1063 // unihan data
1064 // temporary disabled due to the huge amount of data
1065 // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36);
1066 // const quint32 unihanOffsetEnd = dataFile.size();
1067 // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1;
1068 //
1069 // for (int pos = 0; pos <= max; pos++) {
1070 // const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30);
1071 // for(int j = 0; j < 7; j++) {
1072 // quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4);
1073 // if(offset != 0) {
1074 // appendToIndex(&i, unicode, QString::fromUtf8(data + offset));
1075 // }
1076 // }
1077 // }
1078
1079 return i;
1080}
Q_SCRIPTABLE Q_NOREPLY void start()
KCOREADDONS_EXPORT Result match(QStringView pattern, QStringView str)
KIOCORE_EXPORT QStringList list(const QString &fileClass)
KGuiItem ok()
Returns the 'Ok' gui item.
Category category(StandardShortcut id)
const QList< QKeySequence > & end()
Trait::StringList splitString(const typename Trait::String &str, const typename Trait::Char &ch)
QByteArray & append(QByteArrayView data)
const char * constData() const const
LastValidCodePoint
Category category() const const
char16_t & unicode()
QString translate(const char *context, const char *sourceText, const char *disambiguation, int n)
QString family() const const
void append(QList< T > &&value)
const_reference at(qsizetype i) const const
iterator begin()
iterator end()
bool isEmpty() const const
void reserve(qsizetype size)
qsizetype size() const const
qsizetype count() const const
iterator insert(const T &value)
QSet< T > & intersect(const QSet< T > &other)
bool remove(const T &value)
const QChar at(qsizetype position) const const
QString fromUtf8(QByteArrayView str)
bool isNull() const const
qsizetype length() const const
QString mid(qsizetype position, qsizetype n) const const
QString number(double n, char format, int precision)
QString & prepend(QChar ch)
QString simplified() const const
qsizetype size() const const
int toInt(bool *ok, int base) const const
QString toLower() const const
QList< uint > toUcs4() const const
QString toUpper() const const
qsizetype indexOf(const QRegularExpression &re, qsizetype from) const const
QThreadPool * globalInstance()
void start(Callable &&callableToRun, int priority)
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:46:44 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.