KItinerary

stringutil.cpp
1/*
2 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "stringutil.h"
8
9#include <KCharsets>
10
11#include <QDebug>
12#include <QString>
13
14#include <cstring>
15#include <cctype>
16
17using namespace KItinerary;
18
20{
21 QString out;
22 out.reserve(str.size());
23 for (const auto c : str) {
24 // case folding
25 const auto n = c.toCaseFolded();
26
27 // if the character has a canonical decomposition use that and skip the
28 // combining diacritic markers following it
29 // see https://en.wikipedia.org/wiki/Unicode_equivalence
30 // see https://en.wikipedia.org/wiki/Combining_character
31 if (n.decompositionTag() == QChar::Canonical) {
32 out.push_back(n.decomposition().at(0));
33 }
34 // handle compatibility compositions such as ligatures
35 // see https://en.wikipedia.org/wiki/Unicode_compatibility_characters
36 else if (n.decompositionTag() == QChar::Compat && n.isLetter() && n.script() == QChar::Script_Latin) {
37 out.append(n.decomposition());
38 }
39 else {
40 out.push_back(n);
41 }
42 }
43 return out;
44}
45
46static bool containsNonAscii(QStringView s)
47{
48 for (const auto c : s) {
49 if (c.row() != 0 || c.cell() > 127) {
50 return true;
51 }
52 }
53
54 return false;
55}
56
57static bool isMixedCase(QStringView s)
58{
59 const auto letterCount = std::count_if(s.begin(), s.end(), [](auto c) { return c.isLetter(); });
60 const auto upperCount = std::count_if(s.begin(), s.end(), [](auto c) { return c.isUpper(); });
61 return upperCount != letterCount && upperCount != 0;
62}
63
64static int longestUpperCaseSubstring(QStringView s)
65{
66 int globalCount = 0;
67 int count = 0;
68 for (const auto c : s) {
69 if (c.isUpper()) {
70 ++count;
71 continue;
72 }
73 globalCount = std::max(globalCount, count);
74 count = 0;
75 }
76 return std::max(globalCount, count);
77}
78
80{
81 // prefer the one that exists at all
82 if (lhs.isEmpty()) {
83 return rhs;
84 }
85 if (rhs.isEmpty()) {
86 return lhs;
87 }
88
89 // prefer Unicode over ASCII normalization
90 const auto lhsNonAscii = containsNonAscii(lhs);
91 const auto rhsNonAscii = containsNonAscii(rhs);
92 if (lhsNonAscii && !rhsNonAscii) {
93 return lhs;
94 }
95 if (!lhsNonAscii && rhsNonAscii) {
96 return rhs;
97 }
98
99 // prefer better casing
100 const auto lhsMixedCase = isMixedCase(lhs);
101 const auto rhsMixedCase = isMixedCase(rhs);
102 if (lhsMixedCase && !rhsMixedCase) {
103 return lhs;
104 }
105 if (!lhsMixedCase && rhsMixedCase) {
106 return rhs;
107 }
108
109 if (lhs.size() == rhs.size()) {
110 if (lhsMixedCase && rhsMixedCase) {
111 if (longestUpperCaseSubstring(lhs) > longestUpperCaseSubstring(rhs)) {
112 return rhs;
113 } else if (longestUpperCaseSubstring(lhs) < longestUpperCaseSubstring(rhs)) {
114 return lhs;
115 }
116 }
117 if (!lhsMixedCase && !rhsMixedCase) {
118 if (longestUpperCaseSubstring(lhs) > longestUpperCaseSubstring(rhs)) {
119 return lhs;
120 }
121 else if (longestUpperCaseSubstring(lhs) < longestUpperCaseSubstring(rhs)) {
122 return rhs;
123 }
124 }
125 }
126
127 // prefer longer == more detailed version
128 if (rhs.size() < lhs.size()) {
129 return lhs;
130 }
131 return rhs;
132}
133
135{
136 if (s1.empty() || s2.empty()) {
137 return 0.0f;
138 }
139
140 if (s1.size() > s2.size()) {
141 std::swap(s1, s2);
142 }
143
144 for (int i = 0; i < s1.size(); ++i) {
145 if (s1[i].toCaseFolded() == s2[i].toCaseFolded()) {
146 continue;
147 }
148 return (float)i / (float)s2.size();
149 }
150
151 return (float)s1.size() / (float)s2.size();
152}
153
158
159// keep this ordered (see https://en.wikipedia.org/wiki/List_of_Unicode_characters)
160struct {
161 ushort key;
162 const char* replacement;
163} static const transliteration_map[] = {
164 { u'ä', "ae" },
165 { u'ö', "oe" },
166 { u'ø', "oe" },
167 { u'ü', "ue" },
168 { u'ő', "oe" },
169};
170
172{
173 QString res;
174 res.reserve(s.size());
175
176 for (const auto c : s) {
177 const auto it = std::lower_bound(std::begin(transliteration_map), std::end(transliteration_map), c, [](const auto &lhs, const auto rhs) {
178 return QChar(lhs.key) < rhs;
179 });
180 if (it != std::end(transliteration_map) && QChar((*it).key) == c) {
181 res += QString::fromUtf8((*it).replacement);
182 continue;
183 }
184
185 if (c.decompositionTag() == QChar::Canonical) { // see above
186 res += c.decomposition().at(0);
187 } else {
188 res += c;
189 }
190 }
191
192 return res;
193}
194
195bool StringUtil::startsWithIgnoreSpace(const QByteArray &data, const char *pattern)
196{
197 auto it = data.begin();
198 while (it != data.end() && std::isspace(static_cast<unsigned char>(*it))) {
199 ++it;
200 }
201
202 const auto len = std::strlen(pattern);
203 if ((int)len >= std::distance(it, data.end())) {
204 return false;
205 }
206 return std::strncmp(it, pattern, len) == 0;
207}
208
210{
211 if (std::all_of(s.begin(), s.end(), [](QChar c) { return c.category() == QChar::Punctuation_Dash || c.category() == QChar::Punctuation_Other; })) {
212 return {};
213 }
214 return s.simplified();
215}
static QString resolveEntities(const QString &text)
bool startsWithIgnoreSpace(const QByteArray &data, const char *pattern)
Same as QByteArray::startsWith, but ignoring leading whitespace.
QString clean(const QString &s)
Cleans up extra white spaces and XML entities from s.
QString normalize(QStringView str)
Strips out diacritics and converts to case-folded form.
QString transliterate(QStringView s)
Transliterate diacritics or other special characters.
float prefixSimilarity(QStringView s1, QStringView s2)
Returns how much of the prefix of the two given strings is equal, in relation to the longer of the two input strings.
QStringView betterString(QStringView lhs, QStringView rhs)
Assuming both sides are describing the same thing, this tries to find the "better" string.
QString simplifiedNoPlaceholder(const QString &s)
Same as QString::simplified(), but dropping everything that contains only punctuation or dash characters.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
iterator begin()
iterator end()
QString & append(QChar ch)
const QChar at(qsizetype position) const
iterator begin()
iterator end()
QString fromUtf8(QByteArrayView str)
void push_back(QChar ch)
void reserve(qsizetype size)
QString simplified() const
const_iterator begin() const
bool empty() const
const_iterator end() const
bool isEmpty() const
qsizetype size() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 24 2025 11:52:36 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.