KItinerary

pricefinder.cpp
1/*
2 SPDX-FileCopyrightText: 2023 Volker Krause <vkrause@kde.org>
3 SPDX-License-Identifier: LGPL-2.0-or-later
4*/
5
6#include "pricefinder_p.h"
7
8#include <KItinerary/PriceUtil>
9
10#include <QDebug>
11#include <QLocale>
12#include <QRegularExpression>
13
14#include <cmath>
15#include <cstring>
16
17using namespace KItinerary;
18
// Currency table shared by all PriceFinder instances.
// It is filled by the PriceFinder constructor the first time it runs while the table is still empty.
std::vector<PriceFinder::CurrencyData> PriceFinder::s_currencyData;
20
21// normalize currency symbols, as e.g. "wide Yen" and "normal Yen" should be considered the same
22static QString normalizeSymbol(QStringView str)
23{
24 QString out;
25 out.reserve(str.size());
26 for (const auto c : str) {
27 if (c.decompositionTag() == QChar::Wide) {
28 out.push_back(c.decomposition().at(0));
29 } else {
30 out.push_back(c);
31 }
32 }
33 return out;
34}
35
36static bool isCollidingSymbol(QStringView lhs, QStringView rhs)
37{
38 return lhs == rhs
39 || (lhs.size() == rhs.size() + 1 && lhs.back() == QLatin1Char('.') && lhs.startsWith(rhs))
40 || (rhs.size() == lhs.size() + 1 && rhs.back() == QLatin1Char('.') && rhs.startsWith(lhs));
41}
42
// Overrides applied on top of the currency symbols obtained from QLocale.
// A null symbol means the currency gets no symbol at all.
// ### keep sorted by ISO code, the table is searched with a binary search
namespace {
struct CurrencyDataOverride {
    const char isoCode[4];
    const char *symbol;
};
}

static constexpr const CurrencyDataOverride currency_data_overrides[] = {
    // BAM's symbol is "KM", which collides with distance values on train tickets too often
    { "BAM", nullptr },
    // FKP, GIP and SHP are practically GBP-equivalent using the pound sign, SSP has it wrongly assigned in QLocale
    { "GBP", "£" },
    // the Yen sign is also used by CNY and thus ambiguous, but the Japanese Yen symbol works
    { "JPY", "円" },
};
53
54PriceFinder::PriceFinder()
55{
56 if (!s_currencyData.empty()) {
57 return;
58 }
59
61 for (const auto &locale : allLocales) {
62 CurrencyData data{locale.currencySymbol(QLocale::CurrencyIsoCode), normalizeSymbol(locale.currencySymbol(QLocale::CurrencySymbol))};
63 if (data.isoCode.isEmpty()) {
64 continue;
65 }
66
67 // single letter symbols tend to be way too trigger-happy
68 if (data.symbol.size() == 1 && data.symbol[0].isLetter()) {
69 //qDebug() << "Dropping single letter symbol:" << data.symbol << data.isoCode;
70 data.symbol.clear();
71 }
72
73 s_currencyData.push_back(std::move(data));
74 }
75
76 // remove duplicates
77 const auto lessThanCurrencyData = [](const auto &lhs, const auto &rhs) {
78 return std::tie(lhs.isoCode, lhs.symbol) < std::tie(rhs.isoCode, rhs.symbol);
79 };
80 std::sort(s_currencyData.begin(), s_currencyData.end(), lessThanCurrencyData);
81 const auto compareCurrencyData = [](const auto &lhs, const auto &rhs) {
82 return lhs.isoCode == rhs.isoCode && lhs.symbol == rhs.symbol;
83 };
84 s_currencyData.erase(std::unique(s_currencyData.begin(), s_currencyData.end(), compareCurrencyData), s_currencyData.end());
85
86 // clear ambigious symbols
87 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
88 if ((*it).symbol.isEmpty()) {
89 continue;
90 }
91 bool collision = false;
92 for (auto it2 = std::next(it); it2 != s_currencyData.end(); ++it2) {
93 if (!isCollidingSymbol((*it).symbol, (*it2).symbol)) {
94 continue;
95 }
96 (*it2).symbol.clear();
97 if (!collision) {
98 qDebug() << "Ambigious currency symbol:" << (*it).symbol;
99 }
100 collision = true;
101 }
102 if (collision) {
103 (*it).symbol.clear();
104 }
105 }
106
107 // apply our own overrides over QLocale
108 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
109 const auto it2 = std::lower_bound(std::begin(currency_data_overrides), std::end(currency_data_overrides), (*it).isoCode, [](const auto &lhs, const auto &rhs) {
110 return std::strncmp(lhs.isoCode, rhs.toLatin1().constData(), 3) < 0;
111 });
112 if (it2 == std::end(currency_data_overrides) || std::strncmp((*it2).isoCode, (*it).isoCode.toLatin1().constData(), 3) != 0) {
113 continue;
114 }
115 (*it).symbol = (*it2).symbol ? QString::fromUtf8((*it2).symbol) : QString();
116 }
117}
118
// Defaulted destructor, defined out of line in this translation unit; no custom cleanup is performed.
PriceFinder::~PriceFinder() = default;
120
121static bool isBoundaryChar(QChar c)
122{
123 return c != QLatin1Char('-') && (c.isSpace() || c.isPunct() || c.isSymbol());
124}
125
126void PriceFinder::findAll(QStringView text, std::vector<Result> &results) const
127{
128 static QRegularExpression rx(QStringLiteral(R"((?<=\s|[[:punct:]]|^)([^\d\s]{1,4})?[  ]*(\d(?:[\d,.  ]*\d)?)[  ]*([^\d\s]{1,4})?(?=\s|[[:punct:]]|$))"));
129
130 const auto prevResultSize = results.size();
131 qsizetype offset = 0;
132 while (true) {
133 const auto match = rx.matchView(text, offset);
134 if (!match.hasMatch()) {
135 break;
136 }
137 offset = match.capturedEnd(2);
138
139 const auto leadingCurrency = parseCurrency(match.capturedView(1), CurrencyPrefix);
140 const auto trailingCurrency = parseCurrency(match.capturedView(3), CurrencySuffix);
141 if ((leadingCurrency.isEmpty() && trailingCurrency.isEmpty()) || (!leadingCurrency.isEmpty() && !trailingCurrency.isEmpty() && leadingCurrency != trailingCurrency)) {
142 continue;
143 }
144
145 // additional boundary checks not covered by the regular expression
146 if (leadingCurrency.isEmpty() && match.capturedStart(2) > 0 && !isBoundaryChar(text[match.capturedStart(2) - 1])) {
147 continue;
148 }
149 if (trailingCurrency.isEmpty() && match.capturedEnd(2) < text.size() - 2 && !isBoundaryChar(text[match.capturedEnd(2)])) {
150 continue;
151 }
152
153 Result r;
154 r.start = leadingCurrency.isEmpty() ? match.capturedStart(2) : match.capturedStart();
155 r.end = trailingCurrency.isEmpty() ? match.capturedEnd(2) : match.capturedEnd();
156 r.currency = leadingCurrency.isEmpty() ? trailingCurrency : leadingCurrency;
157
158 r.value = parseValue(match.capturedView(2), r.currency);
159 if (std::isnan(r.value)) {
160 continue;
161 }
162
163 results.push_back(std::move(r));
164 }
165
166 // check for overlapping results: in those case we have to assume the entire result is invalid
167 if (results.size() <= 1 + prevResultSize) {
168 return;
169 }
170 for (auto it = results.begin() + prevResultSize; it != std::prev(results.end()); ++it) {
171 if ((*it).end >= (*std::next(it)).start) {
172 qDebug() << "overlapping price data, discarding result";
173 results.erase(results.begin() + prevResultSize, results.end());
174 return;
175 }
176 }
177}
178
179PriceFinder::Result PriceFinder::findHighest(QStringView text) const
180{
181 std::vector<Result> results;
182 findAll(text, results);
183 return highest(results);
184}
185
186bool PriceFinder::isSingleCurrency(const std::vector<Result> &results) const
187{
188 if (results.empty()) {
189 return false;
190 }
191
192 const auto isoCode = results.front().currency;
193 return std::all_of(results.begin(), results.end(), [&isoCode](const auto &r) { return r.currency == isoCode; });
194}
195
196PriceFinder::Result PriceFinder::highest(const std::vector<Result> &results) const
197{
198 if (!isSingleCurrency(results)) {
199 return {};
200 }
201
202 const auto it = std::max_element(results.begin(), results.end(), [](const auto &lhs, const auto &rhs) { return lhs.value < rhs.value; });
203 return (*it);
204}
205
206static bool equalIgnoreDiacritics(QStringView lhs, QStringView rhs)
207{
208 if (lhs.size() != rhs.size()) {
209 return false;
210 }
211
212 for (qsizetype i = 0; i < lhs.size(); ++i) {
213 auto l = lhs[i];
214 if (l.decompositionTag() == QChar::Canonical) {
215 l = l.decomposition().at(0);
216 }
217 auto r = rhs[i];
218 if (r.decompositionTag() == QChar::Canonical) {
219 r = r.decomposition().at(0);
220 }
221 if (l != r) {
222 return false;
223 }
224 }
225
226 return true;
227}
228
229QString PriceFinder::parseCurrency(QStringView s, CurrencyPosition pos) const
230{
231 // trim remaining boundary chars
232 if (s.isEmpty()) {
233 return {};
234 }
235
236 // valid currency ISO code
237 auto isoCandidate = s;
238 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.last())) {
239 isoCandidate = isoCandidate.left(isoCandidate.size() - 1);
240 }
241 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.front())) {
242 isoCandidate = isoCandidate.mid(1);
243 }
244 if (isoCandidate.size() == 3) {
245 const auto it = std::lower_bound(s_currencyData.begin(), s_currencyData.end(), isoCandidate, [](const auto &lhs, QStringView rhs) { return lhs.isoCode < rhs; });
246 if (it != s_currencyData.end() && (*it).isoCode == isoCandidate) {
247 return (*it).isoCode;
248 }
249 }
250
251 // currency symbol
252 const auto symbol = normalizeSymbol(s);
253 // exact match: we know there is only ever going to be one (see ctor)
254 const auto it = std::find_if(s_currencyData.begin(), s_currencyData.end(), [&symbol](const auto &data) { return data.symbol == symbol; });
255 if (it != s_currencyData.end())
256 return (*it).isoCode;
257
258 // partial match: needs to be unique
259 QString isoCode;
260 for (const auto &data : s_currencyData) {
261 if (data.symbol.isEmpty()) {
262 continue;
263 }
264
265 // match disregarding diacritics
266 if (equalIgnoreDiacritics(data.symbol, symbol)) {
267 if (!isoCode.isEmpty()) {
268 return {};
269 }
270 isoCode = data.isoCode;
271 }
272
273 // prefix or suffix match
274 if (pos == CurrencyPrefix) {
275 if (symbol.size() <= data.symbol.size() || !symbol.endsWith(data.symbol) || !isBoundaryChar(symbol.at(symbol.size() - data.symbol.size() - 1))) {
276 continue;
277 }
278 } else {
279 if (symbol.size() <= data.symbol.size() || !symbol.startsWith(data.symbol) || !isBoundaryChar(symbol.at(data.symbol.size()))) {
280 continue;
281 }
282 }
283 if (!isoCode.isEmpty()) {
284 return {};
285 }
286 isoCode = data.isoCode;
287 }
288 return isoCode;
289}
290
291double PriceFinder::parseValue(QStringView s, const QString &isoCode) const
292{
293 if (s.isEmpty() || !s[0].isDigit() || !s[s.size() - 1].isDigit()) {
294 return NAN;
295 }
296
297 // find potential decimal separator
298 QChar decimalSeparator;
299 qsizetype decimalSeparatorIndex = -1;
300 for (qsizetype i = s.size() - 1; i > 0; --i) {
301 if (s[i].isDigit()) {
302 continue;
303 }
304 if (!s[i].isSpace()) {
305 decimalSeparator = s[i];
306 decimalSeparatorIndex = i;
307 }
308 break;
309 }
310
311 // identify/validate group separators
312 QChar groupSeparator;
313 qsizetype lastGroupSeparatorIndex = -1;
314 for (qsizetype i = 0; i < s.size(); ++i) {
315 if (s[i].isDigit()) {
316 continue;
317 }
318 if (lastGroupSeparatorIndex > 0 && i - lastGroupSeparatorIndex != 4) { // separator interval is wrong
319 return NAN;
320 }
321 if (decimalSeparatorIndex > 0 && i == decimalSeparatorIndex) { // found the suspected decimal separator
322 break;
323 }
324 if (!groupSeparator.isNull() && s[i] != groupSeparator) { // inconsistent separators
325 return NAN;
326 }
327
328 lastGroupSeparatorIndex = i;
329 groupSeparator = s[i];
330 }
331
332 // we found both and they are the same: has to be the group separator
333 if (!decimalSeparator.isNull() && !groupSeparator.isNull() && decimalSeparator == groupSeparator) {
334 if ((s.size() - decimalSeparatorIndex) != 4) {
335 return NAN;
336 }
337 decimalSeparator = {};
338 decimalSeparatorIndex = -1;
339 }
340
341 // we found a decimal separator: verify the number of decimals is consistent with the currency's subdivision
342 // see https://en.wikipedia.org/wiki/List_of_circulating_currencies
343 if (!decimalSeparator.isNull()) {
344 const auto decimalCount = s.size() - decimalSeparatorIndex - 1;
345 const auto expectedDecimalCount = PriceUtil::decimalCount(isoCode);
346
347 // subdivision x1000 is ambigious if we don't have a group separator
348 if (decimalCount == expectedDecimalCount && decimalCount == 3 && groupSeparator.isNull()) {
349 return NAN;
350 }
351
352 // if decimal count is 3, assume group separator
353 else if (decimalCount != expectedDecimalCount && decimalCount == 3) {
354 if (groupSeparator.isNull()) {
355 groupSeparator = decimalSeparator;
356 decimalSeparator = {};
357 } else {
358 return NAN;
359 }
360 }
361
362 else if (decimalCount > expectedDecimalCount) {
363 return NAN;
364 }
365 }
366
367 // strip group separators, replace decimal separator
368 auto normalized = s.toString();
369 if (!groupSeparator.isNull()) {
370 normalized.remove(groupSeparator);
371 }
372 if (!decimalSeparator.isNull()) {
373 normalized.replace(decimalSeparator, QLatin1Char('.'));
374 }
375
376 bool ok = false;
377 const auto value = normalized.toDouble(&ok);
378 if (!ok) {
379 return NAN;
380 }
381 return value;
382}
static int decimalCount(QStringView currency)
Returns the number of decimals to represent the sub-unit of currency.
Definition priceutil.cpp:92
KCOREADDONS_EXPORT Result match(QStringView pattern, QStringView str)
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
bool isNull() const const
bool isPunct(char32_t ucs4)
bool isSpace(char32_t ucs4)
bool isSymbol(char32_t ucs4)
QList< QLocale > matchingLocales(QLocale::Language language, QLocale::Script script, QLocale::Territory territory)
void clear()
QString fromUtf8(QByteArrayView str)
void push_back(QChar ch)
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QString & replace(QChar before, QChar after, Qt::CaseSensitivity cs)
void reserve(qsizetype size)
QStringView left(qsizetype length) const const
QStringView mid(qsizetype start, qsizetype length) const const
QChar at(qsizetype n) const const
QChar back() const const
bool isEmpty() const const
qsizetype size() const const
bool startsWith(QChar ch) const const
QString toString() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri May 3 2024 11:45:33 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.