KItinerary

pricefinder.cpp
1/*
2 SPDX-FileCopyrightText: 2023 Volker Krause <vkrause@kde.org>
3 SPDX-License-Identifier: LGPL-2.0-or-later
4*/
5
6#include "pricefinder_p.h"
7
8#include <KItinerary/PriceUtil>
9
10#include <QDebug>
11#include <QLocale>
12#include <QRegularExpression>
13
14#include <cmath>
15#include <cstring>
16
17using namespace KItinerary;
18
// Currency table (ISO code and symbol per entry) stored in a static member.
// The PriceFinder constructor fills it only while it is still empty.
19std::vector<PriceFinder::CurrencyData> PriceFinder::s_currencyData;
20
21// normalize currency symbols, as e.g. "wide Yen" and "normal Yen" should be considered the same
22static QString normalizeSymbol(QStringView str)
23{
24 QString out;
25 out.reserve(str.size());
26 for (const auto c : str) {
27 if (c.decompositionTag() == QChar::Wide) {
28 out.push_back(c.decomposition().at(0));
29 } else {
30 out.push_back(c);
31 }
32 }
33 return out;
34}
35
36static bool isCollidingSymbol(QStringView lhs, QStringView rhs)
37{
38 return lhs == rhs
39 || (lhs.size() == rhs.size() + 1 && lhs.back() == QLatin1Char('.') && lhs.startsWith(rhs))
40 || (rhs.size() == lhs.size() + 1 && rhs.back() == QLatin1Char('.') && rhs.startsWith(lhs));
41}
42
// Overrides applied on top of the currency data obtained from QLocale.
// A null symbol removes the symbol of that currency entirely.
// ### keep sorted by ISO code, the constructor looks entries up with std::lower_bound
namespace {
struct CurrencyDataOverride {
    const char isoCode[4];
    const char *symbol;
};
}

static constexpr CurrencyDataOverride currency_data_overrides[] = {
    // BAM's symbol is "KM", which collides with distance values on train tickets too often
    { "BAM", nullptr },
    // FKP, GIP and SHP are practically GBP-equivalent using the pound sign, SSP has it wrongly assigned in QLocale
    { "GBP", "£" },
    // the Yen sign is also used by CNY and thus ambiguous, but the Japanese Yen symbol works
    { "JPY", "円" },
};
53
54PriceFinder::PriceFinder()
55{
56 if (!s_currencyData.empty()) {
57 return;
58 }
59
61 for (const auto &locale : allLocales) {
62 CurrencyData data{locale.currencySymbol(QLocale::CurrencyIsoCode), normalizeSymbol(locale.currencySymbol(QLocale::CurrencySymbol))};
63 if (data.isoCode.isEmpty()) {
64 continue;
65 }
66
67 // single letter symbols tend to be way too trigger-happy
68 if (data.symbol.size() == 1 && data.symbol[0].isLetter()) {
69 //qDebug() << "Dropping single letter symbol:" << data.symbol << data.isoCode;
70 data.symbol.clear();
71 }
72
73 s_currencyData.push_back(std::move(data));
74 }
75
76 // remove duplicates
77 const auto lessThanCurrencyData = [](const auto &lhs, const auto &rhs) {
78 return std::tie(lhs.isoCode, lhs.symbol) < std::tie(rhs.isoCode, rhs.symbol);
79 };
80 std::sort(s_currencyData.begin(), s_currencyData.end(), lessThanCurrencyData);
81 const auto compareCurrencyData = [](const auto &lhs, const auto &rhs) {
82 return lhs.isoCode == rhs.isoCode && lhs.symbol == rhs.symbol;
83 };
84 s_currencyData.erase(std::unique(s_currencyData.begin(), s_currencyData.end(), compareCurrencyData), s_currencyData.end());
85
86 // clear ambigious symbols
87 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
88 if ((*it).symbol.isEmpty()) {
89 continue;
90 }
91 bool collision = false;
92 for (auto it2 = std::next(it); it2 != s_currencyData.end(); ++it2) {
93 if (!isCollidingSymbol((*it).symbol, (*it2).symbol)) {
94 continue;
95 }
96 (*it2).symbol.clear();
97 if (!collision) {
98 qDebug() << "Ambigious currency symbol:" << (*it).symbol;
99 }
100 collision = true;
101 }
102 if (collision) {
103 (*it).symbol.clear();
104 }
105 }
106
107 // apply our own overrides over QLocale
108 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
109 const auto it2 = std::lower_bound(std::begin(currency_data_overrides), std::end(currency_data_overrides), (*it).isoCode, [](const auto &lhs, const auto &rhs) {
110 return std::strncmp(lhs.isoCode, rhs.toLatin1().constData(), 3) < 0;
111 });
112 if (it2 == std::end(currency_data_overrides) || std::strncmp((*it2).isoCode, (*it).isoCode.toLatin1().constData(), 3) != 0) {
113 continue;
114 }
115 (*it).symbol = (*it2).symbol ? QString::fromUtf8((*it2).symbol) : QString();
116 }
117}
118
// Defaulted destructor; it does not touch the static s_currencyData table.
119PriceFinder::~PriceFinder() = default;
120
121static bool isBoundaryChar(QChar c)
122{
123 return c != QLatin1Char('-') && (c.isSpace() || c.isPunct() || c.isSymbol());
124}
125
126void PriceFinder::findAll(QStringView text, std::vector<Result> &results) const
127{
128 static QRegularExpression rx(QStringLiteral(R"((?<=\s|[[:punct:]]|^)([^\d\s]{1,4})?[  ]*(\d(?:[\d,.  ]*\d)?)[  ]*([^\d\s]{1,4})?(?=\s|[[:punct:]]|$))"));
129
130 const auto prevResultSize = results.size();
131 qsizetype offset = 0;
132 while (true) {
133 const auto match = rx.matchView(text, offset);
134 if (!match.hasMatch()) {
135 break;
136 }
137 offset = match.capturedEnd(2);
138
139 const auto leadingCurrency = parseCurrency(match.capturedView(1), CurrencyPrefix);
140 const auto trailingCurrency = parseCurrency(match.capturedView(3), CurrencySuffix);
141 if ((leadingCurrency.isEmpty() && trailingCurrency.isEmpty()) || (!leadingCurrency.isEmpty() && !trailingCurrency.isEmpty() && leadingCurrency != trailingCurrency)) {
142 continue;
143 }
144
145 // additional boundary checks not covered by the regular expression
146 if (leadingCurrency.isEmpty() && match.capturedStart(2) > 0 && !isBoundaryChar(text[match.capturedStart(2) - 1])) {
147 continue;
148 }
149 if (trailingCurrency.isEmpty() && match.capturedEnd(2) < text.size() - 2 && !isBoundaryChar(text[match.capturedEnd(2)])) {
150 continue;
151 }
152
153 Result r;
154 r.start = leadingCurrency.isEmpty() ? match.capturedStart(2) : match.capturedStart();
155 r.end = trailingCurrency.isEmpty() ? match.capturedEnd(2) : match.capturedEnd();
156 r.currency = leadingCurrency.isEmpty() ? trailingCurrency : leadingCurrency;
157
158 r.value = parseValue(match.capturedView(2), r.currency);
159 if (std::isnan(r.value)) {
160 continue;
161 }
162
163 results.push_back(std::move(r));
164 }
165
166 // check for overlapping results: in those case we have to assume the entire result is invalid
167 if (results.size() <= 1 + prevResultSize) {
168 return;
169 }
170 for (auto it = results.begin() + prevResultSize; it != std::prev(results.end()); ++it) {
171 if ((*it).end >= (*std::next(it)).start) {
172 qDebug() << "overlapping price data, discarding result";
173 results.erase(results.begin() + prevResultSize, results.end());
174 return;
175 }
176 }
177}
178
179PriceFinder::Result PriceFinder::findHighest(QStringView text) const
180{
181 std::vector<Result> results;
182 findAll(text, results);
183 return highest(results);
184}
185
186bool PriceFinder::isSingleCurrency(const std::vector<Result> &results) const
187{
188 if (results.empty()) {
189 return false;
190 }
191
192 const auto isoCode = results.front().currency;
193 return std::all_of(results.begin(), results.end(), [&isoCode](const auto &r) { return r.currency == isoCode; });
194}
195
// Sanity thresholds per currency: prices above these aren't plausible and
// might instead be company capital statements in the fine print.
namespace {
struct PriceUpperLimit {
    const char currency[4];
    int threshold;
};
}

static constexpr PriceUpperLimit price_upper_limit[] = {
    { "EUR", 15000 },
};
203
204PriceFinder::Result PriceFinder::highest(std::vector<Result> &results) const
205{
206 if (!isSingleCurrency(results)) {
207 return {};
208 }
209
210 std::sort(results.begin(), results.end(), [](const auto &lhs, const auto &rhs) { return lhs.value > rhs.value; });
211 if (results.size() == 1) {
212 return results.front();
213 }
214
215 // check for extremely large differences between the max and max - 1
216 // this can be caused by the fine print containing company capital statements (common e.g. in France)
217 if (results[1].value > 0 && results[0].value / results[1].value > 1000) {
218 // TODO is this reliable enough to return results[1] here?
219 return {};
220 }
221 const auto it = std::ranges::find_if(price_upper_limit, [&results](auto t) {
222 return QLatin1StringView(t.currency, 3) == results[0].currency;
223 });
224 if (it != std::end(price_upper_limit) && (*it).threshold < results[0].value) {
225 return {};
226 }
227
228 return results.front();
229}
230
231static bool equalIgnoreDiacritics(QStringView lhs, QStringView rhs)
232{
233 if (lhs.size() != rhs.size()) {
234 return false;
235 }
236
237 for (qsizetype i = 0; i < lhs.size(); ++i) {
238 auto l = lhs[i];
239 if (l.decompositionTag() == QChar::Canonical) {
240 l = l.decomposition().at(0);
241 }
242 auto r = rhs[i];
243 if (r.decompositionTag() == QChar::Canonical) {
244 r = r.decomposition().at(0);
245 }
246 if (l != r) {
247 return false;
248 }
249 }
250
251 return true;
252}
253
254QString PriceFinder::parseCurrency(QStringView s, CurrencyPosition pos) const
255{
256 // trim remaining boundary chars
257 if (s.isEmpty()) {
258 return {};
259 }
260
261 // valid currency ISO code
262 auto isoCandidate = s;
263 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.last())) {
264 isoCandidate = isoCandidate.left(isoCandidate.size() - 1);
265 }
266 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.front())) {
267 isoCandidate = isoCandidate.mid(1);
268 }
269 if (isoCandidate.size() == 3) {
270 const auto it = std::lower_bound(s_currencyData.begin(), s_currencyData.end(), isoCandidate, [](const auto &lhs, QStringView rhs) { return lhs.isoCode < rhs; });
271 if (it != s_currencyData.end() && (*it).isoCode == isoCandidate) {
272 return (*it).isoCode;
273 }
274 }
275
276 // currency symbol
277 const auto symbol = normalizeSymbol(s);
278 // exact match: we know there is only ever going to be one (see ctor)
279 const auto it = std::find_if(s_currencyData.begin(), s_currencyData.end(), [&symbol](const auto &data) { return data.symbol == symbol; });
280 if (it != s_currencyData.end())
281 return (*it).isoCode;
282
283 // partial match: needs to be unique
284 QString isoCode;
285 for (const auto &data : s_currencyData) {
286 if (data.symbol.isEmpty()) {
287 continue;
288 }
289
290 // match disregarding diacritics
291 if (equalIgnoreDiacritics(data.symbol, symbol)) {
292 if (!isoCode.isEmpty()) {
293 return {};
294 }
295 isoCode = data.isoCode;
296 }
297
298 // prefix or suffix match
299 if (pos == CurrencyPrefix) {
300 if (symbol.size() <= data.symbol.size() || !symbol.endsWith(data.symbol) || !isBoundaryChar(symbol.at(symbol.size() - data.symbol.size() - 1))) {
301 continue;
302 }
303 } else {
304 if (symbol.size() <= data.symbol.size() || !symbol.startsWith(data.symbol) || !isBoundaryChar(symbol.at(data.symbol.size()))) {
305 continue;
306 }
307 }
308 if (!isoCode.isEmpty()) {
309 return {};
310 }
311 isoCode = data.isoCode;
312 }
313 return isoCode;
314}
315
316double PriceFinder::parseValue(QStringView s, const QString &isoCode) const
317{
318 if (s.isEmpty() || !s[0].isDigit() || !s[s.size() - 1].isDigit()) {
319 return NAN;
320 }
321
322 // find potential decimal separator
323 QChar decimalSeparator;
324 qsizetype decimalSeparatorIndex = -1;
325 for (qsizetype i = s.size() - 1; i > 0; --i) {
326 if (s[i].isDigit()) {
327 continue;
328 }
329 if (!s[i].isSpace()) {
330 decimalSeparator = s[i];
331 decimalSeparatorIndex = i;
332 }
333 break;
334 }
335
336 // identify/validate group separators
337 QChar groupSeparator;
338 qsizetype lastGroupSeparatorIndex = -1;
339 for (qsizetype i = 0; i < s.size(); ++i) {
340 if (s[i].isDigit()) {
341 continue;
342 }
343 if (lastGroupSeparatorIndex > 0 && i - lastGroupSeparatorIndex != 4) { // separator interval is wrong
344 return NAN;
345 }
346 if (decimalSeparatorIndex > 0 && i == decimalSeparatorIndex) { // found the suspected decimal separator
347 break;
348 }
349 if (!groupSeparator.isNull() && s[i] != groupSeparator) { // inconsistent separators
350 return NAN;
351 }
352
353 lastGroupSeparatorIndex = i;
354 groupSeparator = s[i];
355 }
356
357 // we found both and they are the same: has to be the group separator
358 if (!decimalSeparator.isNull() && !groupSeparator.isNull() && decimalSeparator == groupSeparator) {
359 if ((s.size() - decimalSeparatorIndex) != 4) {
360 return NAN;
361 }
362 decimalSeparator = {};
363 decimalSeparatorIndex = -1;
364 }
365
366 // we found a decimal separator: verify the number of decimals is consistent with the currency's subdivision
367 // see https://en.wikipedia.org/wiki/List_of_circulating_currencies
368 if (!decimalSeparator.isNull()) {
369 const auto decimalCount = s.size() - decimalSeparatorIndex - 1;
370 const auto expectedDecimalCount = PriceUtil::decimalCount(isoCode);
371
372 // subdivision x1000 is ambigious if we don't have a group separator
373 if (decimalCount == expectedDecimalCount && decimalCount == 3 && groupSeparator.isNull()) {
374 return NAN;
375 }
376
377 // if decimal count is 3, assume group separator
378 else if (decimalCount != expectedDecimalCount && decimalCount == 3) {
379 if (groupSeparator.isNull()) {
380 groupSeparator = decimalSeparator;
381 decimalSeparator = {};
382 } else {
383 return NAN;
384 }
385 }
386
387 else if (decimalCount > expectedDecimalCount) {
388 return NAN;
389 }
390 }
391
392 // strip group separators, replace decimal separator
393 auto normalized = s.toString();
394 if (!groupSeparator.isNull()) {
395 normalized.remove(groupSeparator);
396 }
397 if (!decimalSeparator.isNull()) {
398 normalized.replace(decimalSeparator, QLatin1Char('.'));
399 }
400
401 bool ok = false;
402 const auto value = normalized.toDouble(&ok);
403 if (!ok) {
404 return NAN;
405 }
406 return value;
407}
static int decimalCount(QStringView currency)
Returns the number of decimals to represent the sub-unit of currency.
Definition priceutil.cpp:92
KCOREADDONS_EXPORT Result match(QStringView pattern, QStringView str)
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
bool isNull() const const
bool isPunct(char32_t ucs4)
bool isSpace(char32_t ucs4)
bool isSymbol(char32_t ucs4)
QList< QLocale > matchingLocales(QLocale::Language language, QLocale::Script script, QLocale::Territory territory)
QString fromUtf8(QByteArrayView str)
void push_back(QChar ch)
void reserve(qsizetype size)
QChar back() const const
bool isEmpty() const const
qsizetype size() const const
bool startsWith(QChar ch) const const
QString toString() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Mar 28 2025 11:59:50 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.