KItinerary

htmldocumentprocessor.cpp
1/*
2 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "htmldocumentprocessor.h"
8
9#include "genericpriceextractorhelper_p.h"
10#include "logging.h"
11#include "stringutil.h"
12#include "json/jsonld.h"
13
14#include <KItinerary/ExtractorDocumentNodeFactory>
15#include <KItinerary/ExtractorEngine>
16#include <KItinerary/ExtractorResult>
17#include <KItinerary/HtmlDocument>
18#include <KItinerary/JsonLdDocument>
19
20#include <QJsonArray>
21#include <QJsonDocument>
22#include <QJsonObject>
23#include <QJSEngine>
24#include <QJSValue>
25#include <QString>
26#include <QUrl>
27
28#include <cmath>
29
30using namespace Qt::Literals;
31using namespace KItinerary;
32
33Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::HtmlDocument>)
34
35bool HtmlDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const
36{
37 return StringUtil::startsWithIgnoreSpace(encodedData, "<") ||
38 fileName.endsWith(QLatin1StringView(".html"), Qt::CaseInsensitive) ||
39 fileName.endsWith(QLatin1StringView(".htm"), Qt::CaseInsensitive);
40}
41
42static ExtractorDocumentNode nodeFromHtml(HtmlDocument *html)
43{
44 if (!html || html->root().firstChild().isNull()) {
45 return {};
46 }
47
49 node.setContent<Internal::OwnedPtr<HtmlDocument>>(html);
50 return node;
51}
52
54{
55 return nodeFromHtml(HtmlDocument::fromData(encodedData));
56}
57
59{
60 if (decodedData.userType() == QMetaType::QString) {
61 return nodeFromHtml(HtmlDocument::fromString(decodedData.toString()));
62 }
64}
65
67{
68 const auto html = node.content<HtmlDocument*>();
69
70 // inline images
71 expandElementRecursive(node, html->root(), engine);
72
73 // plain text fallback node
74 auto fallback = engine->documentNodeFactory()->createNode(html->root().recursiveContent(), u"text/plain");
75 node.appendChild(fallback);
76}
77
78static bool isJsonLdTag(const HtmlElement &elem)
79{
80 return elem.name() == QLatin1StringView("script") &&
81 elem.attribute(QStringLiteral("type")) ==
82 QLatin1StringView("application/ld+json");
83}
84
85static QByteArray fixupJson(const QByteArray &data)
86{
87 if (data.isEmpty()) {
88 return {};
89 }
90 auto output(data);
91
92 // Eurowings doesn't put a comma between objects in top-level arrays...
93 output.replace("}{", "},{");
94
95 // Volotea doesn't put square brackets in top level arrays...
96 if (output.front() != '[' && output.back() != ']') {
97 output.prepend("[");
98 output.append("]");
99 }
100
101 // Eventbrite adds commas where there shouldn't be one...
102 for (qsizetype idx = output.indexOf("\",\n"); idx > 0 && idx + 3 < output.size(); idx = output.indexOf("\",\n", idx)) {
103 const auto comma = idx + 1;
104 idx += 3;
105 while (idx < output.size() && std::isspace(static_cast<unsigned char>(output[idx]))) {
106 ++idx;
107 }
108 if (idx < output.size() && output[idx] == '}') {
109 output[comma] = ' ';
110 }
111 }
112
113 // Airbnb applies XML entity encoding...
114 output.replace("&quot;", "\"");
115
116 return output;
117}
118
119static void parseJson(const QByteArray &data, QJsonArray &result)
120{
121 QJsonParseError error;
122 auto jsonDoc = QJsonDocument::fromJson(data, &error);
123 if (jsonDoc.isNull()) {
124 if (error.error != QJsonParseError::NoError) {
125 // try to fix up common JSON encoding errors
126 jsonDoc = QJsonDocument::fromJson(fixupJson(data));
127 }
128 if (jsonDoc.isNull()) {
129 qCDebug(Log).noquote() << data;
130 qCDebug(Log) << error.errorString() << "at offset" << error.offset;
131 return;
132 }
133 }
134 if (jsonDoc.isArray()) {
135 const auto jsonArray = jsonDoc.array();
136 std::copy(jsonArray.begin(), jsonArray.end(), std::back_inserter(result));
137 } else if (jsonDoc.isObject()) {
138 result.push_back(jsonDoc.object());
139 }
140}
141
142static QString valueForItemProperty(const HtmlElement &elem)
143{
144 // TODO see https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop#Values
145 const auto elemName = elem.name();
146 QString v;
147 if (elemName == QLatin1StringView("meta")) {
148 v = elem.attribute(QStringLiteral("content"));
149 } else if (elemName == QLatin1StringView("time")) {
150 v = elem.attribute(QStringLiteral("datetime"));
151 } else if (elemName == QLatin1StringView("link") ||
152 elemName == QLatin1Char('a') ||
153 elemName == QLatin1StringView("img")) {
154 if (elem.hasAttribute(QStringLiteral("href"))) {
155 v = elem.attribute(QStringLiteral("href"));
156 } else if (elem.hasAttribute(QStringLiteral("content"))) {
157 v = elem.attribute(QStringLiteral("content"));
158 } else if (elem.hasAttribute(QStringLiteral("src"))) {
159 v = elem.attribute(QStringLiteral("src"));
160 } else {
161 v = elem.recursiveContent();
162 }
163 } else {
164 v = elem.recursiveContent();
165 }
166
167 return v;
168}
169
170static void insertProperties(QJsonObject &obj, const QString &prop, const QJsonValue &val)
171{
172 // multiple properties can be specified at once, as a space-separated list
173 const auto props = prop.split(QLatin1Char(' '), Qt::SkipEmptyParts);
174 for (const auto &p : props) {
175 auto valRef = obj[p];
176 if (valRef.isUndefined() || valRef.isNull()) {
177 obj.insert(p, val);
178 // convert multiple repeated properties into an array
179 } else if (valRef.isArray()) {
180 auto array = valRef.toArray();
181 array.push_back(val);
182 valRef = array;
183 } else {
184 QJsonArray array({valRef, val});
185 valRef = array;
186 }
187 }
188}
189
190static void parseMicroData(const HtmlElement &elem, QJsonObject &obj, QJsonArray &result)
191{
192 auto child = elem.firstChild();
193 while (!child.isNull()) {
194 const auto prop = child.attribute(QStringLiteral("itemprop"));
195 const auto type = child.attribute(QStringLiteral("itemtype"));
197 QJsonObject subObj;
198 parseMicroData(child, subObj, result);
199 const QUrl typeUrl(type);
200 subObj.insert(QStringLiteral("@type"), typeUrl.fileName());
201 if (prop.isEmpty()) {
202 result.push_back(subObj); // stand-alone object that just happens to be nested
203 } else {
204 insertProperties(obj, prop, subObj);
205 }
206 } else if (!prop.isEmpty()) {
207 insertProperties(obj, prop, valueForItemProperty(child));
208 // Maybe there is more JSON-LD inside this microdata tree
209 } else if (isJsonLdTag(child)) {
210 parseJson(child.content().toUtf8(), result);
211 } else {
212 // skip intermediate nodes without Microdata annotations
213 parseMicroData(child, obj, result);
214 }
215 child = child.nextSibling();
216 }
217}
218
219static void extractRecursive(const HtmlElement &elem, QJsonArray &result)
220{
221 // JSON-LD
222 if (isJsonLdTag(elem)) {
223 parseJson(elem.content().toUtf8(), result);
224 return;
225 }
226
227 // Microdata
228 const auto itemType = elem.attribute(QStringLiteral("itemtype"));
229 if (JsonLd::isSchemaOrgNamespace(itemType)) {
230 QJsonObject obj;
231 parseMicroData(elem, obj, result);
232 if (obj.isEmpty()) {
233 return;
234 }
235
236 const QUrl typeUrl(itemType);
237 obj.insert(QStringLiteral("@type"), typeUrl.fileName());
238
239 const auto itemProp = elem.attribute(QStringLiteral("itemprop"));
240 if (!itemProp.isEmpty() && !result.isEmpty()) {
241 // this is likely a child of our preceding sibling, but broken XML put it here
242 auto parent = result.last().toObject();
243 parent.insert(itemProp, obj);
244 result[result.size() - 1] = parent;
245 } else {
246 obj.insert(QStringLiteral("@context"), QStringLiteral("http://schema.org"));
247 result.push_back(obj);
248 }
249 return;
250 }
251
252 // recurse otherwise
253 auto child = elem.firstChild();
254 while (!child.isNull()) {
255 extractRecursive(child, result);
256 child = child.nextSibling();
257 }
258}
259
260void HtmlDocumentProcessor::preExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
261{
262 auto doc = node.content<HtmlDocument*>();
263 Q_ASSERT(doc);
264
265 if (!doc->root().isNull()) {
266 QJsonArray result;
267 extractRecursive(doc->root(), result);
268 node.addResult(result);
269 }
270}
271
272void HtmlDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
273{
274 if (node.childNodes().empty() || node.result().isEmpty()) {
275 return;
276 }
277
278 const QString text = node.childNodes().back().content<QString>();
279 GenericPriceExtractorHelper::postExtract(text, node);
280}
281
286
288{
289 destroyIfOwned<HtmlDocument>(node);
290}
291
292void HtmlDocumentProcessor::expandElementRecursive(ExtractorDocumentNode &node, const HtmlElement &elem, const ExtractorEngine *engine) const
293{
294 if (elem.name() == "img"_L1) {
295 const auto src = elem.attribute("src"_L1);
296 if (src.startsWith("data:"_L1)) {
297 expandDataUrl(node, src, engine);
298 }
299 }
300
301 auto child = elem.firstChild();
302 while (!child.isNull()) {
303 expandElementRecursive(node, child, engine);
304 child = child.nextSibling();
305 }
306}
307
308void HtmlDocumentProcessor::expandDataUrl(ExtractorDocumentNode &node, QStringView data, const ExtractorEngine *engine) const
309{
310 const auto idx = data.indexOf(QLatin1Char(','));
311 if (idx < 0) {
312 return;
313 }
314 const auto header = data.mid(5, idx - 5);
315 const auto headerItems = header.split(QLatin1Char(';'));
316 if (headerItems.isEmpty()) {
317 return;
318 }
319
320 if (headerItems.front() != QLatin1StringView("image/png")) {
321 return;
322 }
323
324 auto imgData = data.mid(idx).toUtf8();
325 if (headerItems.back() == QLatin1StringView("base64")) {
326 imgData = QByteArray::fromBase64(imgData.trimmed());
327 }
328
329 auto child = engine->documentNodeFactory()->createNode(imgData, {}, headerItems.front());
330 node.appendChild(child);
331}
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
A node in the extracted document object tree.
QJsonArray result
Result access for QJSEngine.
void appendChild(ExtractorDocumentNode &child)
Add another child node.
QJSValue content
The decoded content of this node.
void addResult(ExtractorResult &&result)
Add additional results from an extraction step.
QVariantList childNodes
Child nodes, for QJSEngine access.
void setContent(const QVariant &content)
Set decoded content.
virtual ExtractorDocumentNode createNodeFromContent(const QVariant &decodedData) const
Create a document node from an already decoded data type.
Semantic data extraction engine.
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
Processor for HTML documents.
void preExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Called before extractors are applied to node.
void postExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Called after extractors have been applied to node.
void destroyNode(ExtractorDocumentNode &node) const override
Destroys type-specific data in node.
ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override
Create a document node from raw data.
void expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Create child nodes for node, as far as that's necessary for this document type.
ExtractorDocumentNode createNodeFromContent(const QVariant &decodedData) const override
Create a document node from an already decoded data type.
QJSValue contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const override
Create a QJSValue for the node content.
bool canHandleData(const QByteArray &encodedData, QStringView fileName) const override
Fast check whether the given encoded data can possibly be processed by this instance.
HTML document for extraction.
static HtmlDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a HtmlDocument from the given raw data.
static HtmlDocument * fromString(const QString &data, QObject *parent=nullptr)
Creates a HtmlDocument from a given (unicode) string.
HTML document element.
Q_INVOKABLE QString attribute(const QString &attr) const
Value of the attribute attr.
bool hasAttribute(const QString &attr) const
Checks whether an attribute with name attr exists.
bool isSchemaOrgNamespace(QStringView uri)
Checks whether uri is in the http://schema.org namespace.
bool startsWithIgnoreSpace(const QByteArray &data, const char *pattern)
Same as QByteArray::startsWith, but ignoring leading whitespaces.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
void error(QWidget *parent, const QString &text, const QString &title, const KGuiItem &buttonOk, Options options=Notify)
QByteArray fromBase64(const QByteArray &base64, Base64Options options)
bool isEmpty() const
QJSValue toScriptValue(const T &value)
bool isEmpty() const
QJsonValue last() const
void push_back(const QJsonValue &value)
qsizetype size() const
QJsonDocument fromJson(const QByteArray &json, QJsonParseError *error)
iterator insert(QLatin1StringView key, const QJsonValue &value)
bool isEmpty() const
QJsonObject toObject() const
bool isEmpty() const
QStringList split(QChar sep, Qt::SplitBehavior behavior, Qt::CaseSensitivity cs) const
QByteArray toUtf8() const
QStringView mid(qsizetype start, qsizetype length) const
qsizetype indexOf(QChar c, qsizetype from, Qt::CaseSensitivity cs) const
QList< QStringView > split(QChar sep, Qt::SplitBehavior behavior, Qt::CaseSensitivity cs) const
QByteArray toUtf8() const
CaseInsensitive
SkipEmptyParts
QString toString() const
int userType() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri May 2 2025 11:54:59 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.