KItinerary

htmldocumentprocessor.cpp
1/*
2 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "htmldocumentprocessor.h"
8
9#include "genericpriceextractorhelper_p.h"
10#include "logging.h"
11#include "stringutil.h"
12#include "json/jsonld.h"
13
14#include <KItinerary/ExtractorDocumentNodeFactory>
15#include <KItinerary/ExtractorEngine>
16#include <KItinerary/ExtractorResult>
17#include <KItinerary/HtmlDocument>
18#include <KItinerary/JsonLdDocument>
19
20#include <QJsonArray>
21#include <QJsonDocument>
22#include <QJsonObject>
23#include <QJSEngine>
24#include <QJSValue>
25#include <QString>
26#include <QUrl>
27
28#include <cmath>
29
30using namespace KItinerary;
31
32Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::HtmlDocument>)
33
34bool HtmlDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const
35{
36 return StringUtil::startsWithIgnoreSpace(encodedData, "<") ||
37 fileName.endsWith(QLatin1StringView(".html"), Qt::CaseInsensitive) ||
38 fileName.endsWith(QLatin1StringView(".htm"), Qt::CaseInsensitive);
39}
40
41static ExtractorDocumentNode nodeFromHtml(HtmlDocument *html)
42{
43 if (!html || html->root().firstChild().isNull()) {
44 return {};
45 }
46
48 node.setContent<Internal::OwnedPtr<HtmlDocument>>(html);
49 return node;
50}
51
53{
54 return nodeFromHtml(HtmlDocument::fromData(encodedData));
55}
56
// Create a document node from an already decoded data type.
// A QString payload is parsed with HtmlDocument::fromString() and wrapped via nodeFromHtml().
// NOTE(review): this listing lacks both the function signature (expected to take
// `const QVariant &decodedData`) and the statement that handles non-QString content
// after the if-block - restore them from the upstream file before building.
58{
59 if (decodedData.userType() == QMetaType::QString) {
60 return nodeFromHtml(HtmlDocument::fromString(decodedData.toString()));
61 }
63}
64
66{
67 const auto html = node.content<HtmlDocument*>();
68
69 // inline images
70 expandElementRecursive(node, html->root(), engine);
71
72 // plain text fallback node
73 auto fallback = engine->documentNodeFactory()->createNode(html->root().recursiveContent(), u"text/plain");
74 node.appendChild(fallback);
75}
76
77static bool isJsonLdTag(const HtmlElement &elem)
78{
79 return elem.name() == QLatin1StringView("script") &&
80 elem.attribute(QStringLiteral("type")) ==
81 QLatin1String("application/ld+json");
82}
83
84static QByteArray fixupJson(const QByteArray &data)
85{
86 if (data.isEmpty()) {
87 return {};
88 }
89 auto output(data);
90
91 // Eurowings doesn't put a comma between objects in top-level arrays...
92 output.replace("}{", "},{");
93
94 // Volotea doesn't put square brackets in top level arrays...
95 if (output.front() != '[' && output.back() != ']') {
96 output.prepend("[");
97 output.append("]");
98 }
99
100 // Eventbrite adds commas where there shouldn't be one...
101 for (qsizetype idx = output.indexOf("\",\n"); idx > 0 && idx + 3 < output.size(); idx = output.indexOf("\",\n", idx)) {
102 const auto comma = idx + 1;
103 idx += 3;
104 while (idx < output.size() && std::isspace(static_cast<unsigned char>(output[idx]))) {
105 ++idx;
106 }
107 if (idx < output.size() && output[idx] == '}') {
108 output[comma] = ' ';
109 }
110 }
111
112 // Airbnb applies XML entity encoding...
113 output.replace("&quot;", "\"");
114
115 return output;
116}
117
118static void parseJson(const QByteArray &data, QJsonArray &result)
119{
121 auto jsonDoc = QJsonDocument::fromJson(data, &error);
122 if (jsonDoc.isNull()) {
123 if (error.error != QJsonParseError::NoError) {
124 // try to fix up common JSON encoding errors
125 jsonDoc = QJsonDocument::fromJson(fixupJson(data));
126 }
127 if (jsonDoc.isNull()) {
128 qCDebug(Log).noquote() << data;
129 qCDebug(Log) << error.errorString() << "at offset" << error.offset;
130 return;
131 }
132 }
133 if (jsonDoc.isArray()) {
134 const auto jsonArray = jsonDoc.array();
135 std::copy(jsonArray.begin(), jsonArray.end(), std::back_inserter(result));
136 } else if (jsonDoc.isObject()) {
137 result.push_back(jsonDoc.object());
138 }
139}
140
141static QString valueForItemProperty(const HtmlElement &elem)
142{
143 // TODO see https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop#Values
144 const auto elemName = elem.name();
145 QString v;
146 if (elemName == QLatin1StringView("meta")) {
147 v = elem.attribute(QStringLiteral("content"));
148 } else if (elemName == QLatin1StringView("time")) {
149 v = elem.attribute(QStringLiteral("datetime"));
150 } else if (elemName == QLatin1StringView("link") ||
151 elemName == QLatin1Char('a') ||
152 elemName == QLatin1String("img")) {
153 if (elem.hasAttribute(QStringLiteral("href"))) {
154 v = elem.attribute(QStringLiteral("href"));
155 } else if (elem.hasAttribute(QStringLiteral("content"))) {
156 v = elem.attribute(QStringLiteral("content"));
157 } else if (elem.hasAttribute(QStringLiteral("src"))) {
158 v = elem.attribute(QStringLiteral("src"));
159 } else {
160 v = elem.recursiveContent();
161 }
162 } else {
163 v = elem.recursiveContent();
164 }
165
166 return v;
167}
168
169static void insertProperties(QJsonObject &obj, const QString &prop, const QJsonValue &val)
170{
171 // multiple properties can be specified at once, as a space-separated list
172 const auto props = prop.split(QLatin1Char(' '), Qt::SkipEmptyParts);
173 for (const auto &p : props) {
174 auto valRef = obj[p];
175 if (valRef.isUndefined() || valRef.isNull()) {
176 obj.insert(p, val);
177 // convert multiple repeated properties into an array
178 } else if (valRef.isArray()) {
179 auto array = valRef.toArray();
180 array.push_back(val);
181 valRef = array;
182 } else {
183 QJsonArray array({valRef, val});
184 valRef = array;
185 }
186 }
187}
188
// Walk the children of elem and collect Microdata properties into obj.
// Nested items that carry no itemprop, as well as JSON-LD found inside the tree,
// are appended to result instead of being attached to obj.
// NOTE(review): this listing is missing the line that opens the if/else chain in the
// loop body (directly before `QJsonObject subObj;`). Presumably it tests `type`, similar
// to the itemtype check in extractRecursive() - confirm against the upstream file.
189static void parseMicroData(const HtmlElement &elem, QJsonObject &obj, QJsonArray &result)
190{
191 auto child = elem.firstChild();
192 while (!child.isNull()) {
193 const auto prop = child.attribute(QStringLiteral("itemprop"));
194 const auto type = child.attribute(QStringLiteral("itemtype"));
// nested item: parse it into its own object, typed by the last path segment of itemtype
196 QJsonObject subObj;
197 parseMicroData(child, subObj, result);
198 const QUrl typeUrl(type);
199 subObj.insert(QStringLiteral("@type"), typeUrl.fileName());
200 if (prop.isEmpty()) {
201 result.push_back(subObj); // stand-alone object that just happens to be nested
202 } else {
203 insertProperties(obj, prop, subObj);
204 }
// plain property: take its value from the element itself
205 } else if (!prop.isEmpty()) {
206 insertProperties(obj, prop, valueForItemProperty(child));
207 // Maybe there is more JSON-LD inside this microdata tree
208 } else if (isJsonLdTag(child)) {
209 parseJson(child.content().toUtf8(), result);
210 } else {
211 // skip intermediate nodes without Microdata annotations
212 parseMicroData(child, obj, result);
213 }
214 child = child.nextSibling();
215 }
216}
217
218static void extractRecursive(const HtmlElement &elem, QJsonArray &result)
219{
220 // JSON-LD
221 if (isJsonLdTag(elem)) {
222 parseJson(elem.content().toUtf8(), result);
223 return;
224 }
225
226 // Microdata
227 const auto itemType = elem.attribute(QStringLiteral("itemtype"));
228 if (JsonLd::isSchemaOrgNamespace(itemType)) {
229 QJsonObject obj;
230 parseMicroData(elem, obj, result);
231 if (obj.isEmpty()) {
232 return;
233 }
234
235 const QUrl typeUrl(itemType);
236 obj.insert(QStringLiteral("@type"), typeUrl.fileName());
237
238 const auto itemProp = elem.attribute(QStringLiteral("itemprop"));
239 if (!itemProp.isEmpty() && !result.isEmpty()) {
240 // this is likely a child of our preceding sibling, but broken XML put it here
241 auto parent = result.last().toObject();
242 parent.insert(itemProp, obj);
243 result[result.size() - 1] = parent;
244 } else {
245 obj.insert(QStringLiteral("@context"), QStringLiteral("http://schema.org"));
246 result.push_back(obj);
247 }
248 return;
249 }
250
251 // recurse otherwise
252 auto child = elem.firstChild();
253 while (!child.isNull()) {
254 extractRecursive(child, result);
255 child = child.nextSibling();
256 }
257}
258
259void HtmlDocumentProcessor::preExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
260{
261 auto doc = node.content<HtmlDocument*>();
262 Q_ASSERT(doc);
263
264 if (!doc->root().isNull()) {
265 QJsonArray result;
266 extractRecursive(doc->root(), result);
267 node.addResult(result);
268 }
269}
270
271void HtmlDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
272{
273 if (node.childNodes().empty() || node.result().isEmpty()) {
274 return;
275 }
276
277 const QString text = node.childNodes().back().content<QString>();
278 GenericPriceExtractorHelper::postExtract(text, node);
279}
280
285
287{
288 destroyIfOwned<HtmlDocument>(node);
289}
290
291void HtmlDocumentProcessor::expandElementRecursive(ExtractorDocumentNode &node, const HtmlElement &elem, const ExtractorEngine *engine) const
292{
293 if (elem.name() == QLatin1StringView("img")) {
294 const auto src = elem.attribute(QLatin1StringView("src"));
295 if (src.startsWith(QLatin1StringView("data:"))) {
296 expandDataUrl(node, src, engine);
297 }
298 }
299
300 auto child = elem.firstChild();
301 while (!child.isNull()) {
302 expandElementRecursive(node, child, engine);
303 child = child.nextSibling();
304 }
305}
306
307void HtmlDocumentProcessor::expandDataUrl(ExtractorDocumentNode &node, QStringView data, const ExtractorEngine *engine) const
308{
309 const auto idx = data.indexOf(QLatin1Char(','));
310 if (idx < 0) {
311 return;
312 }
313 const auto header = data.mid(5, idx - 5);
314 const auto headerItems = header.split(QLatin1Char(';'));
315 if (headerItems.isEmpty()) {
316 return;
317 }
318
319 if (headerItems.front() != QLatin1StringView("image/png")) {
320 return;
321 }
322
323 auto imgData = data.mid(idx).toUtf8();
324 if (headerItems.back() == QLatin1StringView("base64")) {
325 imgData = QByteArray::fromBase64(imgData.trimmed());
326 }
327
328 auto child = engine->documentNodeFactory()->createNode(imgData, {}, headerItems.front());
329 node.appendChild(child);
330}
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
A node in the extracted document object tree.
QJsonArray result
Result access for QJSEngine.
void appendChild(ExtractorDocumentNode &child)
Add another child node.
QJSValue content
The decoded content of this node.
void addResult(ExtractorResult &&result)
Add additional results from an extraction step.
QVariantList childNodes
Child nodes, for QJSEngine access.
void setContent(const QVariant &content)
Set decoded content.
virtual ExtractorDocumentNode createNodeFromContent(const QVariant &decodedData) const
Create a document node from an already decoded data type.
Semantic data extraction engine.
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
Processor for HTML documents.
void preExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Called before extractors are applied to node.
void postExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Called after extractors have been applied to node.
void destroyNode(ExtractorDocumentNode &node) const override
Destroys type-specific data in node.
ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override
Create a document node from raw data.
void expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Create child nodes for node, as far as that's necessary for this document type.
ExtractorDocumentNode createNodeFromContent(const QVariant &decodedData) const override
Create a document node from an already decoded data type.
QJSValue contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const override
Create a QJSValue for the node content.
HTML document for extraction.
static HtmlDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a HtmlDocument from the given raw data.
static HtmlDocument * fromString(const QString &data, QObject *parent=nullptr)
Creates a HtmlDocument from a given (unicode) string.
HTML document element.
Q_INVOKABLE QString attribute(const QString &attr) const
Value of the attribute attr.
bool hasAttribute(const QString &attr) const
Checks whether an attribute with name attr exists.
bool isSchemaOrgNamespace(QStringView uri)
Checks whether uri is in the http://schema.org namespace.
bool startsWithIgnoreSpace(const QByteArray &data, const char *pattern)
Same as QByteArray::startsWith, but ignoring leading whitespace.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
void error(QWidget *parent, const QString &text, const QString &title, const KGuiItem &buttonOk, Options options=Notify)
QByteArray fromBase64(const QByteArray &base64, Base64Options options)
bool isEmpty() const const
QJSValue toScriptValue(const T &value)
bool isEmpty() const const
QJsonValue last() const const
void push_back(const QJsonValue &value)
qsizetype size() const const
QJsonDocument fromJson(const QByteArray &json, QJsonParseError *error)
iterator insert(QLatin1StringView key, const QJsonValue &value)
bool isEmpty() const const
QJsonObject toObject() const const
bool isEmpty() const const
QStringList split(QChar sep, Qt::SplitBehavior behavior, Qt::CaseSensitivity cs) const const
QByteArray toUtf8() const const
QStringView mid(qsizetype start, qsizetype length) const const
qsizetype indexOf(QChar c, qsizetype from, Qt::CaseSensitivity cs) const const
QList< QStringView > split(QChar sep, Qt::SplitBehavior behavior, Qt::CaseSensitivity cs) const const
QByteArray toUtf8() const const
CaseInsensitive
SkipEmptyParts
QString toString() const const
int userType() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:14:49 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.