KItinerary

htmldocument.cpp
1/*
2 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "config-kitinerary.h"
8#include "htmldocument.h"
9
10#include <QDebug>
11#include <QVariant>
12
13#if HAVE_LIBXML2
14#include <libxml/HTMLparser.h>
15#include <libxml/xpath.h>
16#endif
17
18using namespace KItinerary;
19
namespace KItinerary {
/** Private data of HtmlDocument: the parsed libxml2 tree and the raw bytes it was parsed from. */
class HtmlDocumentPrivate {
public:
    HtmlDocumentPrivate() = default;
    // The destructor releases m_doc, so a copy would end up freeing the same tree twice.
    HtmlDocumentPrivate(const HtmlDocumentPrivate&) = delete;
    HtmlDocumentPrivate& operator=(const HtmlDocumentPrivate&) = delete;

#if HAVE_LIBXML2
    ~HtmlDocumentPrivate() {
        // xmlFreeDoc() accepts nullptr, so a document that never got a tree is fine here.
        xmlFreeDoc(m_doc);
    }

    // Has to start out as nullptr: HtmlDocument's constructor creates this object before
    // any tree is assigned, and both the destructor above and HtmlDocument::root() read it.
    xmlDocPtr m_doc = nullptr;
    QByteArray m_rawData;
#endif
};
}
33
34HtmlElement::HtmlElement()
35 : d(nullptr)
36{
37}
38
39HtmlElement::~HtmlElement() = default;
40
#if HAVE_LIBXML2
/** Wraps the libxml2 node @p dd; passing nullptr yields a null element. */
HtmlElement::HtmlElement(xmlNode *dd)
    : d{dd}
{
}
#endif
47
48HtmlDocument::HtmlDocument(QObject *parent)
49 : QObject(parent)
50 , d(new HtmlDocumentPrivate)
51{
52}
53
54HtmlDocument::~HtmlDocument() = default;
55
56bool HtmlElement::isNull() const
57{
58 return d == nullptr;
59}
60
61QString HtmlElement::name() const
62{
63#if HAVE_LIBXML2
64 if (d) {
65 return QString::fromUtf8(reinterpret_cast<const char*>(d->name));
66 }
67#endif
68 return {};
69}
70
72{
73#if HAVE_LIBXML2
74 if (d) {
75 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlGetProp(d, reinterpret_cast<const xmlChar*>(attr.toUtf8().constData())), xmlFree);
76 return QString::fromUtf8(reinterpret_cast<const char*>(val.get()));
77 }
78#else
79 Q_UNUSED(attr)
80#endif
81 return {};
82}
83
84HtmlElement HtmlElement::parent() const
85{
86#if HAVE_LIBXML2
87 if (d && d->parent && d->parent->type == XML_ELEMENT_NODE) {
88 return HtmlElement(d->parent);
89 }
90#endif
91 return {};
92}
93
94HtmlElement HtmlElement::firstChild() const
95{
96#if HAVE_LIBXML2
97 if (d) {
98 return HtmlElement(xmlFirstElementChild(d));
99 }
100#endif
101 return {};
102}
103
104HtmlElement HtmlElement::nextSibling() const
105{
106#if HAVE_LIBXML2
107 if (d) {
108 return HtmlElement(xmlNextElementSibling(d));
109 }
110#endif
111 return {};
112}
113
114#if HAVE_LIBXML2
115static void normalizingAppend(QString &out, const QString &in)
116{
117 if (in.isEmpty()) {
118 return;
119 }
120
121 const bool needsLeadingSpace = !out.isEmpty() && !out.back().isSpace();
122 out.reserve(out.size() + in.size() + (needsLeadingSpace ? 1 : 0));
123 if (needsLeadingSpace) {
125 }
126
127 // convert non-breaking spaces and windows line break to normal ones, technically not correct
128 // but way too often this confuses our regular expressions
129 bool leadingTrim = true;
130 bool foundCR = false;
131 for (const auto c : in) {
132 // trim leading spaces while we are at it
133 if (leadingTrim && c.isSpace()) {
134 continue;
135 }
136 leadingTrim = false;
137
138 // normalize CRs
139 if (c == QChar::CarriageReturn) {
140 foundCR = true;
141 continue;
142 }
143 if (foundCR && c != QChar::LineFeed) {
145 }
146 foundCR = false;
147
148 // normalize space variations
149 if (c == QChar::Nbsp) {
151 } else {
152 out.push_back(c);
153 }
154 }
155}
156
157static void normalizingLineBreakAppend(QString &s)
158{
159 s = s.trimmed();
161}
162#endif
163
164QString HtmlElement::content() const
165{
166#if HAVE_LIBXML2
167 if (!d) {
168 return {};
169 }
170
171 QString s;
172 auto node = d->children;
173 while (node) {
174 switch (node->type) {
175 case XML_TEXT_NODE:
176 case XML_CDATA_SECTION_NODE:
177 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content)));
178 break;
179 case XML_ENTITY_REF_NODE:
180 {
181 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
182 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(val.get())));
183 break;
184 }
185 case XML_ELEMENT_NODE:
186 if (qstricmp(reinterpret_cast<const char*>(node->name), "br") == 0) {
187 s += QLatin1Char('\n');
188 }
189 break;
190 default:
191 break;
192
193 }
194 node = node->next;
195 }
196
197 return s.trimmed(); // trailing trim can be done without copying
198#endif
199 return {};
200}
201
202#if HAVE_LIBXML2
203static void recursiveContent(_xmlNode *node, QString &s)
204{
205 switch (node->type) {
206 case XML_TEXT_NODE:
207 case XML_CDATA_SECTION_NODE:
208 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content)));
209 return;
210 case XML_ENTITY_REF_NODE:
211 {
212 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
213 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(val.get())));
214 break;
215 }
216 case XML_ELEMENT_NODE:
217 {
218 if (qstricmp(reinterpret_cast<const char*>(node->name), "style") == 0) {
219 return;
220 } else if (qstricmp(reinterpret_cast<const char*>(node->name), "table") == 0) {
221 normalizingLineBreakAppend(s);
222 }
223 break;
224 }
225 case XML_ATTRIBUTE_NODE:
226 case XML_COMMENT_NODE:
227 return;
228 default:
229 break;
230 }
231
232 auto child = node->children;
233 while (child) {
234 recursiveContent(child, s);
235 child = child->next;
236 }
237
238 if (node->type == XML_ELEMENT_NODE) {
239 for (const auto elemName : { "br", "p", "tr" }) {
240 if (qstricmp(reinterpret_cast<const char*>(node->name), elemName) == 0) {
241 normalizingLineBreakAppend(s);
242 break;
243 }
244 }
245 }
246}
247#endif
248
249QString HtmlElement::recursiveContent() const
250{
251#if HAVE_LIBXML2
252 if (!d) {
253 return {};
254 }
255
256 QString s;
257 ::recursiveContent(d, s);
258 return s.trimmed(); // trailing trim can be done without copying
259#else
260 return {};
261#endif
262}
263
265{
266#if HAVE_LIBXML2
267 if (!d) {
268 return {};
269 }
270
271 const auto ctx = std::unique_ptr<xmlXPathContext, decltype(&xmlXPathFreeContext)>(xmlXPathNewContext(d->doc), &xmlXPathFreeContext);
272 if (!ctx) {
273 return {};
274 }
275 xmlXPathSetContextNode(d, ctx.get());
276 const auto xpathObj = std::unique_ptr<xmlXPathObject, decltype(&xmlXPathFreeObject)>(xmlXPathEvalExpression(reinterpret_cast<const xmlChar*>(xpath.toUtf8().constData()), ctx.get()), &xmlXPathFreeObject);
277 if (!xpathObj) {
278 return {};
279 }
280
281 switch (xpathObj->type) {
282 case XPATH_NODESET:
283 {
284 QVariantList l;
285 if (!xpathObj->nodesetval) {
286 return l;
287 }
288 l.reserve(xpathObj->nodesetval->nodeNr);
289 for (int i = 0; i < xpathObj->nodesetval->nodeNr; ++i) {
290 l.push_back(QVariant::fromValue<HtmlElement>(xpathObj->nodesetval->nodeTab[i]));
291 }
292 return l;
293 }
294 case XPATH_BOOLEAN:
295 return QVariant::fromValue<bool>(xpathObj->boolval);
296 case XPATH_NUMBER:
297 return xpathObj->floatval;
298 case XPATH_STRING:
299 return QString::fromUtf8(reinterpret_cast<const char*>(xpathObj->stringval));
300 default:
301 return {};
302 }
303#else
304 Q_UNUSED(xpath)
305#endif
306 return {};
307}
308
309bool HtmlElement::hasAttribute(const QString& attr) const
310{
311#if HAVE_LIBXML2
312 if (!d) {
313 return false;
314 }
315
316 auto attribute = d->properties;
317 while(attribute)
318 {
319 if (qstricmp(attr.toUtf8().constData(), reinterpret_cast<const char*>(attribute->name)) == 0) {
320 return true;
321 }
322 attribute = attribute->next;
323 }
324#else
325 Q_UNUSED(attr)
326#endif
327 return false;
328}
329
331{
332 QStringList l;
333#if HAVE_LIBXML2
334 if (!d) {
335 return l;
336 }
337
338 auto attribute = d->properties;
339 while(attribute)
340 {
341 l.push_back(QString::fromUtf8(reinterpret_cast<const char*>(attribute->name)));
342 attribute = attribute->next;
343 }
344#endif
345 return l;
346}
347
348bool HtmlElement::operator==(const HtmlElement &other) const
349{
350 return d == other.d;
351}
352
353
354HtmlElement HtmlDocument::root() const
355{
356#if HAVE_LIBXML2
357 if (!d->m_doc) {
358 return {};
359 }
360 return HtmlElement(xmlDocGetRootElement(d->m_doc));
361#else
362 return {};
363#endif
364}
365
366QString HtmlDocument::rawData() const
367{
368#if HAVE_LIBXML2
369 return QString::fromUtf8(d->m_rawData);
370#else
371 return {};
372#endif
373}
374
376{
377 return root().eval(xpath);
378}
379
381{
382#if HAVE_LIBXML2
383 auto tree = htmlReadMemory(data.constData(), data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
384 if (!tree) {
385 return nullptr;
386 }
387
388 auto doc = new HtmlDocument(parent);
389 doc->d->m_doc = tree;
390 doc->d->m_rawData = data;
391 return doc;
392#else
393 Q_UNUSED(data)
394 Q_UNUSED(parent)
395 return nullptr;
396#endif
397}
398
400{
401#if HAVE_LIBXML2
402 auto utf8Data = data.toUtf8();
403 auto tree = htmlReadMemory(utf8Data.constData(), utf8Data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
404 if (!tree) {
405 return nullptr;
406 }
407
408 auto doc = new HtmlDocument(parent);
409 doc->d->m_doc = tree;
410 doc->d->m_rawData = std::move(utf8Data);
411 return doc;
412#else
413 Q_UNUSED(data)
414 Q_UNUSED(parent)
415 return nullptr;
416#endif
417}
418
419#include "moc_htmldocument.cpp"
HTML document for extraction.
Q_INVOKABLE QVariant eval(const QString &xpath) const
Evaluate an XPath expression relative to the document root.
static HtmlDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a HtmlDocument from the given raw data.
static HtmlDocument * fromString(const QString &data, QObject *parent=nullptr)
Creates a HtmlDocument from a given (unicode) string.
HTML document element.
Q_INVOKABLE QString attribute(const QString &attr) const
Value of the attribute attr.
QStringList attributes() const
Returns the list of all attributes of this node.
bool operator==(const HtmlElement &other) const
Checks if two HtmlElement instances refer to the same DOM node.
Q_INVOKABLE QVariant eval(const QString &xpath) const
Evaluate an XPath expression relative to this node.
bool hasAttribute(const QString &attr) const
Checks whether an attribute with name attr exists.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
const char * constData() const const
int size() const const
bool isSpace() const const
void push_back(const T &value)
QObject * parent() const const
QChar back() const const
QString fromUtf8(const char *str, int size)
bool isEmpty() const const
void push_back(QChar ch)
void reserve(int size)
int size() const const
QByteArray toUtf8() const const
QString trimmed() const const
bool canConvert() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Sun Feb 25 2024 18:40:32 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.