KItinerary

htmldocument.cpp
1/*
2 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "config-kitinerary.h"
8#include "htmldocument.h"
9
10#include <QDebug>
11#include <QVariant>
12
13#if HAVE_LIBXML2
14#include <libxml/HTMLparser.h>
15#include <libxml/xpath.h>
16#endif
17
18using namespace KItinerary;
19
namespace KItinerary {
/** Private data of HtmlDocument: the parsed libxml2 tree and the raw bytes it was parsed from. */
class HtmlDocumentPrivate {
public:
#if HAVE_LIBXML2
    HtmlDocumentPrivate() = default;
    ~HtmlDocumentPrivate() {
        // xmlFreeDoc() tolerates a null document, so no explicit check is needed here
        xmlFreeDoc(m_doc);
    }

    // this class frees m_doc in its destructor, a copy would free the same tree twice
    HtmlDocumentPrivate(const HtmlDocumentPrivate&) = delete;
    HtmlDocumentPrivate& operator=(const HtmlDocumentPrivate&) = delete;

    // explicitly null until a parsed tree is assigned, so neither the destructor
    // nor null checks elsewhere ever see an indeterminate pointer
    xmlDocPtr m_doc = nullptr;
    QByteArray m_rawData;
#endif
};
}
33
34HtmlElement::HtmlElement()
35 : d(nullptr)
36{
37}
38
39HtmlElement::~HtmlElement() = default;
40
#if HAVE_LIBXML2
/** Wraps the libxml2 node @p dd; the pointer is stored as-is. */
HtmlElement::HtmlElement(xmlNode *dd)
    : d{dd}
{
}
#endif
47
48HtmlDocument::HtmlDocument(QObject *parent)
49 : QObject(parent)
50 , d(new HtmlDocumentPrivate)
51{
52}
53
54HtmlDocument::~HtmlDocument() = default;
55
56bool HtmlElement::isNull() const
57{
58 return d == nullptr;
59}
60
61QString HtmlElement::name() const
62{
63#if HAVE_LIBXML2
64 if (d) {
65 return QString::fromUtf8(reinterpret_cast<const char*>(d->name));
66 }
67#endif
68 return {};
69}
70
72{
73#if HAVE_LIBXML2
74 if (d) {
75 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlGetProp(d, reinterpret_cast<const xmlChar*>(attr.toUtf8().constData())), xmlFree);
76 return QString::fromUtf8(reinterpret_cast<const char*>(val.get()));
77 }
78#else
79 Q_UNUSED(attr)
80#endif
81 return {};
82}
83
84HtmlElement HtmlElement::parent() const
85{
86#if HAVE_LIBXML2
87 if (d && d->parent && d->parent->type == XML_ELEMENT_NODE) {
88 return HtmlElement(d->parent);
89 }
90#endif
91 return {};
92}
93
94HtmlElement HtmlElement::firstChild() const
95{
96#if HAVE_LIBXML2
97 if (d) {
98 return HtmlElement(xmlFirstElementChild(d));
99 }
100#endif
101 return {};
102}
103
104HtmlElement HtmlElement::nextSibling() const
105{
106#if HAVE_LIBXML2
107 if (d) {
108 return HtmlElement(xmlNextElementSibling(d));
109 }
110#endif
111 return {};
112}
113
114#if HAVE_LIBXML2
115static void normalizingAppend(QString &out, const QString &in)
116{
117 if (in.isEmpty()) {
118 return;
119 }
120
121 const bool needsLeadingSpace = !out.isEmpty() && !out.back().isSpace();
122 out.reserve(out.size() + in.size() + (needsLeadingSpace ? 1 : 0));
123 if (needsLeadingSpace) {
125 }
126
127 // convert non-breaking spaces and windows line break to normal ones, technically not correct
128 // but way too often this confuses our regular expressions
129 bool leadingTrim = true;
130 bool foundCR = false;
131 for (const auto c : in) {
132 // trim leading spaces while we are at it
133 if (leadingTrim && c.isSpace()) {
134 continue;
135 }
136 leadingTrim = false;
137
138 // normalize CRs
139 if (c == QChar::CarriageReturn) {
140 foundCR = true;
141 continue;
142 }
143 if (foundCR && c != QChar::LineFeed) {
145 }
146 foundCR = false;
147
148 // normalize space variations
149 if (c == QChar::Nbsp) {
151 } else {
152 out.push_back(c);
153 }
154 }
155}
156
157static void normalizingLineBreakAppend(QString &s)
158{
159 s = s.trimmed();
161}
162#endif
163
164QString HtmlElement::content() const
165{
166#if HAVE_LIBXML2
167 if (!d) {
168 return {};
169 }
170
171 QString s;
172 auto node = d->children;
173 while (node) {
174 switch (node->type) {
175 case XML_TEXT_NODE:
176 case XML_CDATA_SECTION_NODE:
177 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content)));
178 break;
179 case XML_ENTITY_REF_NODE:
180 {
181 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
182 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(val.get())));
183 break;
184 }
185 case XML_ELEMENT_NODE:
186 if (qstricmp(reinterpret_cast<const char*>(node->name), "br") == 0) {
187 s += QLatin1Char('\n');
188 }
189 break;
190 default:
191 break;
192
193 }
194 node = node->next;
195 }
196
197 return s.trimmed(); // trailing trim can be done without copying
198#endif
199 return {};
200}
201
#if HAVE_LIBXML2
/** Appends the text below @p node to @p s, depth-first.
 *  Text/CDATA nodes are appended with whitespace normalization; attribute and comment
 *  nodes as well as @c style elements are skipped entirely. A line break is forced before
 *  the content of a @c table element and after @c br, @c p and @c tr elements.
 */
static void recursiveContent(_xmlNode *node, QString &s)
{
    // case-insensitive comparison of the node name against a tag name
    const auto nameIs = [node](const char *tag) {
        return qstricmp(reinterpret_cast<const char*>(node->name), tag) == 0;
    };

    switch (node->type) {
        case XML_TEXT_NODE:
        case XML_CDATA_SECTION_NODE:
            normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content)));
            return;
        case XML_ATTRIBUTE_NODE:
        case XML_COMMENT_NODE:
            return;
        case XML_ENTITY_REF_NODE:
        {
            // xmlNodeGetContent() returns a string that has to be released with xmlFree()
            const auto resolved = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
            normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(resolved.get())));
            break;
        }
        case XML_ELEMENT_NODE:
            if (nameIs("style")) {
                return;
            }
            if (nameIs("table")) {
                normalizingLineBreakAppend(s);
            }
            break;
        default:
            break;
    }

    for (auto child = node->children; child; child = child->next) {
        recursiveContent(child, s);
    }

    // elements that end a line once their content has been emitted
    if (node->type == XML_ELEMENT_NODE && (nameIs("br") || nameIs("p") || nameIs("tr"))) {
        normalizingLineBreakAppend(s);
    }
}
#endif
248
249QString HtmlElement::recursiveContent() const
250{
251#if HAVE_LIBXML2
252 if (!d) {
253 return {};
254 }
255
256 QString s;
257 ::recursiveContent(d, s);
258 return s.trimmed(); // trailing trim can be done without copying
259#else
260 return {};
261#endif
262}
263
265{
266#if HAVE_LIBXML2
267 if (!d) {
268 return {};
269 }
270
271 const auto ctx = std::unique_ptr<xmlXPathContext, decltype(&xmlXPathFreeContext)>(xmlXPathNewContext(d->doc), &xmlXPathFreeContext);
272 if (!ctx) {
273 return {};
274 }
275 xmlXPathSetContextNode(d, ctx.get());
276 const auto xpathObj = std::unique_ptr<xmlXPathObject, decltype(&xmlXPathFreeObject)>(xmlXPathEvalExpression(reinterpret_cast<const xmlChar*>(xpath.toUtf8().constData()), ctx.get()), &xmlXPathFreeObject);
277 if (!xpathObj) {
278 return {};
279 }
280
281 switch (xpathObj->type) {
282 case XPATH_NODESET:
283 {
284 QVariantList l;
285 if (!xpathObj->nodesetval) {
286 return l;
287 }
288 l.reserve(xpathObj->nodesetval->nodeNr);
289 for (int i = 0; i < xpathObj->nodesetval->nodeNr; ++i) {
290 l.push_back(QVariant::fromValue<HtmlElement>(xpathObj->nodesetval->nodeTab[i]));
291 }
292 return l;
293 }
294 case XPATH_BOOLEAN:
295 return QVariant::fromValue<bool>(xpathObj->boolval);
296 case XPATH_NUMBER:
297 return xpathObj->floatval;
298 case XPATH_STRING:
299 return QString::fromUtf8(reinterpret_cast<const char*>(xpathObj->stringval));
300 default:
301 return {};
302 }
303#else
304 Q_UNUSED(xpath)
305#endif
306 return {};
307}
308
309bool HtmlElement::hasAttribute(const QString& attr) const
310{
311#if HAVE_LIBXML2
312 if (!d) {
313 return false;
314 }
315
316 auto attribute = d->properties;
317 while(attribute)
318 {
319 if (qstricmp(attr.toUtf8().constData(), reinterpret_cast<const char*>(attribute->name)) == 0) {
320 return true;
321 }
322 attribute = attribute->next;
323 }
324#else
325 Q_UNUSED(attr)
326#endif
327 return false;
328}
329
331{
332 QStringList l;
333#if HAVE_LIBXML2
334 if (!d) {
335 return l;
336 }
337
338 auto attribute = d->properties;
339 while(attribute)
340 {
341 l.push_back(QString::fromUtf8(reinterpret_cast<const char*>(attribute->name)));
342 attribute = attribute->next;
343 }
344#endif
345 return l;
346}
347
348bool HtmlElement::operator==(const HtmlElement &other) const
349{
350 return d == other.d;
351}
352
353
354HtmlElement HtmlDocument::root() const
355{
356#if HAVE_LIBXML2
357 if (!d->m_doc) {
358 return {};
359 }
360 return HtmlElement(xmlDocGetRootElement(d->m_doc));
361#else
362 return {};
363#endif
364}
365
366QString HtmlDocument::rawData() const
367{
368#if HAVE_LIBXML2
369 return QString::fromUtf8(d->m_rawData);
370#else
371 return {};
372#endif
373}
374
376{
377 return root().eval(xpath);
378}
379
381{
382#if HAVE_LIBXML2
383 auto tree = htmlReadMemory(data.constData(), data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
384 if (!tree) {
385 return nullptr;
386 }
387
388 auto doc = new HtmlDocument(parent);
389 doc->d->m_doc = tree;
390 doc->d->m_rawData = data;
391 return doc;
392#else
393 Q_UNUSED(data)
394 Q_UNUSED(parent)
395 return nullptr;
396#endif
397}
398
400{
401#if HAVE_LIBXML2
402 auto utf8Data = data.toUtf8();
403 auto tree = htmlReadMemory(utf8Data.constData(), utf8Data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
404 if (!tree) {
405 return nullptr;
406 }
407
408 auto doc = new HtmlDocument(parent);
409 doc->d->m_doc = tree;
410 doc->d->m_rawData = std::move(utf8Data);
411 return doc;
412#else
413 Q_UNUSED(data)
414 Q_UNUSED(parent)
415 return nullptr;
416#endif
417}
418
419#include "moc_htmldocument.cpp"
HTML document for extraction.
Q_INVOKABLE QVariant eval(const QString &xpath) const
Evaluate an XPath expression relative to the document root.
static HtmlDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates an HtmlDocument from the given raw data.
static HtmlDocument * fromString(const QString &data, QObject *parent=nullptr)
Creates an HtmlDocument from a given (unicode) string.
HTML document element.
Q_INVOKABLE QString attribute(const QString &attr) const
Value of the attribute attr.
QStringList attributes() const
Returns the list of all attributes of this node.
bool operator==(const HtmlElement &other) const
Checks if two HtmlElement instances refer to the same DOM node.
Q_INVOKABLE QVariant eval(const QString &xpath) const
Evaluate an XPath expression relative to this node.
bool hasAttribute(const QString &attr) const
Checks whether an attribute with name attr exists.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
const char * constData() const const
qsizetype size() const const
bool isSpace(char32_t ucs4)
void push_back(parameter_type value)
QObject * parent() const const
QChar & back()
QString fromUtf8(QByteArrayView str)
bool isEmpty() const const
void push_back(QChar ch)
void reserve(qsizetype size)
qsizetype size() const const
QByteArray toUtf8() const const
QString trimmed() const const
QVariant fromValue(T &&value)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Mon Nov 4 2024 16:28:48 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.