7#include "config-kitinerary.h"
8#include "htmldocument.h"
14#include <libxml/HTMLparser.h>
15#include <libxml/xpath.h>
21class HtmlDocumentPrivate {
24 ~HtmlDocumentPrivate() {
34HtmlElement::HtmlElement()
39HtmlElement::~HtmlElement() =
default;
42HtmlElement::HtmlElement(xmlNode *dd)
48HtmlDocument::HtmlDocument(
QObject *parent)
50 , d(new HtmlDocumentPrivate)
54HtmlDocument::~HtmlDocument() =
default;
56bool HtmlElement::isNull()
const
75 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlGetProp(d,
reinterpret_cast<const xmlChar*
>(attr.
toUtf8().
constData())), xmlFree);
87 if (d && d->parent && d->parent->type == XML_ELEMENT_NODE) {
123 if (needsLeadingSpace) {
129 bool leadingTrim =
true;
130 bool foundCR =
false;
131 for (
const auto c : in) {
133 if (leadingTrim && c.isSpace()) {
157static void normalizingLineBreakAppend(
QString &s)
172 auto node = d->children;
174 switch (node->type) {
176 case XML_CDATA_SECTION_NODE:
177 normalizingAppend(s,
QString::fromUtf8(
reinterpret_cast<const char*
>(node->content)));
179 case XML_ENTITY_REF_NODE:
181 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
182 normalizingAppend(s,
QString::fromUtf8(
reinterpret_cast<const char*
>(val.get())));
185 case XML_ELEMENT_NODE:
186 if (qstricmp(
reinterpret_cast<const char*
>(node->name),
"br") == 0) {
203static void recursiveContent(_xmlNode *node,
QString &s)
205 switch (node->type) {
207 case XML_CDATA_SECTION_NODE:
208 normalizingAppend(s,
QString::fromUtf8(
reinterpret_cast<const char*
>(node->content)));
210 case XML_ENTITY_REF_NODE:
212 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
213 normalizingAppend(s,
QString::fromUtf8(
reinterpret_cast<const char*
>(val.get())));
216 case XML_ELEMENT_NODE:
218 if (qstricmp(
reinterpret_cast<const char*
>(node->name),
"style") == 0) {
220 }
else if (qstricmp(
reinterpret_cast<const char*
>(node->name),
"table") == 0) {
221 normalizingLineBreakAppend(s);
225 case XML_ATTRIBUTE_NODE:
226 case XML_COMMENT_NODE:
232 auto child = node->children;
234 recursiveContent(child, s);
238 if (node->type == XML_ELEMENT_NODE) {
239 for (
const auto elemName : {
"br",
"p",
"tr" }) {
240 if (qstricmp(
reinterpret_cast<const char*
>(node->name), elemName) == 0) {
241 normalizingLineBreakAppend(s);
257 ::recursiveContent(d, s);
271 const auto ctx = std::unique_ptr<xmlXPathContext, decltype(&xmlXPathFreeContext)>(xmlXPathNewContext(d->doc), &xmlXPathFreeContext);
275 xmlXPathSetContextNode(d, ctx.get());
276 const auto xpathObj = std::unique_ptr<xmlXPathObject, decltype(&xmlXPathFreeObject)>(xmlXPathEvalExpression(
reinterpret_cast<const xmlChar*
>(xpath.
toUtf8().
constData()), ctx.get()), &xmlXPathFreeObject);
281 switch (xpathObj->type) {
285 if (!xpathObj->nodesetval) {
288 l.reserve(xpathObj->nodesetval->nodeNr);
289 for (
int i = 0; i < xpathObj->nodesetval->nodeNr; ++i) {
297 return xpathObj->floatval;
360 return HtmlElement(xmlDocGetRootElement(d->m_doc));
377 return root().
eval(xpath);
383 auto tree = htmlReadMemory(data.
constData(), data.
size(),
nullptr,
"utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
389 doc->d->m_doc = tree;
390 doc->d->m_rawData = data;
402 auto utf8Data = data.
toUtf8();
403 auto tree = htmlReadMemory(utf8Data.constData(), utf8Data.size(),
nullptr,
"utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
409 doc->d->m_doc = tree;
410 doc->d->m_rawData = std::move(utf8Data);
419#include "moc_htmldocument.cpp"
HTML document for extraction.
Q_INVOKABLE QVariant eval(const QString &xpath) const
Evaluate an XPath expression relative to the document root.
static HtmlDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates an HtmlDocument from the given raw data.
static HtmlDocument * fromString(const QString &data, QObject *parent=nullptr)
Creates an HtmlDocument from a given (Unicode) string.
Q_INVOKABLE QString attribute(const QString &attr) const
Value of the attribute attr.
QStringList attributes() const
Returns the list of all attributes of this node.
bool operator==(const HtmlElement &other) const
Checks if two HtmlElement instances refer to the same DOM node.
Q_INVOKABLE QVariant eval(const QString &xpath) const
Evaluate an XPath expression relative to this node.
bool hasAttribute(const QString &attr) const
Checks whether an attribute with name attr exists.
Classes for reservation/travel data models, data extraction and data augmentation.
const char * constData() const const
qsizetype size() const const
bool isSpace(char32_t ucs4)
void push_back(parameter_type value)
QObject * parent() const const
QString fromUtf8(QByteArrayView str)
bool isEmpty() const const
void reserve(qsizetype size)
qsizetype size() const const
QByteArray toUtf8() const const
QString trimmed() const const
QVariant fromValue(T &&value)