KItinerary

extractorfilter.cpp
1 /*
2  SPDX-FileCopyrightText: 2017-2021 Volker Krause <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #include "extractorfilter.h"
8 #include "extractordocumentnode.h"
9 #include "extractordocumentprocessor.h"
10 #include "extractorresult.h"
11 #include "logging.h"
12 
13 #include <QJsonObject>
14 #include <QJSValue>
15 #include <QMetaEnum>
16 #include <QRegularExpression>
17 
18 using namespace KItinerary;
19 
20 namespace KItinerary {
21 class ExtractorFilterPrivate : public QSharedData
22 {
23 public:
24  QString m_mimeType;
25  QString m_fieldName;
26  QRegularExpression m_exp;
28 };
29 }
30 
31 ExtractorFilter::ExtractorFilter()
32  : d(new ExtractorFilterPrivate)
33 {
34 }
35 
36 ExtractorFilter::ExtractorFilter(const ExtractorFilter&) = default;
37 ExtractorFilter::ExtractorFilter(ExtractorFilter&&) noexcept = default;
38 ExtractorFilter::~ExtractorFilter() = default;
39 ExtractorFilter& ExtractorFilter::operator=(const ExtractorFilter&) = default;
40 ExtractorFilter& ExtractorFilter::operator=(ExtractorFilter&&) = default;
41 
42 QString ExtractorFilter::mimeType() const
43 {
44  return d->m_mimeType;
45 }
46 
47 void ExtractorFilter::setMimeType(const QString &mimeType)
48 {
49  d.detach();
50  d->m_mimeType = mimeType;
51 }
52 
54 {
55  return d->m_fieldName;
56 }
57 
58 void ExtractorFilter::setFieldName(const QString &fieldName)
59 {
60  d.detach();
61  d->m_fieldName = fieldName;
62 }
63 
64 bool ExtractorFilter::matches(const QString &data) const
65 {
66  if (!d->m_exp.isValid()) {
67  qCDebug(Log) << d->m_exp.errorString() << d->m_exp.pattern();
68  }
69  return d->m_exp.match(data).hasMatch();
70 }
71 
72 static bool needsFieldName(const QString &mimeType)
73 {
74  return mimeType != QLatin1String("text/plain") && mimeType != QLatin1String("application/octet-stream");
75 }
76 
77 template <typename T>
78 static T readEnum(const QJsonValue &v, T defaultValue = {})
79 {
80  if (!v.isString()) {
81  return defaultValue;
82  }
83 
84  const auto me = QMetaEnum::fromType<T>();
85  bool success = false;
86  const auto result = static_cast<T>(me.keyToValue(v.toString().toUtf8().constData(), &success));
87  return success ? result : defaultValue;
88 }
89 
90 bool ExtractorFilter::load(const QJsonObject &obj)
91 {
92  d.detach();
93  d->m_mimeType = obj.value(QLatin1String("mimeType")).toString();
94  if (d->m_mimeType.isEmpty()) {
95  qCDebug(Log) << "unspecified filter MIME type";
96  }
97  d->m_fieldName = obj.value(QLatin1String("field")).toString();
98  d->m_exp.setPattern(obj.value(QLatin1String("match")).toString());
99  d->m_scope = readEnum<ExtractorFilter::Scope>(obj.value(QLatin1String("scope")), ExtractorFilter::Current);
100  return !d->m_mimeType.isEmpty() && (!d->m_fieldName.isEmpty() || !needsFieldName(d->m_mimeType)) && d->m_exp.isValid();
101 }
102 
103 QJsonObject ExtractorFilter::toJson() const
104 {
105  QJsonObject obj;
106  obj.insert(QLatin1String("mimeType"), d->m_mimeType);
107  if (needsFieldName(d->m_mimeType)) {
108  obj.insert(QLatin1String("field"), d->m_fieldName);
109  }
110  obj.insert(QLatin1String("match"), pattern());
111  obj.insert(QLatin1String("scope"), QLatin1String(QMetaEnum::fromType<ExtractorFilter::Scope>().valueToKey(d->m_scope)));
112  return obj;
113 }
114 
116 {
117  return d->m_exp.pattern();
118 }
119 
120 void ExtractorFilter::setPattern(const QString &pattern)
121 {
122  d.detach();
123  d->m_exp.setPattern(pattern);
124 }
125 
127 {
128  return d->m_scope;
129 }
130 
131 void ExtractorFilter::setScope(Scope scope)
132 {
133  d.detach();
134  d->m_scope = scope;
135 }
136 
137 static QString valueForJsonPath(const QJsonObject &obj, const QString &path)
138 {
139  const auto pathSections = QStringView(path).split(QLatin1Char('.'));
140  QJsonValue v(obj);
141  for (const auto &pathSection : pathSections) {
142  if (!v.isObject()) {
143  return {};
144  }
145  v = v.toObject().value(pathSection.toString());
146  }
147  return v.toString();
148 }
149 
/** Selects whether a filter search stops at the first match or collects all matches. */
enum MatchMode {
    Any, ///< return as soon as one matching node is found
    All  ///< append every matching node to the result list
};
151 
152 static bool filterMachesNode(const ExtractorFilter &filter, ExtractorFilter::Scope scope, const ExtractorDocumentNode &node,
153  std::vector<ExtractorDocumentNode> &matches, MatchMode matchMode)
154 {
155  if (node.isNull()) {
156  return false;
157  }
158 
159  // filter without field/pattern always match, if the mimetype does
160  if (filter.mimeType() == node.mimeType() && ((filter.fieldName().isEmpty() && filter.pattern().isEmpty()) || node.processor()->matches(filter, node))) {
161  if (matchMode == All) {
162  matches.push_back(node);
163  }
164  return true;
165  }
166 
167  if (scope != ExtractorFilter::Ancestors && filter.mimeType() == QLatin1String("application/ld+json") && !node.result().isEmpty()) {
168  // when collecting all matches for results, we only want the "leaf-most" ones, not those along the path
169  if (matchMode == All && scope == ExtractorFilter::Descendants) {
170  bool descendantsMatched = false;
171  for (const auto &child : node.childNodes()) {
172  descendantsMatched |= filterMachesNode(filter, ExtractorFilter::Descendants, child, matches, matchMode);
173  }
174  if (descendantsMatched) {
175  return true;
176  }
177  }
178 
179  const auto res = node.result().jsonLdResult();
180  for (const auto &elem : res) {
181  const auto property = valueForJsonPath(elem.toObject(), filter.fieldName());
182  if (filter.matches(property)) {
183  if (matchMode == All) {
184  matches.push_back(node);
185  } else {
186  return true;
187  }
188  }
189  }
190  }
191 
192  if (scope == ExtractorFilter::Ancestors) {
193  return filterMachesNode(filter, scope, node.parent(), matches, matchMode);
194  }
195  if (scope == ExtractorFilter::Descendants) {
196  for (const auto &child : node.childNodes()) {
197  const auto m = filterMachesNode(filter, ExtractorFilter::Descendants, child, matches, matchMode);
198  if (m && matchMode == Any) {
199  return true;
200  }
201  }
202  }
203 
204  return !matches.empty();
205 }
206 
208 {
209  std::vector<ExtractorDocumentNode> matches;
210  switch (d->m_scope) {
212  return filterMachesNode(*this, ExtractorFilter::Current, node, matches, Any);
214  return filterMachesNode(*this, ExtractorFilter::Current, node.parent(), matches, Any);
216  return filterMachesNode(*this, ExtractorFilter::Ancestors, node.parent(), matches, Any);
219  for (const auto &child : node.childNodes()) {
220  if (filterMachesNode(*this, d->m_scope == ExtractorFilter::Descendants ? d->m_scope : ExtractorFilter::Current, child, matches, Any)) {
221  return true;
222  }
223  }
224  }
225  return false;
226 }
227 
228 void ExtractorFilter::allMatches(const ExtractorDocumentNode &node, std::vector<ExtractorDocumentNode>& matches) const
229 {
230  switch (d->m_scope) {
232  filterMachesNode(*this, ExtractorFilter::Current, node, matches, All);
233  return;
235  filterMachesNode(*this, ExtractorFilter::Current, node.parent(), matches, All);
236  return;
238  filterMachesNode(*this, ExtractorFilter::Ancestors, node.parent(), matches, All);
239  return;
242  for (const auto &child : node.childNodes()) {
243  filterMachesNode(*this, d->m_scope == ExtractorFilter::Descendants ? d->m_scope : ExtractorFilter::Current, child, matches, All);
244  }
245  return;
246  }
247 }
248 
249 ExtractorFilter ExtractorFilter::fromJSValue(const QJSValue &js)
250 {
251  ExtractorFilter f;
252  f.setMimeType(js.property(QLatin1String("mimeType")).toString());
253  const auto fieldName = js.property(QLatin1String("field"));
254  if (fieldName.isString()) {
255  f.setFieldName(fieldName.toString());
256  }
257  const auto match = js.property(QLatin1String("match"));
258  if (match.isString()) {
259  f.setPattern(match.toString());
260  }
261  f.setScope(readEnum<ExtractorFilter::Scope>(js.property(QLatin1String("scope")).toString(), ExtractorFilter::Current));
262  return f;
263 }
match the direct child nodes
Scope
Specifies which document nodes should match this filter, relative to the one being extracted...
Classes for reservation/travel data models, data extraction and data augmentation.
match the direct parent node
QString toString() const
bool isObject() const
match any direct or indirect child nodes
match any direct or indirect parent nodes
QString toString() const
QVariantList childNodes
Child nodes, for QJSEngine access.
QJSValue property(const QString &name) const
Determines whether an extractor is applicable to a given email.
QJsonObject toObject() const
bool isEmpty() const
const char * constData() const
QString pattern() const
Pattern to match field value against.
KItinerary::ExtractorDocumentNode parent
The parent node, or a null node if this is the root node.
QJsonArray result
Result access for QJSEngine.
A node in the extracted document object tree.
bool matches(const QString &data) const
Check if data matches this filter.
Scope scope() const
Evaluation scope of this filter, in relation to the node being extracted.
void push_back(QChar ch)
bool isString() const
bool isEmpty() const
void allMatches(const ExtractorDocumentNode &node, std::vector< ExtractorDocumentNode > &matches) const
Checks whether this filter applies to node.
QJsonValue value(const QString &key) const
match the node being extracted
QString mimeType() const
MIME type of the document part this filter can match.
QString mimeType
The MIME type of this node.
QJsonObject::iterator insert(const QString &key, const QJsonValue &value)
QString fieldName() const
The field to filter on.
QByteArray toUtf8() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Tue Jan 25 2022 23:06:15 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.