KItinerary

extractorengine.cpp
1 /*
2  SPDX-FileCopyrightText: 2017-2021 Volker Krause <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #include "config-kitinerary.h"
8 #include "extractorengine.h"
9 
10 #include "barcodedecoder.h"
11 #include "abstractextractor.h"
12 #include "extractordocumentnode.h"
13 #include "extractordocumentnodefactory.h"
14 #include "extractordocumentprocessor.h"
15 #include "extractorresult.h"
16 #include "extractorrepository.h"
17 #include "extractorscriptengine_p.h"
18 #include "jsonlddocument.h"
19 #include "logging.h"
20 
21 #include <KMime/Content>
22 #include <KMime/Message>
23 
24 #include <QDateTime>
25 #include <QFile>
26 #include <QFileInfo>
27 #include <QJsonArray>
28 #include <QJsonDocument>
29 #include <QJsonObject>
30 #include <QLocale>
31 
32 #include <cstring>
33 
34 using namespace KItinerary;
35 
36 namespace KItinerary {
37 
38 class ExtractorEnginePrivate {
39 public:
40  void processNode(ExtractorDocumentNode &node);
41 
42  ExtractorEngine *q = nullptr;
43  std::vector<const AbstractExtractor*> m_additionalExtractors;
44  ExtractorDocumentNode m_rootNode;
45  ExtractorDocumentNode m_contextNode;
46  ExtractorDocumentNodeFactory m_nodeFactory;
47  ExtractorRepository m_repo;
48  BarcodeDecoder m_barcodeDecoder;
49  QString m_usedExtractor;
50  ExtractorScriptEngine m_scriptEngine;
51  ExtractorEngine::Hints m_hints = ExtractorEngine::NoHint;
52 };
53 
54 }
55 
56 void ExtractorEnginePrivate::processNode(ExtractorDocumentNode& node)
57 {
58  if (node.isNull()) {
59  return;
60  }
61 
62  node.processor()->expandNode(node, q);
63  for (auto c : node.childNodes()) {
64  processNode(c);
65  }
66  node.processor()->reduceNode(node);
67 
68  node.processor()->preExtract(node, q);
69  std::vector<const AbstractExtractor*> extractors = m_additionalExtractors;
70  m_repo.extractorsForNode(node, extractors);
71 
72  ExtractorResult nodeResult;
73  for (const auto &extractor : extractors) {
74  auto res = extractor->extract(node, q);
75  if (!res.isEmpty()) {
76  m_usedExtractor = extractor->name();
77  nodeResult.append(std::move(res));
78  }
79  }
80  if (!nodeResult.isEmpty()) {
81  node.setResult(std::move(nodeResult));
82  }
83 
84  node.processor()->postExtract(node);
85 
86  // set modification time for all results that don't have it yet
87  if (node.contextDateTime().isValid()) {
88  auto result = node.result().jsonLdResult();
89  for (int i = 0; i < result.size(); ++i) {
90  auto res = result.at(i).toObject();
91  if (!res.contains(QLatin1String("modifiedTime"))) {
92  res.insert(QStringLiteral("modifiedTime"), node.contextDateTime().toString(Qt::ISODate));
93  }
94  result[i] = res;
95  }
96  node.setResult(result);
97  }
98 }
99 
100 
101 ExtractorEngine::ExtractorEngine()
102  : d(new ExtractorEnginePrivate)
103 {
104  d->q = this;
105 }
106 
107 ExtractorEngine::ExtractorEngine(ExtractorEngine &&) noexcept = default;
108 
110 {
111  // ensure we destroy nodes before we destroy the node factory
112  clear();
113 }
114 
116 {
117  d->m_rootNode = {};
118  d->m_contextNode = {};
119 }
120 
121 void ExtractorEngine::setData(const QByteArray &data, QStringView fileName, QStringView mimeType)
122 {
123  d->m_rootNode = d->m_nodeFactory.createNode(data, fileName, mimeType);
124 }
125 
127 {
128  d->m_rootNode = d->m_nodeFactory.createNode(data, mimeType);
129 }
130 
132 {
133  d->m_contextNode = d->m_nodeFactory.createNode(data, mimeType);
134 }
135 
137 {
138  d->m_contextNode.setContextDateTime(dt);
139 }
140 
142 {
143  return d->m_hints;
144 }
145 
147 {
148  d->m_hints = hints;
149 }
150 
152 {
153  d->m_rootNode.setParent(d->m_contextNode);
154  d->processNode(d->m_rootNode);
155  return d->m_rootNode.result().jsonLdResult();
156 }
157 
158 void ExtractorEngine::setUseSeparateProcess(bool separateProcess)
159 {
160  d->m_nodeFactory.setUseSeparateProcess(separateProcess);
161 }
162 
163 void ExtractorEngine::setAdditionalExtractors(std::vector<const AbstractExtractor*> &&extractors)
164 {
165  d->m_additionalExtractors = std::move(extractors);
166 }
167 
169 {
170  return d->m_usedExtractor;
171 }
172 
174 {
175  return &d->m_nodeFactory;
176 }
177 
179 {
180  return &d->m_barcodeDecoder;
181 }
182 
183 const ExtractorRepository* ExtractorEngine::extractorRepository() const
184 {
185  return &d->m_repo;
186 }
187 
188 const ExtractorScriptEngine* ExtractorEngine::scriptEngine() const
189 {
190  d->m_scriptEngine.setBarcodeDecoder(&d->m_barcodeDecoder);
191  return &d->m_scriptEngine;
192 }
193 
194 ExtractorDocumentNode ExtractorEngine::rootDocumentNode() const
195 {
196  return d->m_rootNode;
197 }
QJsonObject toObject() const
bool isEmpty() const
Checks if there is any relevant result set in here.
const BarcodeDecoder * barcodeDecoder() const
Barcode decoder for use by KItinerary::ExtractorDocumentProcessor.
Instantiates KItinerary::ExtractorDocumentNode instances using the type-specific document processor.
A node in the extracted document object tree.
void append(ExtractorResult &&other)
Append another result to this one.
void setUseSeparateProcess(bool separateProcess)
Perform extraction of "risky" content such as PDF files in a separate process.
Generic extraction result.
void clear()
Resets the internal state, call before processing new input data.
Collection of all known data extractors.
void setData(const QByteArray &data, QStringView fileName={}, QStringView mimeType={})
Set raw data to extract from.
void setContext(const QVariant &data, QStringView mimeType)
Provide a document part that is only used to determine which extractor to use, but not for extraction...
QString usedCustomExtractor() const
Returns the extractor id used to obtain the result.
void setHints(Hints hints)
Set extraction hints.
Barcode decoding with result caching.
QVariantList childNodes
Child nodes, for QJSEngine access.
Hints hints() const
The currently set extraction hints.
QDateTime contextDateTime
The best known context date/time at this point in the document tree.
void setAdditionalExtractors(std::vector< const AbstractExtractor * > &&extractors)
Sets additional extractors to run on the given data.
QJsonObject::iterator insert(const QString &key, const QJsonValue &value)
QJsonArray result
Result access for QJSEngine.
void setContent(const QVariant &data, QStringView mimeType)
Already decoded data to extract from.
void setContextDate(const QDateTime &dt)
Set the date the extracted document has been issued at.
QJsonValue at(int i) const
void setResult(ExtractorResult &&result)
Replace the existing results by result.
Semantic data extraction engine.
bool isValid() const
QJsonArray extract()
Perform the actual extraction, and return the JSON-LD data that has been found.
QString toString(Qt::DateFormat format) const
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Sun Sep 25 2022 03:58:14 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.