KItinerary

engine/extractorengine.cpp
1/*
2 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "config-kitinerary.h"
8#include "extractorengine.h"
9
10#include "barcodedecoder.h"
11#include "abstractextractor.h"
12#include "extractordocumentnode.h"
13#include "extractordocumentnodefactory.h"
14#include "extractordocumentprocessor.h"
15#include "extractorresult.h"
16#include "extractorrepository.h"
17#include "extractorscriptengine_p.h"
18#include "jsonlddocument.h"
19#include "logging.h"
20
21#include <KMime/Content>
22#include <KMime/Message>
23
24#include <QDateTime>
25#include <QFile>
26#include <QFileInfo>
27#include <QJsonArray>
28#include <QJsonDocument>
29#include <QJsonObject>
30#include <QLocale>
31
32#include <cstring>
33
34using namespace KItinerary;
35
36namespace KItinerary {
37
38class ExtractorEnginePrivate {
39public:
40 void processNode(ExtractorDocumentNode &node);
41
42 ExtractorEngine *q = nullptr;
43 std::vector<const AbstractExtractor*> m_additionalExtractors;
44 ExtractorDocumentNode m_rootNode;
45 ExtractorDocumentNode m_contextNode;
46 ExtractorDocumentNodeFactory m_nodeFactory;
48 BarcodeDecoder m_barcodeDecoder;
49 ExtractorScriptEngine m_scriptEngine;
50 ExtractorEngine::Hints m_hints = ExtractorEngine::NoHint;
51};
52
53}
54
55void ExtractorEnginePrivate::processNode(ExtractorDocumentNode& node)
56{
57 if (node.isNull()) {
58 return;
59 }
60
61 node.processor()->expandNode(node, q);
62 for (auto c : node.childNodes()) {
63 processNode(c);
64 }
65 node.processor()->reduceNode(node);
66
67 node.processor()->preExtract(node, q);
68 std::vector<const AbstractExtractor*> extractors = m_additionalExtractors;
69 m_repo.extractorsForNode(node, extractors);
70
71 ExtractorResult nodeResult;
72 QString usedExtractor;
73 for (const auto &extractor : extractors) {
74 auto res = extractor->extract(node, q);
75 if (!res.isEmpty()) {
76 usedExtractor = extractor->name();
77 nodeResult.append(std::move(res));
78 }
79 }
80 if (!nodeResult.isEmpty()) {
81 node.setResult(std::move(nodeResult));
82 node.setUsedExtractor(usedExtractor);
83 }
84
85 node.processor()->postExtract(node, q);
86
87 // set modification time for all results that don't have it yet
88 if (node.contextDateTime().isValid()) {
89 auto result = node.result().jsonLdResult();
90 for (int i = 0; i < result.size(); ++i) {
91 auto res = result.at(i).toObject();
92 if (!res.contains(QLatin1StringView("modifiedTime"))) {
93 res.insert(QStringLiteral("modifiedTime"),
95 }
96 result[i] = res;
97 }
98 node.setResult(result);
99 }
100}
101
102
103ExtractorEngine::ExtractorEngine()
104 : d(new ExtractorEnginePrivate)
105{
106 d->q = this;
107}
108
109ExtractorEngine::ExtractorEngine(ExtractorEngine &&) noexcept = default;
110
112{
113 // ensure we destroy nodes before we destroy the node factory
114 clear();
115}
116
118{
119 d->m_rootNode = {};
120 d->m_contextNode = {};
121}
122
123void ExtractorEngine::setData(const QByteArray &data, QStringView fileName, QStringView mimeType)
124{
125 d->m_rootNode = d->m_nodeFactory.createNode(data, fileName, mimeType);
126}
127
129{
130 d->m_rootNode = d->m_nodeFactory.createNode(data, mimeType);
131}
132
134{
135 d->m_contextNode = d->m_nodeFactory.createNode(data, mimeType);
136}
137
139{
140 d->m_contextNode.setContextDateTime(dt);
141}
142
144{
145 return d->m_hints;
146}
147
149{
150 d->m_hints = hints;
151}
152
154{
155 d->m_rootNode.setParent(d->m_contextNode);
156 d->processNode(d->m_rootNode);
157 return d->m_rootNode.result().jsonLdResult();
158}
159
161{
162 d->m_nodeFactory.setUseSeparateProcess(separateProcess);
163}
164
165void ExtractorEngine::setAdditionalExtractors(std::vector<const AbstractExtractor*> &&extractors)
166{
167 d->m_additionalExtractors = std::move(extractors);
168}
169
171{
172 return d->m_rootNode.usedExtractor();
173}
174
176{
177 return &d->m_nodeFactory;
178}
179
181{
182 return &d->m_barcodeDecoder;
183}
184
185const ExtractorRepository* ExtractorEngine::extractorRepository() const
186{
187 return &d->m_repo;
188}
189
190const ExtractorScriptEngine* ExtractorEngine::scriptEngine() const
191{
192 d->m_scriptEngine.setExtractorEngine(const_cast<ExtractorEngine*>(this));
193 return &d->m_scriptEngine;
194}
195
196ExtractorDocumentNode ExtractorEngine::rootDocumentNode() const
197{
198 return d->m_rootNode;
199}
200
201void ExtractorEngine::processNode(ExtractorDocumentNode &node) const
202{
203 d->processNode(node);
204}
Barcode decoding with result caching.
Instantiates KItinerary::ExtractorDocumentNode instances using the type-specific document processor.
A node in the extracted document object tree.
QJsonArray result
Result access for QJSEngine.
void setResult(ExtractorResult &&result)
Replace the existing results by result.
QDateTime contextDateTime
The best known context date/time at this point in the document tree.
Semantic data extraction engine.
void setAdditionalExtractors(std::vector< const AbstractExtractor * > &&extractors)
Sets additional extractors to run on the given data.
Hints hints() const
The currently set extraction hints.
void setData(const QByteArray &data, QStringView fileName={}, QStringView mimeType={})
Set raw data to extract from.
void setContent(const QVariant &data, QStringView mimeType)
Already decoded data to extract from.
QString usedCustomExtractor() const
Returns the extractor id used to obtain the result.
void clear()
Resets the internal state, call before processing new input data.
void setContextDate(const QDateTime &dt)
Set the date the extracted document has been issued at.
void setHints(Hints hints)
Set extraction hints.
QJsonArray extract()
Perform the actual extraction, and return the JSON-LD data that has been found.
const BarcodeDecoder * barcodeDecoder() const
Barcode decoder for use by KItinerary::ExtractorDocumentProcessor.
void setContext(const QVariant &data, QStringView mimeType)
Provide a document part that is only used to determine which extractor to use, but not for extraction...
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
void setUseSeparateProcess(bool separateProcess)
Perform extraction of "risky" content such as PDF files in a separate process.
Collection of all known data extractors.
void extractorsForNode(const ExtractorDocumentNode &node, std::vector< const AbstractExtractor * > &extractors) const
Finds matching extractors for the given document node.
Generic extraction result.
void append(ExtractorResult &&other)
Append another result to this one.
bool isEmpty() const
Checks if there is any relevant result set in here.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
bool isValid() const
QString toString(QStringView format, QCalendar cal) const
QJsonValue at(qsizetype i) const
iterator insert(QLatin1StringView key, const QJsonValue &value)
QJsonObject toObject() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Sat Apr 27 2024 22:08:32 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.