KItinerary

extractorengine.h
1 /*
2  SPDX-FileCopyrightText: 2017-2021 Volker Krause <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #pragma once
8 
9 #include "kitinerary_export.h"
10 
11 #include <QString>
12 
13 #include <memory>
14 #include <vector>
15 
16 class QByteArray;
17 class QDateTime;
18 class QJsonArray;
19 class QVariant;
20 
21 namespace KItinerary {
22 
23 class AbstractExtractor;
24 class BarcodeDecoder;
25 class ExtractorDocumentNode;
26 class ExtractorDocumentNodeFactory;
27 class ExtractorEnginePrivate;
28 class ExtractorRepository;
29 class ExtractorScriptEngine;
30 
31 /**
32  * Semantic data extraction engine.
33  *
34  * This will attempt to find travel itinerary data in the given input data
35  * (plain text, HTML text, PDF documents, etc), and return the extracted
36  * JSON-LD data.
37  *
38  * @section create_extractors Creating Extractors
39  *
40  * @subsection extractor_api Extractor API
41  *
42  * For adding custom extractors, two parts are needed:
43  * - JSON meta-data describing the extractor and when to apply it, as described
44  * in the Extractor documentation.
45  * - An extractor JavaScript file, compatible with QJSEngine.
46  *
47  * The extractor script will have access to API defined in the JsApi namespace:
48  * - JsApi::Context: information about the input data being processed.
49  * - JsApi::JsonLd: functions for generating JSON-LD data.
50  * - JsApi::Barcode: barcode decoding functions.
51  *
52  * The entry point to the script is specified in the meta-data, its argument depends
53  * on the extractor type:
54  * - Plain text extractors are passed a string.
55  * If input is HTML or PDF, the string will be the text of the document stripped
56  * of all formatting etc.
57  * - HTML extractors are passed a HtmlDocument instance allowing DOM-like access to
58  * the document structure.
59  * - PDF extractors are passed a PdfDocument instance allowing access to textual and
60  * image content.
61  * - Apple Wallet pass extractors are passed a KPkPass::BoardingPass instance.
62  * - iCalendar event extractors are passed KCalendarCore::Event instances.
63  *
64  * These functions should return an object or an array of objects following the JSON-LD
65  * format defined on schema.org. JsApi::JsonLd provides helper functions to build such
66  * objects. If @c null or an empty array is returned, the next applicable extractor is
67  * run.
68  *
69  * Returned objects are then passed through ExtractorPostprocessor which will normalize,
70  * augment and validate the data. This can greatly simplify the extraction, as for example
71  * the expansion of an IATA BCBP ticket token already fills most key properties of a flight
72  * reservation automatically.
73  *
74  * @subsection extractor_tools Development Tools
75  *
76  * For interactive testing during development of new extractors, it is recommended to
77  * link (or copy) the JSON meta data and JavaScript code files to the search path for
78  * Extractor meta data.
79  *
80  * Additionally, there's an interactive testing and inspection tool called @c kitinerary-workbench
81  * (see https://invent.kde.org/pim/kitinerary-workbench).
82  *
83  * @subsection extractor_testing Automated Testing
84  *
85  * There are a few unit tests for extractors in the kitinerary repository (see autotests/extractordata),
86  * however the majority of real-world test data cannot be shared this way, due to privacy
87  * and copyright issues (e.g. PDFs containing copyrighted vendor logos and user credit card details).
88  * Therefore there is also support for testing against external data (see extractortest.cpp).
89  *
90  * External test data is assumed to be in a folder named @c kitinerary-tests next to the @c kitinerary
91  * source folder. The test program searches this folder recursively for folders with the following content
92  * and attempts to extract data from each test file in there.
93  *
94  * - @c context.eml: MIME message header data specifying the context in which the test data
95  * was received. This typically only needs a @c From: and @c Date: line, but can even be
96  * entirely empty (or missing) for structured data that does not need a custom extractor.
97  * This context information is applied to all tests in this folder.
98  * - @c <testname>.[txt|html|pdf|pkpass|ics|eml|mbox]: The input test data.
99  * - @c <testname.extension>.json: The expected JSON-LD output. If this file doesn't
100  * exist, it is created by the test program.
101  * - @c <testname.extension>.skip: If this file is present the corresponding test
102  * is skipped.
103  */
104 class KITINERARY_EXPORT ExtractorEngine
105 {
106 public:
107  ExtractorEngine();
108  ExtractorEngine(ExtractorEngine &&) noexcept;
109  ExtractorEngine(const ExtractorEngine &) = delete;
110  ~ExtractorEngine();
111 
112  /** Resets the internal state. Call this before processing new input data. */
113  void clear();
114 
115  /** Set raw data to extract from.
116  * @param data Raw data to extract from.
117  * @param fileName Optional file name of @p data, used as a hint for MIME type auto-detection if needed.
118  * @param mimeType MIME type of @p data, auto-detected if empty.
119  */
120  void setData(const QByteArray &data, QStringView fileName = {}, QStringView mimeType = {});
121 
122  /** Set already decoded data to extract from.
123  * @param data Has to contain an object of a supported data type matching @p mimeType.
124  */
125  void setContent(const QVariant &data, QStringView mimeType);
126 
127  /** Provide a document part that is only used to determine which extractor to use,
128  * but not for extraction itself.
129  * This can for example be the MIME message part wrapping a document to extract.
130  * Using this is not necessary when this document part is already included in
131  * what is passed to setContent().
132  */
133  void setContext(const QVariant &data, QStringView mimeType);
134 
135  /** Set the date the extracted document has been issued at.
136  * This does not need to be perfectly accurate and is used to
137  * complete incomplete date information in the document (typically
138  * a missing year).
139  * This method does not need to be called when setContext() is used.
140  */
141  void setContextDate(const QDateTime &dt);
142 
143  /** Perform extraction of "risky" content such as PDF files in a separate process.
144  * This is safer as it isolates the using application from crashes/hangs due to corrupt files.
145  * It is however slower, and not available on all platforms.
146  * This is off by default.
147  */
148  void setUseSeparateProcess(bool separateProcess);
149 
150  /** Sets additional extractors to run on the given data.
151  * Extractors are usually selected automatically, so calling this manually is most likely
152  * not needed. This mainly exists for the external extractor process.
153  */
154  void setAdditionalExtractors(std::vector<const AbstractExtractor*> &&extractors);
155 
156  /** Hints about the document to extract based on application knowledge that
157  * can help the extractor.
158  */
159  enum Hint {
160  NoHint = 0, ///< no extraction hints set
161  ExtractFullPageRasterImages = 1, ///< perform expensive image processing on (PDF) documents containing full page raster images
162  };
163  Q_DECLARE_FLAGS(Hints, Hint)
164 
165  /** The currently set extraction hints. */
166  Hints hints() const;
167  /** Set extraction hints. */
168  void setHints(Hints hints);
169 
170  /** Perform the actual extraction, and return the JSON-LD data
171  * that has been found.
172  */
173  QJsonArray extract();
174 
175  /** Returns the extractor id used to obtain the result.
176  * Can be empty if generic extractors have been used.
177  * Not supposed to be used for normal operations, this is only needed for tooling.
178  */
179  QString usedCustomExtractor() const;
180 
181  /** Factory for creating new document nodes.
182  * This is only for use by KItinerary::ExtractorDocumentProcessor instances.
183  */
184  const ExtractorDocumentNodeFactory* documentNodeFactory() const;
185  /** Barcode decoder for use by KItinerary::ExtractorDocumentProcessor.
186  * Use this rather than your own instance as it caches repeated attempts to
187  * decode the same image.
188  */
189  const BarcodeDecoder* barcodeDecoder() const;
190 
191  ///@cond internal
192  /** Extractor repository instance used by this engine. */
193  const ExtractorRepository* extractorRepository() const;
194  /** JavaScript execution engine for script extractors. */
195  const ExtractorScriptEngine* scriptEngine() const;
196  /** Document root node.
197  * Only fully populated after extraction has been performed.
198  * Only exposed for tooling.
199  */
200  ExtractorDocumentNode rootDocumentNode() const;
201  ///@endcond
202 
203 private:
204  std::unique_ptr<ExtractorEnginePrivate> d;
205 };
206 
207 Q_DECLARE_OPERATORS_FOR_FLAGS(ExtractorEngine::Hints)
208 
209 }
210 
Hint
Hints about the document to extract based on application knowledge that can help the extractor.
Instantiates KItinerary::ExtractorDocumentNode instances using the type-specific document processor.
A node in the extracted document object tree.
Collection of all known data extractors.
Barcode decoding with result caching.
Semantic data extraction engine.
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Tue Sep 27 2022 03:59:28 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.