KItinerary

scriptextractor.h
1/*
2 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#pragma once
8
9#include "abstractextractor.h"
10
11#include <memory>
12#include <vector>
13
14class QJsonObject;
15class QString;
16
17namespace KItinerary {
18class ExtractorFilter;
19class ScriptExtractorPrivate;
20
21/** A single unstructured data extraction rule set.
22 *
23 * These rules are loaded from JSON meta-data files in a compiled-in qrc file,
24 * or from $XDG_DATA_DIRS/kitinerary/extractors.
25 *
26 * @section extractor_metadata Meta Data Format
27 *
28 * The meta-data files either contain a single JSON object or an array of JSON objects
29 * with the following content:
30 * - \c mimeType: The MIME type of the extractor, \c text if not specified.
31 * - \c filter: An array of filters that are used to select this extractor for a given input file.
32 * - \c script: A JavaScript file to execute.
33 * - \c function: The entry point in the above mentioned script, @c main if not specified.
34 *
35 * The following extractor types are supported:
36 * - \c text/plain: plain text, the argument to the script function is a single string.
37 * - \c text/html: HTML documents, the argument to the script function is a KItinerary::HtmlDocument instance.
38 * - \c application/pdf: PDF documents, the argument to the script function is a KItinerary::PdfDocument instance.
39 * - \c application/vnd.apple.pkpass: Apple Wallet passes, the argument to the script function is a KPkPass::Pass instance.
40 * - \c internal/event: iCalendar events, the argument to the script function is a KCalendarCore::Event instance.
41 *
42 * Filter definitions have the following fields:
43 * - \c mimeType: The MIME type of the document part this filter can match against.
44 * - \c field: The name of the field to match against. This can be a field id in an Apple Wallet pass,
45 * a MIME message header name, a property on a JSON-LD object or an iCal calendar or event.
46 * For plain text or binary content, this is ignored.
47 * - \c match: A regular expression that is matched against the specified value (see QRegularExpression).
48 * - \c scope: Specifies how the filter should be applied relative to the document node that is being extracted.
49 * One of @c Current, @c Parent, @c Children, @c Ancestors, @c Descendants (@c Current is the default).
50 *
51 * Example:
52 * @code
53 * [
54 * {
55 * "mimeType": "application/pdf",
56 * "filter": [ { "field": "From", "match": "@swiss.com", "mimeType": "message/rfc822", "scope": "Ancestors" } ],
57 * "script": "swiss.js",
58 * "function": "parsePdf"
59 * },
60 * {
61 * "mimeType": "application/vnd.apple.pkpass",
62 * "filter": [ { "field": "passTypeIdentifier", "match": "pass.booking.swiss.com", "mimeType": "application/vnd.apple.pkpass", "scope": "Current" } ],
63 * "script": "swiss.js",
64 * "function": "parsePkPass"
65 * }
66 * ]
67 * @endcode
68 *
69 * @section extractor_development Development
70 *
71 * For development it's convenient to symlink the extractors source folder to
72 * $XDG_DATA_DIRS/kitinerary/extractors, so you can re-run a changed extractor
73 * script without recompiling or restarting the application.
74 *
75 */
76class KITINERARY_EXPORT ScriptExtractor : public AbstractExtractor
77{
78public:
79 explicit ScriptExtractor();
 // Declared here rather than defaulted inline: ScriptExtractorPrivate is only
 // forward-declared in this header, so it is an incomplete type at this point.
80 ~ScriptExtractor();
81
 /** Identifier for this extractor. */
82 QString name() const override;
 /** Fast check whether this extractor is applicable for @p node. */
83 bool canHandle(const ExtractorDocumentNode &node) const override;
 /** Extract data from @p node. */
84 ExtractorResult extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
85
86 /** File name of the JavaScript script containing the code of the extractor. */
87 QString scriptFileName() const;
88 /** The JS function entry point for this extractor, @c main if empty. */
89 QString scriptFunction() const;
90 /** MIME type this script extractor supports. */
91 QString mimeType() const;
92 /** Returns the filters deciding whether this extractor should be applied. */
93 const std::vector<ExtractorFilter> &filters() const;
94
95 ///@cond internal
96 /** Load meta data from the given JSON object.
 *  NOTE(review): the meaning of the bool result (presumably success) and of
 *  @p index (defaults to -1) is not visible from this header - confirm against
 *  the implementation.
 */
97 bool load(const QJsonObject &obj, const QString &fileName, int index = -1);
98 /** Save extractor meta data to a JSON object. */
99 QJsonObject toJson() const;
100
101 /** Source file name.
 *  NOTE(review): presumably the @p fileName passed to load() - confirm.
 */
102 QString fileName() const;
103
 // Setters mirroring the mimeType(), scriptFileName(), scriptFunction() and
 // filters() accessors above. setFilters() is overloaded for rvalue and const
 // lvalue vectors.
104 void setMimeType(const QString &mimeType);
105 void setScriptFileName(const QString &script);
106 void setScriptFunction(const QString &func);
107 void setFilters(std::vector<ExtractorFilter> &&filters);
108 void setFilters(const std::vector<ExtractorFilter> &filters);
109 ///@endcond
110
111private:
 // Private implementation data; ScriptExtractorPrivate is forward-declared
 // above and not defined in this header.
112 std::unique_ptr<ScriptExtractorPrivate> d;
113};
114
115}
116
Abstract base class for data extractors.
A node in the extracted document object tree.
Semantic data extraction engine.
Determines whether an extractor is applicable to a given email.
Generic extraction result.
QString scriptFunction() const
The JS function entry point for this extractor, main if empty.
ExtractorResult extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Extract data from node.
const std::vector< ExtractorFilter > & filters() const
Returns the filters deciding whether this extractor should be applied.
QString mimeType() const
Mime type this script extractor supports.
QString name() const override
Identifier for this extractor.
QString scriptFileName() const
The JS script containing the code of the extractor.
bool canHandle(const ExtractorDocumentNode &node) const override
Fast check whether this extractor is applicable for node.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 24 2025 11:52:35 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.