KItinerary

extractordocumentnodefactory.cpp
1/*
2 SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "extractordocumentnodefactory.h"
8#include "extractordocumentnode.h"
9#include "extractordocumentprocessor.h"
10#include "logging.h"
11
12#include "processors/binarydocumentprocessor.h"
13#include "processors/eradocumentprocessor.h"
14#include "processors/externalprocessor.h"
15#include "processors/htmldocumentprocessor.h"
16#include "processors/httpresponseprocessor.h"
17#include "processors/iatabcbpdocumentprocessor.h"
18#include "processors/icaldocumentprocessor.h"
19#include "processors/imagedocumentprocessor.h"
20#include "processors/jsonlddocumentprocessor.h"
21#include "processors/mimedocumentprocessor.h"
22#include "processors/pdfdocumentprocessor.h"
23#include "processors/pkpassdocumentprocessor.h"
24#include "processors/plistdocumentprocessor.h"
25#include "processors/textdocumentprocessor.h"
26#include "processors/uic9183documentprocessor.h"
27#include "processors/vdvdocumentprocessor.h"
28
29#include <QHash>
30#include <QMimeDatabase>
31
32using namespace KItinerary;
33
// Size bounds (in bytes of the raw input) checked before any processor is tried.
enum {
    // inputs of this size or smaller are rejected
    MinDocumentSize = 4,
    // inputs larger than this are rejected
    MaxDocumentSize = 10'000'000,
};
38
39namespace KItinerary {
40class ExtractorDocumentNodeFactoryStatic {
41public:
42 ExtractorDocumentNodeFactoryStatic();
43
44 void registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
45 std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {});
46
47 template <typename T>
48 inline void registerProcessor(QStringView canonicalMimeType, std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {})
49 {
50 registerProcessor(std::make_unique<T>(), canonicalMimeType, aliasMimeTypes, fallbackMimeType);
51 }
52
53 void registerBuiltIn();
54 QStringView resolveAlias(QStringView mimeType) const;
55
56 struct ProcessorData {
57 QString mimeType;
58 const ExtractorDocumentProcessor* processor;
59 };
60 std::vector<ProcessorData> m_probeProcessors;
61 std::vector<ProcessorData> m_fallbackProbeProcessors;
62 std::vector<ProcessorData> m_mimetypeProcessorMap;
63 QHash<QString, QString> m_aliasMap;
64
65 // just for memory management
66 std::vector<std::unique_ptr<ExtractorDocumentProcessor>> processorPool;
67
68 static void insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap);
69};
70
71class ExtractorDocumentNodeFactoryPrivate {
72public:
73 ExtractorDocumentNodeFactoryStatic *s;
74 std::unique_ptr<ExtractorDocumentProcessor> interceptProcessor;
75};
76}
77
78ExtractorDocumentNodeFactoryStatic::ExtractorDocumentNodeFactoryStatic()
79{
80 registerBuiltIn();
81}
82
83void ExtractorDocumentNodeFactoryStatic::insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap)
84{
85 if (mimeType.empty()) {
86 return;
87 }
88
89 const auto it = std::lower_bound(procMap.begin(), procMap.end(), mimeType, [](const auto &proc, auto mt) {
90 return proc.mimeType < mt;
91 });
92 if (it != procMap.end() && (*it).mimeType == mimeType) {
93 qCWarning(Log) << "Document processor already registered for mimetype:" << mimeType;
94 return;
95 }
96
97 procMap.insert(it, { mimeType.toString(), proc });
98}
99
100void ExtractorDocumentNodeFactoryStatic::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
101 std::initializer_list<QStringView> aliasMimeTypes, QStringView fallbackMimeType)
102{
103 insertProcessor(processor.get(), canonicalMimeType, m_probeProcessors);
104 insertProcessor(processor.get(), canonicalMimeType, m_mimetypeProcessorMap);
105 for (const auto mt : aliasMimeTypes) {
106 m_aliasMap.insert(mt.toString(), canonicalMimeType.isEmpty() ? fallbackMimeType.toString() : canonicalMimeType.toString());
107 }
108 insertProcessor(processor.get(), fallbackMimeType, m_fallbackProbeProcessors);
109 insertProcessor(processor.get(), fallbackMimeType, m_mimetypeProcessorMap);
110 processorPool.push_back(std::move(processor));
111}
112
113void ExtractorDocumentNodeFactoryStatic::registerBuiltIn()
114{
115 registerProcessor<PdfDocumentProcessor>(u"application/pdf");
116 registerProcessor<PkPassDocumentProcessor>(u"application/vnd.apple.pkpass");
117 registerProcessor<IcalEventProcessor>(u"internal/event");
118 registerProcessor<ImageDocumentProcessor>(u"internal/qimage", {u"image/png", u"image/jpeg", u"image/gif"});
119 registerProcessor<ElbDocumentProcessor>(u"internal/era-elb");
120 registerProcessor<SsbDocumentProcessor>(u"internal/era-ssb");
121 registerProcessor<IataBcbpDocumentProcessor>(u"internal/iata-bcbp");
122 registerProcessor<Uic9183DocumentProcessor>(u"internal/uic9183");
123 registerProcessor<VdvDocumentProcessor>(u"internal/vdv");
124 registerProcessor<IcalCalendarProcessor>(u"text/calendar");
125 registerProcessor<PListDocumentProcessor>(u"application/x-plist");
126 registerProcessor<HttpResponseProcessor>(u"internal/http-response");
127 registerProcessor<HarDocumentProcessor>(u"internal/har-archive");
128
129 // fallback types that catch a very broad set of input types
130 // order matters particularly here, the broadest ones need to go last
131 registerProcessor<JsonLdDocumentProcessor>({}, {u"application/json"}, u"application/ld+json");
132 registerProcessor<MimeDocumentProcessor>({}, {u"application/mbox"}, u"message/rfc822");
133 registerProcessor<HtmlDocumentProcessor>({}, {u"application/xhtml+xml"}, u"text/html");
134 registerProcessor<TextDocumentProcessor>({}, {}, u"text/plain");
135 registerProcessor<BinaryDocumentProcessor>({}, {}, u"application/octet-stream");
136}
137
138QStringView ExtractorDocumentNodeFactoryStatic::resolveAlias(QStringView mimeType) const
139{
140 const auto it = m_aliasMap.find(mimeType.toString());
141 if (it != m_aliasMap.end()) {
142 return it.value();
143 }
144 return mimeType;
145}
146
147
148ExtractorDocumentNodeFactory::ExtractorDocumentNodeFactory()
149 : d(std::make_unique<ExtractorDocumentNodeFactoryPrivate>())
150{
151 static ExtractorDocumentNodeFactoryStatic s_factory;
152 d->s = &s_factory;
153}
154
155ExtractorDocumentNodeFactory::~ExtractorDocumentNodeFactory() = default;
156
158{
159 if (data.size() <= MinDocumentSize || data.size() > MaxDocumentSize) {
160 return {};
161 }
162
163 if (d->interceptProcessor && d->interceptProcessor->canHandleData(data, fileName)) {
164 auto node = d->interceptProcessor->createNodeFromData(data);
165 if (node.mimeType().isEmpty()) {
166 node.setMimeType(QStringLiteral("internal/external-process"));
167 }
168 node.setProcessor(d->interceptProcessor.get());
169 return node;
170 }
171
172 QString autoDetectedMimeType;
173 if (mimeType.isEmpty()) {
174 // let processors check themselves if they support this data
175 for (const auto &p : d->s->m_probeProcessors) {
176 if (p.processor->canHandleData(data, fileName)) {
177 auto node = p.processor->createNodeFromData(data);
178 if (node.content().isNull()) {
179 continue;
180 }
181
182 node.setMimeType(p.mimeType);
183 node.setProcessor(p.processor);
184 return node;
185 }
186 }
187 // same again with the basic types that ultimately will accept anything
188 for (const auto &p : d->s->m_fallbackProbeProcessors) {
189 if (p.processor->canHandleData(data, fileName)) {
190 auto node = p.processor->createNodeFromData(data);
191 if (node.content().isNull()) {
192 continue;
193 }
194
195 node.setMimeType(p.mimeType);
196 node.setProcessor(p.processor);
197 return node;
198 }
199 }
200
201 // if none felt responsible, try the generic mimetype detection
202 QMimeDatabase db;
203 if (fileName.isEmpty()) {
204 autoDetectedMimeType = db.mimeTypeForData(data).name();
205 } else {
206 autoDetectedMimeType = db.mimeTypeForFileNameAndData(fileName.toString(), data).name();
207 }
208 mimeType = autoDetectedMimeType;
209 }
210
211 mimeType = d->s->resolveAlias(mimeType);
212 const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
213 return proc.mimeType < mt;
214 });
215 if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
216 qCDebug(Log) << "No document processor found for mimetype" << mimeType;
217 return {};
218 }
219
220 auto node = (*it).processor->createNodeFromData(data);
221 node.setMimeType((*it).mimeType);
222 node.setProcessor((*it).processor);
223 return node;
224}
225
227{
228 mimeType = d->s->resolveAlias(mimeType);
229 const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
230 return proc.mimeType < mt;
231 });
232 if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
233 qCDebug(Log) << "No document processor found for mimetype" << mimeType;
234 return {};
235 }
236
237 auto node = (*it).processor->createNodeFromContent(decodedData);
238 node.setMimeType((*it).mimeType);
239 node.setProcessor((*it).processor);
240 return node;
241}
242
243void ExtractorDocumentNodeFactory::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView mimeType,
244 std::initializer_list<QStringView> aliasMimeTypes)
245{
246 d->s->registerProcessor(std::move(processor), mimeType, aliasMimeTypes);
247}
248
250{
251 if (separateProcess && !d->interceptProcessor) {
252 d->interceptProcessor = std::make_unique<ExternalProcessor>();
253 } else if (!separateProcess && d->interceptProcessor) {
254 d->interceptProcessor.reset();
255 }
256}
void registerProcessor(std::unique_ptr< ExtractorDocumentProcessor > &&processor, QStringView canonicalMimeType, std::initializer_list< QStringView > aliasMimeTypes={})
Register a new document processor.
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
void setUseSeparateProcess(bool separateProcess)
Perform extraction of "risky" content such as PDF files in a separate process.
A node in the extracted document object tree.
Abstract base class of a document type processor.
char * toString(const EngineQuery &query)
KCALUTILS_EXPORT QString mimeType()
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
qsizetype size() const
iterator end()
iterator find(const Key &key)
iterator insert(const Key &key, const T &value)
QMimeType mimeTypeForData(QIODevice *device) const
QMimeType mimeTypeForFileNameAndData(const QString &fileName, QIODevice *device) const
QString & insert(qsizetype position, QChar ch)
bool isEmpty() const
bool isEmpty() const
QString toString() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:14:48 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.