KItinerary

extractordocumentnodefactory.cpp
1 /*
2  SPDX-FileCopyrightText: 2021 Volker Krause <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #include "extractordocumentnodefactory.h"
8 #include "extractordocumentnode.h"
9 #include "extractordocumentprocessor.h"
10 #include "logging.h"
11 
12 #include "processors/binarydocumentprocessor.h"
13 #include "processors/eradocumentprocessor.h"
14 #include "processors/externalprocessor.h"
15 #include "processors/htmldocumentprocessor.h"
16 #include "processors/httpresponseprocessor.h"
17 #include "processors/iatabcbpdocumentprocessor.h"
18 #include "processors/icaldocumentprocessor.h"
19 #include "processors/imagedocumentprocessor.h"
20 #include "processors/jsonlddocumentprocessor.h"
21 #include "processors/mimedocumentprocessor.h"
22 #include "processors/pdfdocumentprocessor.h"
23 #include "processors/pkpassdocumentprocessor.h"
24 #include "processors/plistdocumentprocessor.h"
25 #include "processors/textdocumentprocessor.h"
26 #include "processors/uic9183documentprocessor.h"
27 #include "processors/vdvdocumentprocessor.h"
28 
29 #include <QHash>
30 #include <QMimeDatabase>
31 
32 using namespace KItinerary;
33 
// Size bounds, in bytes, applied to raw input data before any processor is consulted.
enum {
    MaxDocumentSize = 10'000'000, // anything larger is rejected
    MinDocumentSize = 4, // data of this size or smaller is rejected
};
38 
39 namespace KItinerary {
40 class ExtractorDocumentNodeFactoryStatic {
41 public:
42  ExtractorDocumentNodeFactoryStatic();
43 
44  void registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
45  std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {});
46 
47  template <typename T>
48  inline void registerProcessor(QStringView canonicalMimeType, std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {})
49  {
50  registerProcessor(std::make_unique<T>(), canonicalMimeType, aliasMimeTypes, fallbackMimeType);
51  }
52 
53  void registerBuiltIn();
54  QStringView resolveAlias(QStringView mimeType) const;
55 
56  struct ProcessorData {
58  const ExtractorDocumentProcessor* processor;
59  };
60  std::vector<ProcessorData> m_probeProcessors;
61  std::vector<ProcessorData> m_fallbackProbeProcessors;
62  std::vector<ProcessorData> m_mimetypeProcessorMap;
63  QHash<QString, QString> m_aliasMap;
64 
65  // just for memory management
66  std::vector<std::unique_ptr<ExtractorDocumentProcessor>> processorPool;
67 
68  static void insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap);
69 };
70 
71 class ExtractorDocumentNodeFactoryPrivate {
72 public:
73  ExtractorDocumentNodeFactoryStatic *s;
74  std::unique_ptr<ExtractorDocumentProcessor> interceptProcessor;
75 };
76 }
77 
78 ExtractorDocumentNodeFactoryStatic::ExtractorDocumentNodeFactoryStatic()
79 {
80  registerBuiltIn();
81 }
82 
83 void ExtractorDocumentNodeFactoryStatic::insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap)
84 {
85  if (mimeType.empty()) {
86  return;
87  }
88 
89  const auto it = std::lower_bound(procMap.begin(), procMap.end(), mimeType, [](const auto &proc, auto mt) {
90  return proc.mimeType < mt;
91  });
92  if (it != procMap.end() && (*it).mimeType == mimeType) {
93  qCWarning(Log) << "Document processor already registered for mimetype:" << mimeType;
94  return;
95  }
96 
97  procMap.insert(it, { mimeType.toString(), proc });
98 }
99 
100 void ExtractorDocumentNodeFactoryStatic::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
101  std::initializer_list<QStringView> aliasMimeTypes, QStringView fallbackMimeType)
102 {
103  insertProcessor(processor.get(), canonicalMimeType, m_probeProcessors);
104  insertProcessor(processor.get(), canonicalMimeType, m_mimetypeProcessorMap);
105  for (const auto mt : aliasMimeTypes) {
106  m_aliasMap.insert(mt.toString(), canonicalMimeType.isEmpty() ? fallbackMimeType.toString() : canonicalMimeType.toString());
107  }
108  insertProcessor(processor.get(), fallbackMimeType, m_fallbackProbeProcessors);
109  insertProcessor(processor.get(), fallbackMimeType, m_mimetypeProcessorMap);
110  processorPool.push_back(std::move(processor));
111 }
112 
113 void ExtractorDocumentNodeFactoryStatic::registerBuiltIn()
114 {
115  registerProcessor<PdfDocumentProcessor>(u"application/pdf");
116  registerProcessor<PkPassDocumentProcessor>(u"application/vnd.apple.pkpass");
117  registerProcessor<IcalEventProcessor>(u"internal/event");
118  registerProcessor<ImageDocumentProcessor>(u"internal/qimage", {u"image/png", u"image/jpeg"});
119  registerProcessor<ElbDocumentProcessor>(u"internal/era-elb");
120  registerProcessor<SsbDocumentProcessor>(u"internal/era-ssb");
121  registerProcessor<IataBcbpDocumentProcessor>(u"internal/iata-bcbp");
122  registerProcessor<Uic9183DocumentProcessor>(u"internal/uic9183");
123  registerProcessor<VdvDocumentProcessor>(u"internal/vdv");
124  registerProcessor<IcalCalendarProcessor>(u"text/calendar");
125  registerProcessor<PListDocumentProcessor>(u"application/x-plist");
126  registerProcessor<HttpResponseProcessor>(u"internal/http-response");
127  registerProcessor<HarDocumentProcessor>(u"internal/har-archive");
128 
129  // fallback types that catch a very broad set of input types
130  // order matters particularly here, the broadest ones need to go last
131  registerProcessor<JsonLdDocumentProcessor>({}, {u"application/json"}, u"application/ld+json");
132  registerProcessor<MimeDocumentProcessor>({}, {u"application/mbox"}, u"message/rfc822");
133  registerProcessor<HtmlDocumentProcessor>({}, {u"application/xhtml+xml"}, u"text/html");
134  registerProcessor<TextDocumentProcessor>({}, {}, u"text/plain");
135  registerProcessor<BinaryDocumentProcessor>({}, {}, u"application/octet-stream");
136 }
137 
138 QStringView ExtractorDocumentNodeFactoryStatic::resolveAlias(QStringView mimeType) const
139 {
140  const auto it = m_aliasMap.find(mimeType.toString());
141  if (it != m_aliasMap.end()) {
142  return it.value();
143  }
144  return mimeType;
145 }
146 
147 
148 ExtractorDocumentNodeFactory::ExtractorDocumentNodeFactory()
149  : d(std::make_unique<ExtractorDocumentNodeFactoryPrivate>())
150 {
151  static ExtractorDocumentNodeFactoryStatic s_factory;
152  d->s = &s_factory;
153 }
154 
155 ExtractorDocumentNodeFactory::~ExtractorDocumentNodeFactory() = default;
156 
158 {
159  if (data.size() <= MinDocumentSize || data.size() > MaxDocumentSize) {
160  return {};
161  }
162 
163  if (d->interceptProcessor && d->interceptProcessor->canHandleData(data, fileName)) {
164  auto node = d->interceptProcessor->createNodeFromData(data);
165  if (node.mimeType().isEmpty()) {
166  node.setMimeType(QStringLiteral("internal/external-process"));
167  }
168  node.setProcessor(d->interceptProcessor.get());
169  return node;
170  }
171 
172  QString autoDetectedMimeType;
173  if (mimeType.isEmpty()) {
174  // let processors check themselves if they support this data
175  for (const auto &p : d->s->m_probeProcessors) {
176  if (p.processor->canHandleData(data, fileName)) {
177  auto node = p.processor->createNodeFromData(data);
178  if (node.content().isNull()) {
179  continue;
180  }
181 
182  node.setMimeType(p.mimeType);
183  node.setProcessor(p.processor);
184  return node;
185  }
186  }
187  // same again with the basic types that ultimately will accept anything
188  for (const auto &p : d->s->m_fallbackProbeProcessors) {
189  if (p.processor->canHandleData(data, fileName)) {
190  auto node = p.processor->createNodeFromData(data);
191  if (node.content().isNull()) {
192  continue;
193  }
194 
195  node.setMimeType(p.mimeType);
196  node.setProcessor(p.processor);
197  return node;
198  }
199  }
200 
201  // if none felt responsible, try the generic mimetype detection
202  QMimeDatabase db;
203  if (fileName.isEmpty()) {
204  autoDetectedMimeType = db.mimeTypeForData(data).name();
205  } else {
206  autoDetectedMimeType = db.mimeTypeForFileNameAndData(fileName.toString(), data).name();
207  }
208  mimeType = autoDetectedMimeType;
209  }
210 
211  mimeType = d->s->resolveAlias(mimeType);
212  const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
213  return proc.mimeType < mt;
214  });
215  if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
216  qCDebug(Log) << "No document processor found for mimetype" << mimeType;
217  return {};
218  }
219 
220  auto node = (*it).processor->createNodeFromData(data);
221  node.setMimeType((*it).mimeType);
222  node.setProcessor((*it).processor);
223  return node;
224 }
225 
227 {
228  mimeType = d->s->resolveAlias(mimeType);
229  const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
230  return proc.mimeType < mt;
231  });
232  if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
233  qCDebug(Log) << "No document processor found for mimetype" << mimeType;
234  return {};
235  }
236 
237  auto node = (*it).processor->createNodeFromContent(decodedData);
238  node.setMimeType((*it).mimeType);
239  node.setProcessor((*it).processor);
240  return node;
241 }
242 
243 void ExtractorDocumentNodeFactory::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView mimeType,
244  std::initializer_list<QStringView> aliasMimeTypes)
245 {
246  d->s->registerProcessor(std::move(processor), mimeType, aliasMimeTypes);
247 }
248 
250 {
251  if (separateProcess && !d->interceptProcessor) {
252  d->interceptProcessor = std::make_unique<ExternalProcessor>();
253  } else if (!separateProcess && d->interceptProcessor) {
254  d->interceptProcessor.reset();
255  }
256 }
A node in the extracted document object tree.
KCALUTILS_EXPORT QString mimeType()
void registerProcessor(std::unique_ptr< ExtractorDocumentProcessor > &&processor, QStringView canonicalMimeType, std::initializer_list< QStringView > aliasMimeTypes={})
Register a new document processor.
Abstract base class of a document type processor.
QMimeType mimeTypeForData(const QByteArray &data) const
QString toString() const
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
bool isEmpty() const
QMimeType mimeTypeForFileNameAndData(const QString &fileName, QIODevice *device) const
bool isEmpty() const
QString & insert(int position, QChar ch)
int size() const
void setUseSeparateProcess(bool separateProcess)
Perform extraction of "risky" content such as PDF files in a separate process.
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Thu Dec 7 2023 04:03:00 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.