KItinerary

extractordocumentnodefactory.cpp
1 /*
2  SPDX-FileCopyrightText: 2021 Volker Krause <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #include "extractordocumentnodefactory.h"
8 #include "extractordocumentnode.h"
9 #include "extractordocumentprocessor.h"
10 #include "logging.h"
11 
12 #include "processors/binarydocumentprocessor.h"
13 #include "processors/externalprocessor.h"
14 #include "processors/htmldocumentprocessor.h"
15 #include "processors/iatabcbpdocumentprocessor.h"
16 #include "processors/icaldocumentprocessor.h"
17 #include "processors/imagedocumentprocessor.h"
18 #include "processors/jsonlddocumentprocessor.h"
19 #include "processors/mimedocumentprocessor.h"
20 #include "processors/pdfdocumentprocessor.h"
21 #include "processors/pkpassdocumentprocessor.h"
22 #include "processors/ssbdocumentprocessor.h"
23 #include "processors/textdocumentprocessor.h"
24 #include "processors/uic9183documentprocessor.h"
25 #include "processors/vdvdocumentprocessor.h"
26 
27 #include <QHash>
28 #include <QMimeDatabase>
29 
30 using namespace KItinerary;
31 
// Size bounds (in bytes) checked against the raw input data before any
// document processor is consulted; data outside of them yields an empty node.
enum {
    // raw data of this size or smaller is rejected
    MinDocumentSize = 4,
    // raw data larger than this is rejected
    MaxDocumentSize = 4000000,
};
36 
37 namespace KItinerary {
38 class ExtractorDocumentNodeFactoryStatic {
39 public:
40  ExtractorDocumentNodeFactoryStatic();
41 
42  void registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
43  std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {});
44 
45  template <typename T>
46  inline void registerProcessor(QStringView canonicalMimeType, std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {})
47  {
48  registerProcessor(std::make_unique<T>(), canonicalMimeType, aliasMimeTypes, fallbackMimeType);
49  }
50 
51  void registerBuiltIn();
52  QStringView resolveAlias(QStringView mimeType) const;
53 
54  struct ProcessorData {
56  const ExtractorDocumentProcessor* processor;
57  };
58  std::vector<ProcessorData> m_probeProcessors;
59  std::vector<ProcessorData> m_fallbackProbeProcessors;
60  std::vector<ProcessorData> m_mimetypeProcessorMap;
61  QHash<QString, QString> m_aliasMap;
62 
63  // just for memory management
64  std::vector<std::unique_ptr<ExtractorDocumentProcessor>> processorPool;
65 
66  static void insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap);
67 };
68 
69 class ExtractorDocumentNodeFactoryPrivate {
70 public:
71  ExtractorDocumentNodeFactoryStatic *s;
72  std::unique_ptr<ExtractorDocumentProcessor> interceptProcessor;
73 };
74 }
75 
76 ExtractorDocumentNodeFactoryStatic::ExtractorDocumentNodeFactoryStatic()
77 {
78  registerBuiltIn();
79 }
80 
81 void ExtractorDocumentNodeFactoryStatic::insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap)
82 {
83  if (mimeType.empty()) {
84  return;
85  }
86 
87  const auto it = std::lower_bound(procMap.begin(), procMap.end(), mimeType, [](const auto &proc, auto mt) {
88  return proc.mimeType < mt;
89  });
90  if (it != procMap.end() && (*it).mimeType == mimeType) {
91  qCWarning(Log) << "Document processor already registered for mimetype:" << mimeType;
92  return;
93  }
94 
95  procMap.insert(it, { mimeType.toString(), proc });
96 }
97 
98 void ExtractorDocumentNodeFactoryStatic::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
99  std::initializer_list<QStringView> aliasMimeTypes, QStringView fallbackMimeType)
100 {
101  insertProcessor(processor.get(), canonicalMimeType, m_probeProcessors);
102  insertProcessor(processor.get(), canonicalMimeType, m_mimetypeProcessorMap);
103  for (const auto mt : aliasMimeTypes) {
104  m_aliasMap.insert(mt.toString(), canonicalMimeType.isEmpty() ? fallbackMimeType.toString() : canonicalMimeType.toString());
105  }
106  insertProcessor(processor.get(), fallbackMimeType, m_fallbackProbeProcessors);
107  insertProcessor(processor.get(), fallbackMimeType, m_mimetypeProcessorMap);
108  processorPool.push_back(std::move(processor));
109 }
110 
111 void ExtractorDocumentNodeFactoryStatic::registerBuiltIn()
112 {
113  registerProcessor<JsonLdDocumentProcessor>(u"application/ld+json", {u"application/json"});
114  registerProcessor<PdfDocumentProcessor>(u"application/pdf");
115  registerProcessor<PkPassDocumentProcessor>(u"application/vnd.apple.pkpass");
116  registerProcessor<IcalEventProcessor>(u"internal/event");
117  registerProcessor<ImageDocumentProcessor>(u"internal/qimage", {u"image/png"});
118  registerProcessor<SsbDocumentProcessor>(u"internal/era-ssb");
119  registerProcessor<IataBcbpDocumentProcessor>(u"internal/iata-bcbp");
120  registerProcessor<Uic9183DocumentProcessor>(u"internal/uic9183");
121  registerProcessor<VdvDocumentProcessor>(u"internal/vdv");
122  registerProcessor<IcalCalendarProcessor>(u"text/calendar");
123 
124  // fallback types that catch a very broad set of input types
125  // order matters particularly here, the broadest ones need to go last
126  registerProcessor<MimeDocumentProcessor>({}, {u"application/mbox"}, u"message/rfc822");
127  registerProcessor<HtmlDocumentProcessor>({}, {u"application/xhtml+xml"}, u"text/html");
128  registerProcessor<TextDocumentProcessor>({}, {}, u"text/plain");
129  registerProcessor<BinaryDocumentProcessor>({}, {}, u"application/octet-stream");
130 }
131 
132 QStringView ExtractorDocumentNodeFactoryStatic::resolveAlias(QStringView mimeType) const
133 {
134  const auto it = m_aliasMap.find(mimeType.toString());
135  if (it != m_aliasMap.end()) {
136  return it.value();
137  }
138  return mimeType;
139 }
140 
141 
142 ExtractorDocumentNodeFactory::ExtractorDocumentNodeFactory()
143  : d(std::make_unique<ExtractorDocumentNodeFactoryPrivate>())
144 {
145  static ExtractorDocumentNodeFactoryStatic s_factory;
146  d->s = &s_factory;
147 }
148 
149 ExtractorDocumentNodeFactory::~ExtractorDocumentNodeFactory() = default;
150 
152 {
153  if (data.size() <= MinDocumentSize || data.size() > MaxDocumentSize) {
154  return {};
155  }
156 
157  if (d->interceptProcessor && d->interceptProcessor->canHandleData(data, fileName)) {
158  auto node = d->interceptProcessor->createNodeFromData(data);
159  if (node.mimeType().isEmpty()) {
160  node.setMimeType(QStringLiteral("internal/external-process"));
161  }
162  node.setProcessor(d->interceptProcessor.get());
163  return node;
164  }
165 
166  QString autoDetectedMimeType;
167  if (mimeType.isEmpty()) {
168  // let processors check themselves if they support this data
169  for (const auto &p : d->s->m_probeProcessors) {
170  if (p.processor->canHandleData(data, fileName)) {
171  auto node = p.processor->createNodeFromData(data);
172  if (node.content().isNull()) {
173  continue;
174  }
175 
176  node.setMimeType(p.mimeType);
177  node.setProcessor(p.processor);
178  return node;
179  }
180  }
181  // same again with the basic types that ultimately will accept anything
182  for (const auto &p : d->s->m_fallbackProbeProcessors) {
183  if (p.processor->canHandleData(data, fileName)) {
184  auto node = p.processor->createNodeFromData(data);
185  if (node.content().isNull()) {
186  continue;
187  }
188 
189  node.setMimeType(p.mimeType);
190  node.setProcessor(p.processor);
191  return node;
192  }
193  }
194 
195  // if none felt responsible, try the generic mimetype detection
196  QMimeDatabase db;
197  if (fileName.isEmpty()) {
198  autoDetectedMimeType = db.mimeTypeForData(data).name();
199  } else {
200  autoDetectedMimeType = db.mimeTypeForFileNameAndData(fileName.toString(), data).name();
201  }
202  mimeType = autoDetectedMimeType;
203  }
204 
205  mimeType = d->s->resolveAlias(mimeType);
206  const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
207  return proc.mimeType < mt;
208  });
209  if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
210  qCDebug(Log) << "No document processor found for mimetype" << mimeType;
211  return {};
212  }
213 
214  auto node = (*it).processor->createNodeFromData(data);
215  node.setMimeType((*it).mimeType);
216  node.setProcessor((*it).processor);
217  return node;
218 }
219 
221 {
222  mimeType = d->s->resolveAlias(mimeType);
223  const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
224  return proc.mimeType < mt;
225  });
226  if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
227  qCDebug(Log) << "No document processor found for mimetype" << mimeType;
228  return {};
229  }
230 
231  auto node = (*it).processor->createNodeFromContent(decodedData);
232  node.setMimeType((*it).mimeType);
233  node.setProcessor((*it).processor);
234  return node;
235 }
236 
237 void ExtractorDocumentNodeFactory::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView mimeType,
238  std::initializer_list<QStringView> aliasMimeTypes)
239 {
240  d->s->registerProcessor(std::move(processor), mimeType, aliasMimeTypes);
241 }
242 
244 {
245  if (separateProcess && !d->interceptProcessor) {
246  d->interceptProcessor = std::make_unique<ExternalProcessor>();
247  } else if (!separateProcess && d->interceptProcessor) {
248  d->interceptProcessor.reset();
249  }
250 }
A node in the extracted document object tree.
KCALUTILS_EXPORT QString mimeType()
void registerProcessor(std::unique_ptr< ExtractorDocumentProcessor > &&processor, QStringView canonicalMimeType, std::initializer_list< QStringView > aliasMimeTypes={})
Register a new document processor.
Abstract base class of a document type processor.
QMimeType mimeTypeForData(const QByteArray &data) const
QString toString() const
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
bool isEmpty() const
QMimeType mimeTypeForFileNameAndData(const QString &fileName, QIODevice *device) const
bool isEmpty() const
QString & insert(int position, QChar ch)
int size() const
void setUseSeparateProcess(bool separateProcess)
Perform extraction of "risky" content such as PDF files in a separate process.
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Sun Sep 25 2022 03:58:14 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.