KItinerary

mimedocumentprocessor.cpp
1 /*
2  SPDX-FileCopyrightText: 2021 Volker Krause <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #include "mimedocumentprocessor.h"
8 
9 #include <KItinerary/ExtractorDocumentNodeFactory>
10 #include <KItinerary/ExtractorEngine>
11 #include <KItinerary/ExtractorFilter>
12 
13 #include <KMime/Message>
14 
15 #include <QDebug>
16 
17 using namespace KItinerary;
18 
19 Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KMime::Content>)
20 
21 static bool contentMightBeEmail(const QByteArray &data)
22 {
23  // raw email
24  for (const auto c : data) {
25  if (std::isalpha(c) || c == '-') {
26  continue;
27  }
28  if (c == ':') {
29  return true;
30  } else {
31  break;
32  }
33  }
34 
35  // mbox format
36  return data.startsWith("From ");
37 }
38 
39 bool MimeDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const
40 {
41  return contentMightBeEmail(encodedData) ||
42  fileName.endsWith(QLatin1String(".eml"), Qt::CaseInsensitive) ||
43  fileName.endsWith(QLatin1String(".mbox"), Qt::CaseInsensitive);
44 }
45 
46 template <typename T>
47 static const T* findHeader(KMime::Content *content)
48 {
49  auto h = content->header<T>();
50  if (h || !content->parent()) {
51  return h;
52  }
53  return findHeader<T>(content->parent());
54 }
55 
56 static const KMime::Headers::Base* findHeader(KMime::Content *content, const char *headerType)
57 {
58  auto h = content->headerByType(headerType);
59  if (h || !content->parent()) {
60  return h;
61  }
62  return findHeader(content->parent(), headerType);
63 }
64 
66 {
67  auto msg = new KMime::Message;
68  msg->setContent(KMime::CRLFtoLF(encodedData));
69  if (msg->head().isEmpty() || msg->body().isEmpty()) {
70  delete msg;
71  return {};
72  }
73  msg->parse();
74 
76  node.setContent<Internal::OwnedPtr<KMime::Content>>(msg);
77 
78  auto dateHdr = findHeader<KMime::Headers::Date>(msg);
79  if (dateHdr) {
80  node.setContextDateTime(dateHdr->dateTime());
81  }
82 
83  return node;
84 }
85 
87 {
88  KMime::Content *content = decodedData.value<KMime::Content*>();
89  if (!content) {
90  content = decodedData.value<KMime::Message*>();
91  }
92  if (!content) {
93  return {};
94  }
95 
97  node.setContent(content);
98 
99  const auto dateHdr = findHeader<KMime::Headers::Date>(content);
100  if (dateHdr) {
101  node.setContextDateTime(dateHdr->dateTime());
102  }
103 
104  return node;
105 }
106 
107 static ExtractorDocumentNode expandContentNode(ExtractorDocumentNode &node, KMime::Content *content, const ExtractorEngine *engine)
108 {
109  QString fileName;
110  const auto ct = content->contentType(false);
111  if (ct) {
112  fileName = ct->name();
113  }
114  const auto cd = content->contentDisposition(false);
115  if (fileName.isEmpty() && cd) {
116  fileName = cd->filename();
117  }
118 
119  ExtractorDocumentNode child;
120  if ((ct && ct->isPlainText() && fileName.isEmpty()) || (!ct && content->isTopLevel())) {
121  child = engine->documentNodeFactory()->createNode(content->decodedText(), u"text/plain");
122  } else if (ct && ct->isHTMLText()) {
123  child = engine->documentNodeFactory()->createNode(content->decodedText(), u"text/html");
124  } else {
125  child = engine->documentNodeFactory()->createNode(content->decodedContent(), fileName);
126  }
127  node.appendChild(child);
128  return child;
129 }
130 
131 static void expandContentNodeRecursive(ExtractorDocumentNode &node, KMime::Content *content, const ExtractorEngine *engine)
132 {
133  const auto ct = content->contentType(false);
134  const auto children = content->contents();
135  if (!ct || children.empty()) {
136  expandContentNode(node, content, engine);
137  return;
138  }
139 
140  // special handling of multipart/related to add images to the corresponding HTML document
141  if (ct && ct->isMultipart() && ct->isSubtype("related") && ct->parameter(QLatin1String("type")) == QLatin1String("text/html") && children.size() >= 2) {
142  const auto child = children.front();
143  if (child->contentType(false) && child->contentType(false)->isHTMLText()) {
144  auto htmlNode = expandContentNode(node, child, engine);
145  for (auto it = std::next(children.begin()); it != children.end(); ++it) {
146  auto imgNode = expandContentNode(htmlNode, (*it), engine);
147  const auto cid = (*it)->contentID(false);
148  if (cid) {
149  imgNode.setLocation(cid->identifier());
150  }
151  }
152  return;
153  }
154  }
155 
156  for (const auto child : children) {
157  expandContentNodeRecursive(node, child, engine);
158  }
159 }
160 
162 {
163  const auto content = node.content<KMime::Content*>();
164  expandContentNodeRecursive(node, content, engine);
165 }
166 
168 {
169  const auto content = node.content<KMime::Content*>();
170  const auto header = findHeader(content, filter.fieldName().toUtf8().constData());
171  return header ? filter.matches(header->asUnicodeString()) : false;
172 }
173 
175 {
176  destroyIfOwned<KMime::Content>(node);
177 }
void destroyNode(ExtractorDocumentNode &node) const override
Destroys type-specific data in node.
void expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Create child nodes for node, as far as that's necessary for this document type.
Classes for reservation/travel data models, data extraction and data augmentation.
bool matches(const ExtractorFilter &filter, const ExtractorDocumentNode &node) const override
Checks whether the given filter matches node.
bool canHandleData(const QByteArray &encodedData, QStringView fileName) const override
Fast check whether the given encoded data can possibly be processed by this instance.
void appendChild(ExtractorDocumentNode &child)
Add another child node.
void setContextDateTime(const QDateTime &contextDateTime)
Set the context date/time.
Semantic data extraction engine.
T * header(bool create=false)
T value() const
Headers::ContentDisposition * contentDisposition(bool create=true)
ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override
Create a document node from raw data.
QString decodedText(bool trimText=false, bool removeTrailingNewlines=false)
QByteArray decodedContent()
QVector< Content * > contents() const
Determines whether an extractor is applicable to a given email.
CaseInsensitive
bool isEmpty() const
const char * constData() const
ExtractorDocumentNode createNodeFromContent(const QVariant &decodedData) const override
Create a document node from an already decoded data type.
A node in the extracted document object tree.
bool matches(const QString &data) const
Check if data matches this filter.
bool isTopLevel() const
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
Headers::ContentType * contentType(bool create=true)
void setContent(const QByteArray &s)
Headers::Base * headerByType(const char *type) const
QJSValue content
The decoded content of this node.
void setContent(const QVariant &content)
Set decoded content.
bool endsWith(QStringView str, Qt::CaseSensitivity cs) const
int size() const
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
Content * parent() const
QString fieldName() const
The field to filter on.
QByteArray toUtf8() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Mon Jan 17 2022 23:06:16 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.