KFileMetaData

odfextractor.cpp
1 /*
2  SPDX-FileCopyrightText: 2013 Vishesh Handa <[email protected]>
3  SPDX-FileCopyrightText: 2012 Jörg Ehrichs <[email protected]>
4  SPDX-FileCopyrightText: 2016 Christoph Cullmann <[email protected]>
5 
6  SPDX-License-Identifier: LGPL-2.1-or-later
7 */
8 
9 
10 #include "odfextractor.h"
11 #include <memory>
12 
13 #include <KZip>
14 
15 #include <QDebug>
16 #include <QDomDocument>
17 #include <QXmlStreamReader>
18 
19 namespace {
20 
21 inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); }
22 inline QString metaNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); }
23 inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); }
24 
25 QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName)
26 {
27  for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) {
28  if (e.localName() == localName && e.namespaceURI() == nsURI) {
29  return e;
30  }
31  }
32 
33  return QDomElement();
34 }
35 
36 const QStringList supportedMimeTypes = {
37  QStringLiteral("application/vnd.oasis.opendocument.text"),
38  QStringLiteral("application/vnd.oasis.opendocument.text-template"),
39  QStringLiteral("application/vnd.oasis.opendocument.text-master"),
40  QStringLiteral("application/vnd.oasis.opendocument.text-master-template"),
41  QStringLiteral("application/vnd.oasis.opendocument.presentation"),
42  QStringLiteral("application/vnd.oasis.opendocument.presentation-template"),
43  QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"),
44  QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-template"),
45 };
46 
47 }
48 
49 using namespace KFileMetaData;
50 
51 OdfExtractor::OdfExtractor(QObject* parent)
52  : ExtractorPlugin(parent)
53 {
54 
55 }
56 
57 QStringList OdfExtractor::mimetypes() const
58 {
59  return supportedMimeTypes;
60 }
61 
62 void OdfExtractor::extract(ExtractionResult* result)
63 {
64  KZip zip(result->inputUrl());
65  if (!zip.open(QIODevice::ReadOnly)) {
66  qWarning() << "Document is not a valid ZIP archive";
67  return;
68  }
69 
70  const KArchiveDirectory* directory = zip.directory();
71  if (!directory) {
72  qWarning() << "Invalid document structure (main directory is missing)";
73  return;
74  }
75 
76  // we need a meta xml file in the archive!
77  const auto metaXml = directory->file(QStringLiteral("meta.xml"));
78  if (!metaXml) {
79  qWarning() << "Invalid document structure (meta.xml is missing)";
80  return;
81  }
82 
83  if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
84  QDomDocument metaData(QStringLiteral("metaData"));
85  metaData.setContent(metaXml->data(), true);
86 
87  // parse metadata ...
89  officeNS(), QStringLiteral("document-meta")),
90  officeNS(), QStringLiteral("meta"));
91 
92  QDomNode n = meta.firstChild();
93  while (!n.isNull()) {
94  QDomElement e = n.toElement();
95  if (!e.isNull()) {
96  const QString namespaceURI = e.namespaceURI();
97  const QString localName = e.localName();
98 
99  // Dublin Core
100  if (namespaceURI == dcNS()) {
101  if (localName == QLatin1String("description")) {
102  result->add(Property::Description, e.text());
103  } else if (localName == QLatin1String("subject")) {
104  result->add(Property::Subject, e.text());
105  } else if (localName == QLatin1String("title")) {
106  result->add(Property::Title, e.text());
107  } else if (localName == QLatin1String("creator")) {
108  result->add(Property::Author, e.text());
109  } else if (localName == QLatin1String("language")) {
110  result->add(Property::Language, e.text());
111  }
112  }
113  // Meta Properties
114  else if (namespaceURI == metaNS()) {
115  if (localName == QLatin1String("document-statistic")) {
116  bool ok = false;
117  int pageCount = e.attributeNS(metaNS(), QStringLiteral("page-count")).toInt(&ok);
118  if (ok) {
119  result->add(Property::PageCount, pageCount);
120  }
121 
122  int wordCount = e.attributeNS(metaNS(), QStringLiteral("word-count")).toInt(&ok);
123  if (ok) {
124  result->add(Property::WordCount, wordCount);
125  }
126  } else if (localName == QLatin1String("keyword")) {
127  QString keywords = e.text();
128  result->add(Property::Keywords, keywords);
129  } else if (localName == QLatin1String("generator")) {
130  result->add(Property::Generator, e.text());
131  } else if (localName == QLatin1String("creation-date")) {
133  if (!dt.isNull()) {
134  result->add(Property::CreationDate, dt);
135  }
136  }
137  }
138  }
139  n = n.nextSibling();
140  }
141  }
142 
143  result->addType(Type::Document);
144  if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) ||
145  (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-template"))) {
146  result->addType(Type::Presentation);
147  }
148  else if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) ||
149  (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-template"))) {
150  result->addType(Type::Spreadsheet);
151  }
152 
153  if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
154  return;
155  }
156 
157  // for content indexing, we need content xml file
158  const auto contentXml = directory->file(QStringLiteral("content.xml"));
159  if (!contentXml) {
160  qWarning() << "Invalid document structure (content.xml is missing)";
161  return;
162  }
163 
164  std::unique_ptr<QIODevice> contentIODevice{contentXml->createDevice()};
165  QXmlStreamReader xml(contentIODevice.get());
166  while (!xml.atEnd()) {
167  xml.readNext();
168  if (xml.isCharacters()) {
169  QString str = xml.text().toString();
170  result->append(str);
171  }
172 
173  if (xml.hasError() || xml.isEndDocument()) {
174  break;
175  }
176  }
177 }
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString text() const
QDomNode firstChild() const
QDomElement toElement() const
The ExtractionResult class is where all the data extracted by the indexer is saved....
const KArchiveFile * file(const QString &name) const
QString inputMimetype() const
The input mimetype.
bool isNull() const
bool isNull() const
virtual QIODevice * createDevice() const
QString inputUrl() const
The input url which the plugins will use to locate the file.
QString attributeNS(const QString nsURI, const QString &localName, const QString &defValue) const
QString namespaceURI() const
QString localName() const
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
int toInt(bool *ok, int base) const
QDomElement firstChildElement(const QString &tagName) const
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
QDomNode nextSibling() const
The ExtractorPlugin is the base class for all file metadata extractors. It is responsible for extract...
QDomElement KPIMKDAV2_EXPORT firstChildElementNS(const QDomElement &parent, const QString &namespaceUri, const QString &tagName)
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Thu May 26 2022 03:46:07 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.