KFileMetaData

odfextractor.cpp
1 /*
2  KFileMetaData extractor plugin that reads metadata and plain text from OpenDocument (ODF) files.
3  Copyright (C) 2013 Vishesh Handa <[email protected]>
4  Copyright (C) 2012 Jörg Ehrichs <[email protected]>
5  Copyright (C) 2016 Christoph Cullmann <[email protected]>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 2.1 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21 
22 
#include "odfextractor.h"

#include <KZip>

#include <QDateTime>
#include <QDebug>
#include <QDomDocument>
#include <QScopedPointer>
#include <QXmlStreamReader>
30 
31 namespace {
32 
33 inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); }
34 inline QString metaNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); }
35 inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); }
36 
37 QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName)
38 {
39  for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) {
40  if (e.localName() == localName && e.namespaceURI() == nsURI) {
41  return e;
42  }
43  }
44 
45  return QDomElement();
46 }
47 
48 const QStringList supportedMimeTypes = {
49  QStringLiteral("application/vnd.oasis.opendocument.text"),
50  QStringLiteral("application/vnd.oasis.opendocument.presentation"),
51  QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"),
52 };
53 
54 }
55 
56 using namespace KFileMetaData;
57 
58 OdfExtractor::OdfExtractor(QObject* parent)
59  : ExtractorPlugin(parent)
60 {
61 
62 }
63 
64 QStringList OdfExtractor::mimetypes() const
65 {
66  return supportedMimeTypes;
67 }
68 
69 void OdfExtractor::extract(ExtractionResult* result)
70 {
71  KZip zip(result->inputUrl());
72  if (!zip.open(QIODevice::ReadOnly)) {
73  qWarning() << "Document is not a valid ZIP archive";
74  return;
75  }
76 
77  const KArchiveDirectory* directory = zip.directory();
78  if (!directory) {
79  qWarning() << "Invalid document structure (main directory is missing)";
80  return;
81  }
82 
83  // we need a meta xml file in the archive!
84  const auto metaXml = directory->entry(QStringLiteral("meta.xml"));
85  if (!metaXml || !metaXml->isFile()) {
86  qWarning() << "Invalid document structure (meta.xml is missing)";
87  return;
88  }
89 
90  if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
91  QDomDocument metaData(QStringLiteral("metaData"));
92  metaData.setContent(static_cast<const KArchiveFile*>(metaXml)->data(), true);
93 
94  // parse metadata ...
96  officeNS(), QStringLiteral("document-meta")),
97  officeNS(), QStringLiteral("meta"));
98 
99  QDomNode n = meta.firstChild();
100  while (!n.isNull()) {
101  QDomElement e = n.toElement();
102  if (!e.isNull()) {
103  const QString namespaceURI = e.namespaceURI();
104  const QString localName = e.localName();
105 
106  // Dublin Core
107  if (namespaceURI == dcNS()) {
108  if (localName == QLatin1String("description")) {
109  result->add(Property::Description, e.text());
110  } else if (localName == QLatin1String("subject")) {
111  result->add(Property::Subject, e.text());
112  } else if (localName == QLatin1String("title")) {
113  result->add(Property::Title, e.text());
114  } else if (localName == QLatin1String("creator")) {
115  result->add(Property::Author, e.text());
116  } else if (localName == QLatin1String("language")) {
117  result->add(Property::Language, e.text());
118  }
119  }
120  // Meta Properties
121  else if (namespaceURI == metaNS()) {
122  if (localName == QLatin1String("document-statistic")) {
123  bool ok = false;
124  int pageCount = e.attributeNS(metaNS(), QStringLiteral("page-count")).toInt(&ok);
125  if (ok) {
126  result->add(Property::PageCount, pageCount);
127  }
128 
129  int wordCount = e.attributeNS(metaNS(), QStringLiteral("word-count")).toInt(&ok);
130  if (ok) {
131  result->add(Property::WordCount, wordCount);
132  }
133  } else if (localName == QLatin1String("keyword")) {
134  QString keywords = e.text();
135  result->add(Property::Keywords, keywords);
136  } else if (localName == QLatin1String("generator")) {
137  result->add(Property::Generator, e.text());
138  } else if (localName == QLatin1String("creation-date")) {
140  if (!dt.isNull())
141  result->add(Property::CreationDate, dt);
142  }
143  }
144  }
145  n = n.nextSibling();
146  }
147  }
148 
149  result->addType(Type::Document);
150  if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) {
151  result->addType(Type::Presentation);
152  }
153  else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) {
154  result->addType(Type::Spreadsheet);
155  }
156 
157  if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
158  return;
159  }
160 
161  // for content indexing, we need content xml file
162  const auto contentXml = directory->entry(QStringLiteral("content.xml"));
163  if (!contentXml || !contentXml->isFile()) {
164  qWarning() << "Invalid document structure (content.xml is missing)";
165  return;
166  }
167 
168  QXmlStreamReader xml(static_cast<const KArchiveFile*>(contentXml)->createDevice());
169  while (!xml.atEnd()) {
170  xml.readNext();
171  if (xml.isCharacters()) {
172  QString str = xml.text().toString();
173  result->append(str);
174  }
175 
176  if (xml.hasError() || xml.isEndDocument())
177  break;
178  }
179 }
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
QString attributeNS(const QString nsURI, const QString &localName, const QString &defValue) const
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
The ExtractorPlugin is the base class for all file metadata extractors.
QString namespaceURI() const
QDomNode nextSibling() const
QDomElement toElement() const
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString localName() const
QString text() const
int toInt(bool *ok, int base) const
QString inputMimetype() const
The input mimetype.
const KArchiveEntry * entry(const QString &name) const
QDomElement KPIMKDAV2_EXPORT firstChildElementNS(const QDomElement &parent, const QString &namespaceUri, const QString &tagName)
bool isNull() const
QDomNode firstChild() const
bool isNull() const
QDomElement firstChildElement(const QString &tagName) const
The ExtractionResult class is where all the data extracted by the indexer is saved.
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
QString inputUrl() const
The input url which the plugins will use to locate the file.
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Mon May 25 2020 23:11:16 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.