KFileMetaData

office2007extractor.cpp
1 /*
2  Extracts metadata and plain text from Office Open XML (Office 2007) documents.
3  Copyright (C) 2013 Vishesh Handa <[email protected]>
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License as published by the Free Software Foundation; either
8  version 2.1 of the License, or (at your option) any later version.
9 
10  This library is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with this library; if not, write to the Free Software
17  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19 
20 
#include "office2007extractor.h"

#include <KZip>

#include <QDebug>
#include <QDomDocument>
#include <QXmlStreamReader>

#include <memory>
28 
29 using namespace KFileMetaData;
30 
31 Office2007Extractor::Office2007Extractor(QObject* parent)
32  : ExtractorPlugin(parent)
33 {
34 
35 }
36 
37 const QStringList supportedMimeTypes = {
38  QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
39  QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
40  QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
41 };
42 
43 QStringList Office2007Extractor::mimetypes() const
44 {
45  return supportedMimeTypes;
46 }
47 
48 void Office2007Extractor::extract(ExtractionResult* result)
49 {
50  KZip zip(result->inputUrl());
51  if (!zip.open(QIODevice::ReadOnly)) {
52  qWarning() << "Document is not a valid ZIP archive";
53  return;
54  }
55 
56  const KArchiveDirectory* rootDir = zip.directory();
57  if (!rootDir) {
58  qWarning() << "Invalid document structure (main directory is missing)";
59  return;
60  }
61 
62  const QStringList rootEntries = rootDir->entries();
63  if (!rootEntries.contains(QStringLiteral("docProps"))) {
64  qWarning() << "Invalid document structure (docProps is missing)";
65  return;
66  }
67 
68  const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps"));
69  if (!docPropEntry->isDirectory()) {
70  qWarning() << "Invalid document structure (docProps is not a directory)";
71  return;
72  }
73 
74  const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>(docPropEntry);
75  const QStringList docPropsEntries = docPropDirectory->entries();
76 
77  const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData;
78 
79  if (extractMetaData && docPropsEntries.contains(QStringLiteral("core.xml"))) {
80  QDomDocument coreDoc(QStringLiteral("core"));
81  const KArchiveFile* file = static_cast<const KArchiveFile*>(docPropDirectory->entry(QStringLiteral("core.xml")));
82  coreDoc.setContent(file->data());
83 
84  QDomElement docElem = coreDoc.documentElement();
85 
86  QDomElement elem = docElem.firstChildElement(QStringLiteral("dc:description"));
87  if (!elem.isNull()) {
88  QString str = elem.text();
89  if (!str.isEmpty()) {
90  result->add(Property::Description, str);
91  }
92  }
93 
94  elem = docElem.firstChildElement(QStringLiteral("dc:subject"));
95  if (!elem.isNull()) {
96  QString str = elem.text();
97  if (!str.isEmpty()) {
98  result->add(Property::Subject, str);
99  }
100  }
101 
102  elem = docElem.firstChildElement(QStringLiteral("dc:title"));
103  if (!elem.isNull()) {
104  QString str = elem.text();
105  if (!str.isEmpty()) {
106  result->add(Property::Title, str);
107  }
108  }
109 
110  elem = docElem.firstChildElement(QStringLiteral("dc:creator"));
111  if (!elem.isNull()) {
112  QString str = elem.text();
113  if (!str.isEmpty()) {
114  result->add(Property::Author, str);
115  }
116  }
117 
118  elem = docElem.firstChildElement(QStringLiteral("dc:language"));
119  if (!elem.isNull()) {
120  QString str = elem.text();
121  if (!str.isEmpty()) {
122  result->add(Property::Language, str);
123  }
124  }
125 
126  elem = docElem.firstChildElement(QStringLiteral("dcterms:created"));
127  if (!elem.isNull()) {
128  QString str = elem.text();
129  QDateTime dt = dateTimeFromString(str);
130  if (!dt.isNull()) {
131  result->add(Property::CreationDate, dt);
132  }
133  }
134 
135  elem = docElem.firstChildElement(QStringLiteral("cp:keywords"));
136  if (!elem.isNull()) {
137  QString str = elem.text();
138  if (!str.isEmpty()) {
139  result->add(Property::Keywords, str);
140  }
141  }
142  }
143 
144  if (extractMetaData && docPropsEntries.contains(QStringLiteral("app.xml"))) {
145  QDomDocument appDoc(QStringLiteral("app"));
146  const KArchiveFile* file = static_cast<const KArchiveFile*>(docPropDirectory->entry(QStringLiteral("app.xml")));
147  appDoc.setContent(file->data());
148 
149  QDomElement docElem = appDoc.documentElement();
150 
151  // According to the ontologies only Documents can have a wordCount and pageCount
152  const QString mimeType = result->inputMimetype();
153  if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
154  QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages"));
155  if (!elem.isNull()) {
156  bool ok = false;
157  int pageCount = elem.text().toInt(&ok);
158  if (ok) {
159  result->add(Property::PageCount, pageCount);
160  }
161  }
162 
163  elem = docElem.firstChildElement(QStringLiteral("Words"));
164  if (!elem.isNull()) {
165  bool ok = false;
166  int wordCount = elem.text().toInt(&ok);
167  if (ok) {
168  result->add(Property::WordCount, wordCount);
169  }
170  }
171  }
172 
173  QDomElement elem = docElem.firstChildElement(QStringLiteral("Application"));
174  if (!elem.isNull()) {
175  QString app = elem.text();
176  if (!app.isEmpty()) {
177  result->add(Property::Generator, app);
178  }
179  }
180  }
181 
182  //
183  // Plain Text
184  //
185  bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText);
186 
187  if (rootEntries.contains(QStringLiteral("word"))) {
188  result->addType(Type::Document);
189 
190  if (!extractPlainText)
191  return;
192 
193  const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word"));
194  if (!wordEntry->isDirectory()) {
195  qWarning() << "Invalid document structure (word is not a directory)";
196  return;
197  }
198 
199  const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry);
200  const QStringList wordEntries = wordDirectory->entries();
201 
202  if (wordEntries.contains(QStringLiteral("document.xml"))) {
203  const KArchiveFile* file = static_cast<const KArchiveFile*>(wordDirectory->entry(QStringLiteral("document.xml")));
204 
205  extractTextWithTag(file->createDevice(), QStringLiteral("w:t"), result);
206  }
207  }
208 
209  else if (rootEntries.contains(QStringLiteral("xl"))) {
210  result->addType(Type::Document);
211  result->addType(Type::Spreadsheet);
212 
213  if (!extractPlainText)
214  return;
215 
216  const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl"));
217  if (!xlEntry->isDirectory()) {
218  qWarning() << "Invalid document structure (xl is not a directory)";
219  return;
220  }
221 
222  const KArchiveDirectory* xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry);
223  extractTextFromFiles(xlDirectory, result);
224  }
225 
226  else if (rootEntries.contains(QStringLiteral("ppt"))) {
227  result->addType(Type::Document);
228  result->addType(Type::Presentation);
229 
230  if (!extractPlainText)
231  return;
232 
233  const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt"));
234  if (!pptEntry->isDirectory()) {
235  qWarning() << "Invalid document structure (ppt is not a directory)";
236  return;
237  }
238 
239  const KArchiveDirectory* pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry);
240  extractTextFromFiles(pptDirectory, result);
241  }
242 }
243 
244 void Office2007Extractor::extractAllText(QIODevice* device, ExtractionResult* result)
245 {
246  QXmlStreamReader xml(device);
247 
248  while (!xml.atEnd()) {
249  xml.readNext();
250  if (xml.isCharacters()) {
251  QString str = xml.text().toString();
252  result->append(str);
253  }
254 
255  if (xml.isEndDocument() || xml.hasError())
256  break;
257  }
258 }
259 
260 void Office2007Extractor::extractTextFromFiles(const KArchiveDirectory* archiveDir, ExtractionResult* result)
261 {
262  const QStringList entries = archiveDir->entries();
263  for (const QString & entryName : entries) {
264  const KArchiveEntry* entry = archiveDir->entry(entryName);
265  if (entry->isDirectory()) {
266  const KArchiveDirectory* subDir = dynamic_cast<const KArchiveDirectory*>(entry);
267  extractTextFromFiles(subDir, result);
268  continue;
269  }
270 
271  if (!entryName.endsWith(QLatin1String(".xml")))
272  continue;
273 
274  const KArchiveFile* file = static_cast<const KArchiveFile*>(entry);
275  extractAllText(file->createDevice(), result);
276  }
277 }
278 
279 void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result)
280 {
281  QXmlStreamReader xml(device);
282 
283  while (!xml.atEnd()) {
284  xml.readNext();
285  if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) {
286  QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements);
287 
288  if (!str.isEmpty()) {
289  result->append(str);
290  }
291  }
292 
293  if (xml.isEndDocument() || xml.hasError())
294  break;
295  }
296 }
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
virtual bool isDirectory() const
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
The ExtractorPlugin is the base class for all file metadata extractors.
bool contains(const QString &str, Qt::CaseSensitivity cs) const
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString text() const
virtual QIODevice * createDevice() const
int toInt(bool *ok, int base) const
bool isEmpty() const
QString inputMimetype() const
The input mimetype.
const KArchiveEntry * entry(const QString &name) const
bool isNull() const
bool isNull() const
QDomElement firstChildElement(const QString &tagName) const
QString mimeType(Type)
virtual QByteArray data() const
The ExtractionResult class is where all the data extracted by the indexer is saved.
QStringList entries() const
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
QString inputUrl() const
The input url which the plugins will use to locate the file.
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Tue May 26 2020 23:11:36 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.