KFileMetaData

office2007extractor.cpp
1 /*
2  SPDX-FileCopyrightText: 2013 Vishesh Handa <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.1-or-later
5 */
6 
7 
8 #include "office2007extractor.h"
9 #include <memory>
10 
11 #include <KZip>
12 
13 #include <QDebug>
14 #include <QDomDocument>
15 #include <QXmlStreamReader>
16 
17 using namespace KFileMetaData;
18 
19 Office2007Extractor::Office2007Extractor(QObject* parent)
20  : ExtractorPlugin(parent)
21 {
22 
23 }
24 
25 const QStringList supportedMimeTypes = {
26  QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
27  QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
28  QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
29  QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"),
30  QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
31  QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"),
32  QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
33  QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
34 };
35 
36 QStringList Office2007Extractor::mimetypes() const
37 {
38  return supportedMimeTypes;
39 }
40 
41 void Office2007Extractor::extract(ExtractionResult* result)
42 {
43  KZip zip(result->inputUrl());
44  if (!zip.open(QIODevice::ReadOnly)) {
45  qWarning() << "Document is not a valid ZIP archive";
46  return;
47  }
48 
49  const KArchiveDirectory* rootDir = zip.directory();
50  if (!rootDir) {
51  qWarning() << "Invalid document structure (main directory is missing)";
52  return;
53  }
54 
55  const QStringList rootEntries = rootDir->entries();
56  if (!rootEntries.contains(QStringLiteral("docProps"))) {
57  qWarning() << "Invalid document structure (docProps is missing)";
58  return;
59  }
60 
61  const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps"));
62  if (!docPropEntry->isDirectory()) {
63  qWarning() << "Invalid document structure (docProps is not a directory)";
64  return;
65  }
66 
67  const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>(docPropEntry);
68 
69  const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData;
70 
71  const KArchiveFile* file = docPropDirectory->file(QStringLiteral("core.xml"));
72  if (extractMetaData && file) {
73  QDomDocument coreDoc(QStringLiteral("core"));
74  coreDoc.setContent(file->data());
75 
76  QDomElement docElem = coreDoc.documentElement();
77 
78  QDomElement elem = docElem.firstChildElement(QStringLiteral("dc:description"));
79  if (!elem.isNull()) {
80  QString str = elem.text();
81  if (!str.isEmpty()) {
82  result->add(Property::Description, str);
83  }
84  }
85 
86  elem = docElem.firstChildElement(QStringLiteral("dc:subject"));
87  if (!elem.isNull()) {
88  QString str = elem.text();
89  if (!str.isEmpty()) {
90  result->add(Property::Subject, str);
91  }
92  }
93 
94  elem = docElem.firstChildElement(QStringLiteral("dc:title"));
95  if (!elem.isNull()) {
96  QString str = elem.text();
97  if (!str.isEmpty()) {
98  result->add(Property::Title, str);
99  }
100  }
101 
102  elem = docElem.firstChildElement(QStringLiteral("dc:creator"));
103  if (!elem.isNull()) {
104  QString str = elem.text();
105  if (!str.isEmpty()) {
106  result->add(Property::Author, str);
107  }
108  }
109 
110  elem = docElem.firstChildElement(QStringLiteral("dc:language"));
111  if (!elem.isNull()) {
112  QString str = elem.text();
113  if (!str.isEmpty()) {
114  result->add(Property::Language, str);
115  }
116  }
117 
118  elem = docElem.firstChildElement(QStringLiteral("dcterms:created"));
119  if (!elem.isNull()) {
120  QString str = elem.text();
121  QDateTime dt = dateTimeFromString(str);
122  if (!dt.isNull()) {
123  result->add(Property::CreationDate, dt);
124  }
125  }
126 
127  elem = docElem.firstChildElement(QStringLiteral("cp:keywords"));
128  if (!elem.isNull()) {
129  QString str = elem.text();
130  if (!str.isEmpty()) {
131  result->add(Property::Keywords, str);
132  }
133  }
134  }
135 
136  file = docPropDirectory->file(QStringLiteral("app.xml"));
137  if (extractMetaData && file) {
138  QDomDocument appDoc(QStringLiteral("app"));
139  appDoc.setContent(file->data());
140 
141  QDomElement docElem = appDoc.documentElement();
142 
143  // According to the ontologies only Documents can have a wordCount and pageCount
144  const QString mimeType = result->inputMimetype();
145  if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
146  QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages"));
147  if (!elem.isNull()) {
148  bool ok = false;
149  int pageCount = elem.text().toInt(&ok);
150  if (ok) {
151  result->add(Property::PageCount, pageCount);
152  }
153  }
154 
155  elem = docElem.firstChildElement(QStringLiteral("Words"));
156  if (!elem.isNull()) {
157  bool ok = false;
158  int wordCount = elem.text().toInt(&ok);
159  if (ok) {
160  result->add(Property::WordCount, wordCount);
161  }
162  }
163  }
164 
165  QDomElement elem = docElem.firstChildElement(QStringLiteral("Application"));
166  if (!elem.isNull()) {
167  QString app = elem.text();
168  if (!app.isEmpty()) {
169  result->add(Property::Generator, app);
170  }
171  }
172  }
173 
174  //
175  // Plain Text
176  //
177  bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText);
178 
179  if (rootEntries.contains(QStringLiteral("word"))) {
180  result->addType(Type::Document);
181 
182  if (!extractPlainText) {
183  return;
184  }
185 
186  const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word"));
187  if (!wordEntry->isDirectory()) {
188  qWarning() << "Invalid document structure (word is not a directory)";
189  return;
190  }
191 
192  const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry);
193  const QStringList wordEntries = wordDirectory->entries();
194 
195  if (wordEntries.contains(QStringLiteral("document.xml"))) {
196  const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml"));
197 
198  if (file) {
199  std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
200  extractTextWithTag(contentIODevice.get(), QStringLiteral("w:t"), result);
201  }
202  }
203  }
204 
205  else if (rootEntries.contains(QStringLiteral("xl"))) {
206  result->addType(Type::Document);
207  result->addType(Type::Spreadsheet);
208 
209  if (!extractPlainText) {
210  return;
211  }
212 
213  const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl"));
214  if (!xlEntry->isDirectory()) {
215  qWarning() << "Invalid document structure (xl is not a directory)";
216  return;
217  }
218 
219  const KArchiveDirectory* xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry);
220  extractTextFromFiles(xlDirectory, result);
221  }
222 
223  else if (rootEntries.contains(QStringLiteral("ppt"))) {
224  result->addType(Type::Document);
225  result->addType(Type::Presentation);
226 
227  if (!extractPlainText) {
228  return;
229  }
230 
231  const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt"));
232  if (!pptEntry->isDirectory()) {
233  qWarning() << "Invalid document structure (ppt is not a directory)";
234  return;
235  }
236 
237  const KArchiveDirectory* pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry);
238  extractTextFromFiles(pptDirectory, result);
239  }
240 }
241 
242 void Office2007Extractor::extractAllText(QIODevice* device, ExtractionResult* result)
243 {
244  QXmlStreamReader xml(device);
245 
246  while (!xml.atEnd()) {
247  xml.readNext();
248  if (xml.isCharacters()) {
249  QString str = xml.text().toString();
250  result->append(str);
251  }
252 
253  if (xml.isEndDocument() || xml.hasError()) {
254  break;
255  }
256  }
257 }
258 
259 void Office2007Extractor::extractTextFromFiles(const KArchiveDirectory* archiveDir, ExtractionResult* result)
260 {
261  const QStringList entries = archiveDir->entries();
262  for (const QString & entryName : entries) {
263  const KArchiveEntry* entry = archiveDir->entry(entryName);
264  if (!entry) {
265  continue;
266  }
267  if (entry->isDirectory()) {
268  const KArchiveDirectory* subDir = dynamic_cast<const KArchiveDirectory*>(entry);
269  extractTextFromFiles(subDir, result);
270  continue;
271  }
272 
273  if (entry->isFile() && entryName.endsWith(QLatin1String(".xml"))) {
274  const KArchiveFile* file = static_cast<const KArchiveFile*>(entry);
275  std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
276  extractAllText(contentIODevice.get() , result);
277  }
278  }
279 }
280 
281 void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result)
282 {
283  QXmlStreamReader xml(device);
284 
285  while (!xml.atEnd()) {
286  xml.readNext();
287  if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) {
288  QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements);
289 
290  if (!str.isEmpty()) {
291  result->append(str);
292  }
293  }
294 
295  if (xml.isEndDocument() || xml.hasError()) {
296  break;
297  }
298  }
299 }
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString text() const const
The ExtractionResult class is where all the data extracted by the indexer is saved....
const KArchiveFile * file(const QString &name) const
QString inputMimetype() const
The input mimetype.
bool isNull() const const
bool contains(const QString &str, Qt::CaseSensitivity cs) const const
bool isNull() const const
KCALUTILS_EXPORT QString mimeType()
virtual QIODevice * createDevice() const
QString inputUrl() const
The input url which the plugins will use to locate the file.
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
bool isEmpty() const const
QStringList entries() const
int toInt(bool *ok, int base) const const
virtual bool isFile() const
QDomElement firstChildElement(const QString &tagName) const const
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
The ExtractorPlugin is the base class for all file metadata extractors. It is responsible for extract...
virtual QByteArray data() const
virtual bool isDirectory() const
const KArchiveEntry * entry(const QString &name) const
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Fri May 27 2022 03:47:54 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.