KFileMetaData

office2007extractor.cpp
1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3
4 SPDX-License-Identifier: LGPL-2.1-or-later
5*/
6
7
8#include "office2007extractor.h"
9#include <memory>
10
11#include <KZip>
12
13#include <QDebug>
14#include <QDomDocument>
15#include <QXmlStreamReader>
16
17using namespace KFileMetaData;
18
19Office2007Extractor::Office2007Extractor(QObject* parent)
20 : ExtractorPlugin(parent)
21{
22
23}
24
25const QStringList supportedMimeTypes = {
26 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
27 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
28 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
29 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"),
30 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
31 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"),
32 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
33 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
34};
35
36QStringList Office2007Extractor::mimetypes() const
37{
38 return supportedMimeTypes;
39}
40
41void Office2007Extractor::extract(ExtractionResult* result)
42{
43 KZip zip(result->inputUrl());
44 if (!zip.open(QIODevice::ReadOnly)) {
45 qWarning() << "Document is not a valid ZIP archive";
46 return;
47 }
48
49 const KArchiveDirectory* rootDir = zip.directory();
50 if (!rootDir) {
51 qWarning() << "Invalid document structure (main directory is missing)";
52 return;
53 }
54
55 const QStringList rootEntries = rootDir->entries();
56 if (!rootEntries.contains(QStringLiteral("docProps"))) {
57 qWarning() << "Invalid document structure (docProps is missing)";
58 return;
59 }
60
61 const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps"));
62 if (!docPropEntry->isDirectory()) {
63 qWarning() << "Invalid document structure (docProps is not a directory)";
64 return;
65 }
66
67 const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>(docPropEntry);
68
69 const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData;
70
71 const KArchiveFile* file = docPropDirectory->file(QStringLiteral("core.xml"));
72 if (extractMetaData && file) {
73 QDomDocument coreDoc(QStringLiteral("core"));
74 coreDoc.setContent(file->data());
75
76 QDomElement docElem = coreDoc.documentElement();
77
78 QDomElement elem = docElem.firstChildElement(QStringLiteral("dc:description"));
79 if (!elem.isNull()) {
80 QString str = elem.text();
81 if (!str.isEmpty()) {
82 result->add(Property::Description, str);
83 }
84 }
85
86 elem = docElem.firstChildElement(QStringLiteral("dc:subject"));
87 if (!elem.isNull()) {
88 QString str = elem.text();
89 if (!str.isEmpty()) {
90 result->add(Property::Subject, str);
91 }
92 }
93
94 elem = docElem.firstChildElement(QStringLiteral("dc:title"));
95 if (!elem.isNull()) {
96 QString str = elem.text();
97 if (!str.isEmpty()) {
98 result->add(Property::Title, str);
99 }
100 }
101
102 elem = docElem.firstChildElement(QStringLiteral("dc:creator"));
103 if (!elem.isNull()) {
104 QString str = elem.text();
105 if (!str.isEmpty()) {
106 result->add(Property::Author, str);
107 }
108 }
109
110 elem = docElem.firstChildElement(QStringLiteral("dc:language"));
111 if (!elem.isNull()) {
112 QString str = elem.text();
113 if (!str.isEmpty()) {
114 result->add(Property::Language, str);
115 }
116 }
117
118 elem = docElem.firstChildElement(QStringLiteral("dcterms:created"));
119 if (!elem.isNull()) {
120 QString str = elem.text();
122 if (!dt.isNull()) {
123 result->add(Property::CreationDate, dt);
124 }
125 }
126
127 elem = docElem.firstChildElement(QStringLiteral("cp:keywords"));
128 if (!elem.isNull()) {
129 QString str = elem.text();
130 if (!str.isEmpty()) {
131 result->add(Property::Keywords, str);
132 }
133 }
134 }
135
136 file = docPropDirectory->file(QStringLiteral("app.xml"));
137 if (extractMetaData && file) {
138 QDomDocument appDoc(QStringLiteral("app"));
139 appDoc.setContent(file->data());
140
141 QDomElement docElem = appDoc.documentElement();
142
143 const QString mimeType = result->inputMimetype();
144 if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
145 QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages"));
146 if (!elem.isNull()) {
147 bool ok = false;
148 int pageCount = elem.text().toInt(&ok);
149 if (ok) {
150 result->add(Property::PageCount, pageCount);
151 }
152 }
153
154 elem = docElem.firstChildElement(QStringLiteral("Words"));
155 if (!elem.isNull()) {
156 bool ok = false;
157 int wordCount = elem.text().toInt(&ok);
158 if (ok) {
159 result->add(Property::WordCount, wordCount);
160 }
161 }
162
163 elem = docElem.firstChildElement(QStringLiteral("Lines"));
164 if (!elem.isNull()) {
165 bool ok = false;
166 int lineCount = elem.text().toInt(&ok);
167 if (ok) {
168 result->add(Property::LineCount, lineCount);
169 }
170 }
171 }
172
173 QDomElement elem = docElem.firstChildElement(QStringLiteral("Application"));
174 if (!elem.isNull()) {
175 QString app = elem.text();
176 if (!app.isEmpty()) {
177 result->add(Property::Generator, app);
178 }
179 }
180 }
181
182 //
183 // Plain Text
184 //
185 bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText);
186
187 if (rootEntries.contains(QStringLiteral("word"))) {
188 result->addType(Type::Document);
189
190 if (!extractPlainText) {
191 return;
192 }
193
194 const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word"));
195 if (!wordEntry->isDirectory()) {
196 qWarning() << "Invalid document structure (word is not a directory)";
197 return;
198 }
199
200 const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry);
201 const QStringList wordEntries = wordDirectory->entries();
202
203 if (wordEntries.contains(QStringLiteral("document.xml"))) {
204 const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml"));
205
206 if (file) {
207 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
208 extractTextWithTag(contentIODevice.get(), QStringLiteral("w:t"), result);
209 }
210 }
211 }
212
213 else if (rootEntries.contains(QStringLiteral("xl"))) {
214 result->addType(Type::Document);
215 result->addType(Type::Spreadsheet);
216
217 if (!extractPlainText) {
218 return;
219 }
220
221 const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl"));
222 if (!xlEntry->isDirectory()) {
223 qWarning() << "Invalid document structure (xl is not a directory)";
224 return;
225 }
226
227 const auto xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry);
228 // TODO: Read the sheets from worksheets/*.xml, and dereference all cells
229 // values in order
230 const KArchiveFile* file = xlDirectory->file(QStringLiteral("sharedStrings.xml"));
231 if (!file) {
232 return;
233 }
234 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
235 extractTextWithTag(contentIODevice.get(), QStringLiteral("t"), result);
236 }
237
238 else if (rootEntries.contains(QStringLiteral("ppt"))) {
239 result->addType(Type::Document);
240 result->addType(Type::Presentation);
241
242 if (!extractPlainText) {
243 return;
244 }
245
246 const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt"));
247 if (!pptEntry->isDirectory()) {
248 qWarning() << "Invalid document structure (ppt is not a directory)";
249 return;
250 }
251
252 const auto pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry);
253 const auto slidesEntry = pptDirectory->entry(QStringLiteral("slides"));
254 if (!slidesEntry || !slidesEntry->isDirectory()) {
255 return;
256 }
257
258 const auto slidesDirectory = dynamic_cast<const KArchiveDirectory*>(slidesEntry);
259 QStringList entries = slidesDirectory->entries();
260 // TODO: Read the actual order from presentation.xml, and follow the
261 // references in ppt/_rels/presentation.xml.rel
262 std::sort(entries.begin(), entries.end());
263 for (const QString & entryName : std::as_const(entries)) {
264 const KArchiveFile* file = slidesDirectory->file(entryName);
265 if (!file) {
266 continue;
267 }
268 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
269 extractTextWithTag(contentIODevice.get(), QStringLiteral("a:t"), result);
270 }
271 }
272}
273
274void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result)
275{
276 QXmlStreamReader xml(device);
277
278 while (!xml.atEnd()) {
279 xml.readNext();
280 if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) {
281 QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements);
282
283 if (!str.isEmpty()) {
284 result->append(str);
285 }
286 }
287
288 if (xml.isEndDocument() || xml.hasError()) {
289 break;
290 }
291 }
292}
293
294#include "moc_office2007extractor.cpp"
QStringList entries() const
const KArchiveEntry * entry(const QString &name) const
const KArchiveFile * file(const QString &name) const
virtual bool isDirectory() const
virtual QIODevice * createDevice() const
virtual QByteArray data() const
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input url which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString inputMimetype() const
The input mimetype.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
KCALUTILS_EXPORT QString mimeType()
bool isNull() const
QString text() const
QDomElement firstChildElement(const QString &tagName, const QString &namespaceURI) const
bool isNull() const
iterator begin()
iterator end()
bool isEmpty() const
int toInt(bool *ok, int base) const
bool contains(QLatin1StringView str, Qt::CaseSensitivity cs) const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:17:54 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.