KFileMetaData

office2007extractor.cpp
1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3
4 SPDX-License-Identifier: LGPL-2.1-or-later
5*/
6
7
8#include "office2007extractor.h"
9
10#include "dublincoreextractor.h"
11#include <memory>
12
13#include <KZip>
14
15#include <QDebug>
16#include <QDomDocument>
17#include <QXmlStreamReader>
18
19using namespace KFileMetaData;
20
21namespace {
22inline QString cpNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/metadata/core-properties"); }
23} // namespace
24
25Office2007Extractor::Office2007Extractor(QObject* parent)
26 : ExtractorPlugin(parent)
27{
28
29}
30
31const QStringList supportedMimeTypes = {
32 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
33 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
34 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
35 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"),
36 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
37 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"),
38 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
39 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
40};
41
42QStringList Office2007Extractor::mimetypes() const
43{
44 return supportedMimeTypes;
45}
46
47void Office2007Extractor::extract(ExtractionResult* result)
48{
49 KZip zip(result->inputUrl());
50 if (!zip.open(QIODevice::ReadOnly)) {
51 qWarning() << "Document is not a valid ZIP archive";
52 return;
53 }
54
55 const KArchiveDirectory* rootDir = zip.directory();
56 if (!rootDir) {
57 qWarning() << "Invalid document structure (main directory is missing)";
58 return;
59 }
60
61 const QStringList rootEntries = rootDir->entries();
62 if (!rootEntries.contains(QStringLiteral("docProps"))) {
63 qWarning() << "Invalid document structure (docProps is missing)";
64 return;
65 }
66
67 const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps"));
68 if (!docPropEntry->isDirectory()) {
69 qWarning() << "Invalid document structure (docProps is not a directory)";
70 return;
71 }
72
73 const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>(docPropEntry);
74
75 const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData;
76
77 const KArchiveFile* file = docPropDirectory->file(QStringLiteral("core.xml"));
78 if (extractMetaData && file) {
79 QDomDocument coreDoc(QStringLiteral("core"));
80 coreDoc.setContent(file->data(), true);
81
82 QDomElement cpElem = coreDoc.documentElement();
83
84 if (!cpElem.isNull() && cpElem.namespaceURI() == cpNS()) {
85 DublinCoreExtractor::extract(result, cpElem);
86 }
87
88 auto elem = cpElem.firstChildElement(QStringLiteral("keywords"));
89 if (!elem.isNull() && elem.namespaceURI() == cpNS()) {
90 QString str = elem.text();
91 if (!str.isEmpty()) {
92 result->add(Property::Keywords, str);
93 }
94 }
95 }
96
97 file = docPropDirectory->file(QStringLiteral("app.xml"));
98 if (extractMetaData && file) {
99 QDomDocument appDoc(QStringLiteral("app"));
100 appDoc.setContent(file->data());
101
102 QDomElement docElem = appDoc.documentElement();
103
104 const QString mimeType = result->inputMimetype();
105 if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
106 QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages"));
107 if (!elem.isNull()) {
108 bool ok = false;
109 int pageCount = elem.text().toInt(&ok);
110 if (ok) {
111 result->add(Property::PageCount, pageCount);
112 }
113 }
114
115 elem = docElem.firstChildElement(QStringLiteral("Words"));
116 if (!elem.isNull()) {
117 bool ok = false;
118 int wordCount = elem.text().toInt(&ok);
119 if (ok) {
120 result->add(Property::WordCount, wordCount);
121 }
122 }
123
124 elem = docElem.firstChildElement(QStringLiteral("Lines"));
125 if (!elem.isNull()) {
126 bool ok = false;
127 int lineCount = elem.text().toInt(&ok);
128 if (ok) {
129 result->add(Property::LineCount, lineCount);
130 }
131 }
132 }
133
134 QDomElement elem = docElem.firstChildElement(QStringLiteral("Application"));
135 if (!elem.isNull()) {
136 QString app = elem.text();
137 if (!app.isEmpty()) {
138 result->add(Property::Generator, app);
139 }
140 }
141 }
142
143 //
144 // Plain Text
145 //
146 bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText);
147
148 if (rootEntries.contains(QStringLiteral("word"))) {
149 result->addType(Type::Document);
150
151 if (!extractPlainText) {
152 return;
153 }
154
155 const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word"));
156 if (!wordEntry->isDirectory()) {
157 qWarning() << "Invalid document structure (word is not a directory)";
158 return;
159 }
160
161 const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry);
162 const QStringList wordEntries = wordDirectory->entries();
163
164 if (wordEntries.contains(QStringLiteral("document.xml"))) {
165 const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml"));
166
167 if (file) {
168 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
169 extractTextWithTag(contentIODevice.get(), QStringLiteral("w:t"), result);
170 }
171 }
172 }
173
174 else if (rootEntries.contains(QStringLiteral("xl"))) {
175 result->addType(Type::Document);
176 result->addType(Type::Spreadsheet);
177
178 if (!extractPlainText) {
179 return;
180 }
181
182 const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl"));
183 if (!xlEntry->isDirectory()) {
184 qWarning() << "Invalid document structure (xl is not a directory)";
185 return;
186 }
187
188 const auto xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry);
189 // TODO: Read the sheets from worksheets/*.xml, and dereference all cells
190 // values in order
191 const KArchiveFile* file = xlDirectory->file(QStringLiteral("sharedStrings.xml"));
192 if (!file) {
193 return;
194 }
195 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
196 extractTextWithTag(contentIODevice.get(), QStringLiteral("t"), result);
197 }
198
199 else if (rootEntries.contains(QStringLiteral("ppt"))) {
200 result->addType(Type::Document);
202
203 if (!extractPlainText) {
204 return;
205 }
206
207 const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt"));
208 if (!pptEntry->isDirectory()) {
209 qWarning() << "Invalid document structure (ppt is not a directory)";
210 return;
211 }
212
213 const auto pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry);
214 const auto slidesEntry = pptDirectory->entry(QStringLiteral("slides"));
215 if (!slidesEntry || !slidesEntry->isDirectory()) {
216 return;
217 }
218
219 const auto slidesDirectory = dynamic_cast<const KArchiveDirectory*>(slidesEntry);
220 QStringList entries = slidesDirectory->entries();
221 // TODO: Read the actual order from presentation.xml, and follow the
222 // references in ppt/_rels/presentation.xml.rel
223 std::sort(entries.begin(), entries.end());
224 for (const QString & entryName : std::as_const(entries)) {
225 const KArchiveFile* file = slidesDirectory->file(entryName);
226 if (!file) {
227 continue;
228 }
229 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
230 extractTextWithTag(contentIODevice.get(), QStringLiteral("a:t"), result);
231 }
232 }
233}
234
235void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result)
236{
237 QXmlStreamReader xml(device);
238
239 while (!xml.atEnd()) {
240 xml.readNext();
241 if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) {
242 QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements);
243
244 if (!str.isEmpty()) {
245 result->append(str);
246 }
247 }
248
249 if (xml.isEndDocument() || xml.hasError()) {
250 break;
251 }
252 }
253}
254
255#include "moc_office2007extractor.cpp"
QStringList entries() const
const KArchiveEntry * entry(const QString &name) const
const KArchiveFile * file(const QString &name) const
virtual bool isDirectory() const
virtual QIODevice * createDevice() const
virtual QByteArray data() const
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString inputMimetype() const
The input MIME type.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
KCALUTILS_EXPORT QString mimeType()
@ WordCount
The number of words in a document.
Definition properties.h:145
@ Generator
Refers to the Application used to create this file.
Definition properties.h:134
@ PageCount
The number of pages in a document.
Definition properties.h:139
@ LineCount
The number of lines in a document.
Definition properties.h:151
@ Keywords
The keywords used to represent the document.
Definition properties.h:183
@ Document
Any file which counts as a document.
Definition types.h:63
@ Presentation
A Presentation file.
Definition types.h:75
@ Spreadsheet
A SpreadSheet file.
Definition types.h:69
The KFileMetaData namespace.
QString text() const
QDomElement firstChildElement(const QString &tagName, const QString &namespaceURI) const
bool isNull() const
QString namespaceURI() const
iterator begin()
iterator end()
bool isEmpty() const
int toInt(bool *ok, int base) const
bool contains(QLatin1StringView str, Qt::CaseSensitivity cs) const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:48:11 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.