KFileMetaData

odfextractor.cpp
1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3 SPDX-FileCopyrightText: 2012 Jörg Ehrichs <joerg.ehrichs@gmx.de>
4 SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org>
5
6 SPDX-License-Identifier: LGPL-2.1-or-later
7*/
8
9
10#include "datetimeparser_p.h"
11#include "odfextractor.h"
12#include <memory>
13
14#include <KZip>
15
16#include <QDebug>
17#include <QDomDocument>
18#include <QFile>
19#include <QXmlStreamReader>
20
21namespace {
22
23inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); }
24inline QString metaNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); }
25inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); }
26inline QString bodyTag() { return QStringLiteral("body"); }
27
28QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName)
29{
30 for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) {
31 if (e.localName() == localName && e.namespaceURI() == nsURI) {
32 return e;
33 }
34 }
35
36 return QDomElement();
37}
38
39const QStringList supportedMimeTypes = {
40 QStringLiteral("application/vnd.oasis.opendocument.text"),
41 QStringLiteral("application/vnd.oasis.opendocument.text-template"),
42 QStringLiteral("application/vnd.oasis.opendocument.text-master"),
43 QStringLiteral("application/vnd.oasis.opendocument.text-master-template"),
44 QStringLiteral("application/vnd.oasis.opendocument.text-flat-xml"),
45 QStringLiteral("application/vnd.oasis.opendocument.presentation"),
46 QStringLiteral("application/vnd.oasis.opendocument.presentation-template"),
47 QStringLiteral("application/vnd.oasis.opendocument.presentation-flat-xml"),
48 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"),
49 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-template"),
50 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-flat-xml"),
51 QStringLiteral("application/vnd.oasis.opendocument.graphics"),
52 QStringLiteral("application/vnd.oasis.opendocument.graphics-template"),
53 QStringLiteral("application/vnd.oasis.opendocument.graphics-flat-xml"),
54};
55
56}
57
58using namespace KFileMetaData;
59
60OdfExtractor::OdfExtractor(QObject* parent)
61 : ExtractorPlugin(parent)
62{
63
64}
65
66QStringList OdfExtractor::mimetypes() const
67{
68 return supportedMimeTypes;
69}
70
71void OdfExtractor::extract(ExtractionResult* result)
72{
73 if (result->inputMimetype().endsWith(QLatin1String("-flat-xml"))) {
74 QFile file(result->inputUrl());
75 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
76 return;
77 }
78
79 result->addType(Type::Document);
80 if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-flat-xml")) {
82 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-flat-xml")) {
84 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-flat-xml")) {
85 result->addType(Type::Image);
86 }
87
88 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
89 parseMetaData(QStringLiteral("document"), file.readAll(), result);
90 }
91
92 if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
93 file.seek(0);
94 extractPlainText(&file, result);
95 }
96
97 return;
98 }
99
100 KZip zip(result->inputUrl());
101 if (!zip.open(QIODevice::ReadOnly)) {
102 qWarning() << "Document is not a valid ZIP archive";
103 return;
104 }
105
106 const KArchiveDirectory* directory = zip.directory();
107 if (!directory) {
108 qWarning() << "Invalid document structure (main directory is missing)";
109 return;
110 }
111
112 // we need a meta xml file in the archive!
113 const auto metaXml = directory->file(QStringLiteral("meta.xml"));
114 if (!metaXml) {
115 qWarning() << "Invalid document structure (meta.xml is missing)";
116 return;
117 }
118
119 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
120 parseMetaData(QStringLiteral("document-meta"), metaXml->data(), result);
121 }
122
123 result->addType(Type::Document);
124 if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) ||
125 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-template"))) {
127 }
128 else if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) ||
129 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-template"))) {
130 result->addType(Type::Spreadsheet);
131 }
132 else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics") ||
133 result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-template")) {
134 result->addType(Type::Image);
135 }
136
137 if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
138 return;
139 }
140
141 // for content indexing, we need content xml file
142 const auto contentXml = directory->file(QStringLiteral("content.xml"));
143 if (!contentXml) {
144 qWarning() << "Invalid document structure (content.xml is missing)";
145 return;
146 }
147
148 std::unique_ptr<QIODevice> contentIODevice{contentXml->createDevice()};
149 extractPlainText(contentIODevice.get(), result);
150}
151
152void OdfExtractor::parseMetaData(const QString &documentElementId, const QByteArray &data, ExtractionResult *result)
153{
154 QDomDocument metaData(QStringLiteral("metaData"));
155 metaData.setContent(data, true);
156
157 // parse metadata ...
158 QDomElement meta = firstChildElementNS(firstChildElementNS(metaData,
159 officeNS(), documentElementId),
160 officeNS(), QStringLiteral("meta"));
161
162 QDomNode n = meta.firstChild();
163 while (!n.isNull()) {
164 QDomElement e = n.toElement();
165 if (!e.isNull()) {
166 const QString namespaceURI = e.namespaceURI();
167 const QString localName = e.localName();
168
169 // Dublin Core
170 if (namespaceURI == dcNS()) {
171 if (localName == QLatin1String("description")) {
172 result->add(Property::Description, e.text());
173 } else if (localName == QLatin1String("subject")) {
174 result->add(Property::Subject, e.text());
175 } else if (localName == QLatin1String("title")) {
176 result->add(Property::Title, e.text());
177 } else if (localName == QLatin1String("creator")) {
178 result->add(Property::Author, e.text());
179 } else if (localName == QLatin1String("language")) {
180 result->add(Property::Language, e.text());
181 }
182 }
183 // Meta Properties
184 else if (namespaceURI == metaNS()) {
185 if (localName == QLatin1String("document-statistic")) {
186 bool ok = false;
187 int pageCount = e.attributeNS(metaNS(), QStringLiteral("page-count")).toInt(&ok);
188 if (ok) {
189 result->add(Property::PageCount, pageCount);
190 }
191
192 int wordCount = e.attributeNS(metaNS(), QStringLiteral("word-count")).toInt(&ok);
193 if (ok) {
194 result->add(Property::WordCount, wordCount);
195 }
196 } else if (localName == QLatin1String("keyword")) {
197 QString keywords = e.text();
198 result->add(Property::Keywords, keywords);
199 } else if (localName == QLatin1String("generator")) {
200 result->add(Property::Generator, e.text());
201 } else if (localName == QLatin1String("creation-date")) {
202 QDateTime dt = Parser::dateTimeFromString(e.text());
203 if (!dt.isNull()) {
204 result->add(Property::CreationDate, dt);
205 }
206 }
207 }
208 }
209 n = n.nextSibling();
210 }
211}
212
213void OdfExtractor::extractPlainText(QIODevice *device, ExtractionResult *result)
214{
215 bool inOfficeBody = false;
216
217 QXmlStreamReader xml(device);
218 while (!xml.atEnd()) {
219 xml.readNext();
220
221 if (xml.isStartElement() && !inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
222 inOfficeBody = true;
223 } else if (xml.isEndElement() && inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
224 break;
225 }
226
227 if (inOfficeBody && xml.isCharacters() && !xml.isWhitespace()) {
228 const QString str = xml.text().toString();
229 result->append(str);
230 }
231
232 if (xml.hasError() || xml.isEndDocument()) {
233 break;
234 }
235 }
236}
237
238#include "moc_odfextractor.cpp"
const KArchiveFile * file(const QString &name) const
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString inputMimetype() const
The input MIME type.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
QDomElement KPIMKDAV2_EXPORT firstChildElementNS(const QDomElement &parent, const QString &namespaceUri, const QString &tagName)
@ WordCount
The number of words in a document.
Definition properties.h:145
@ Subject
Refers to the subject of the file.
Definition properties.h:127
@ Title
Refers to the Title of the content of the file.
Definition properties.h:121
@ Author
The Author field indicates the primary creator of a document.
Definition properties.h:114
@ Description
Represents the description stored in the file.
Definition properties.h:351
@ Generator
Refers to the Application used to create this file.
Definition properties.h:134
@ PageCount
The number of pages in a document.
Definition properties.h:139
@ CreationDate
The date the content of the file was created.
Definition properties.h:177
@ Language
The language the document is written in.
Definition properties.h:159
@ Keywords
The keywords used to represent the document.
Definition properties.h:183
@ Document
Any file which counts as a document.
Definition types.h:63
@ Image
Any Image file.
Definition types.h:56
@ Presentation
A Presentation file.
Definition types.h:75
@ Spreadsheet
A SpreadSheet file.
Definition types.h:69
The KFileMetaData namespace.
bool isNull() const const
QString attributeNS(const QString &nsURI, const QString &localName, const QString &defValue) const const
QString text() const const
QDomNode firstChild() const const
QDomElement firstChildElement(const QString &tagName, const QString &namespaceURI) const const
bool isNull() const const
QString localName() const const
QString namespaceURI() const const
QDomNode nextSibling() const const
QDomElement toElement() const const
bool endsWith(QChar c, Qt::CaseSensitivity cs) const const
int toInt(bool *ok, int base) const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Feb 21 2025 11:53:46 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.