KFileMetaData

odfextractor.cpp
1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3 SPDX-FileCopyrightText: 2012 Jörg Ehrichs <joerg.ehrichs@gmx.de>
4 SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org>
5
6 SPDX-License-Identifier: LGPL-2.1-or-later
7*/
8
9
10#include "odfextractor.h"
11#include <memory>
12
13#include <KZip>
14
15#include <QDebug>
16#include <QDomDocument>
17#include <QFile>
18#include <QXmlStreamReader>
19
20namespace {
21
22inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); }
23inline QString metaNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); }
24inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); }
25inline QString bodyTag() { return QStringLiteral("body"); }
26
27QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName)
28{
29 for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) {
30 if (e.localName() == localName && e.namespaceURI() == nsURI) {
31 return e;
32 }
33 }
34
35 return QDomElement();
36}
37
38const QStringList supportedMimeTypes = {
39 QStringLiteral("application/vnd.oasis.opendocument.text"),
40 QStringLiteral("application/vnd.oasis.opendocument.text-template"),
41 QStringLiteral("application/vnd.oasis.opendocument.text-master"),
42 QStringLiteral("application/vnd.oasis.opendocument.text-master-template"),
43 QStringLiteral("application/vnd.oasis.opendocument.text-flat-xml"),
44 QStringLiteral("application/vnd.oasis.opendocument.presentation"),
45 QStringLiteral("application/vnd.oasis.opendocument.presentation-template"),
46 QStringLiteral("application/vnd.oasis.opendocument.presentation-flat-xml"),
47 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"),
48 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-template"),
49 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-flat-xml"),
50 QStringLiteral("application/vnd.oasis.opendocument.graphics"),
51 QStringLiteral("application/vnd.oasis.opendocument.graphics-template"),
52 QStringLiteral("application/vnd.oasis.opendocument.graphics-flat-xml"),
53};
54
55}
56
57using namespace KFileMetaData;
58
59OdfExtractor::OdfExtractor(QObject* parent)
60 : ExtractorPlugin(parent)
61{
62
63}
64
65QStringList OdfExtractor::mimetypes() const
66{
67 return supportedMimeTypes;
68}
69
70void OdfExtractor::extract(ExtractionResult* result)
71{
72 if (result->inputMimetype().endsWith(QLatin1String("-flat-xml"))) {
73 QFile file(result->inputUrl());
74 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
75 return;
76 }
77
78 result->addType(Type::Document);
79 if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-flat-xml")) {
81 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-flat-xml")) {
83 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-flat-xml")) {
84 result->addType(Type::Image);
85 }
86
87 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
88 parseMetaData(QStringLiteral("document"), file.readAll(), result);
89 }
90
91 if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
92 file.seek(0);
93 extractPlainText(&file, result);
94 }
95
96 return;
97 }
98
99 KZip zip(result->inputUrl());
100 if (!zip.open(QIODevice::ReadOnly)) {
101 qWarning() << "Document is not a valid ZIP archive";
102 return;
103 }
104
105 const KArchiveDirectory* directory = zip.directory();
106 if (!directory) {
107 qWarning() << "Invalid document structure (main directory is missing)";
108 return;
109 }
110
111 // we need a meta xml file in the archive!
112 const auto metaXml = directory->file(QStringLiteral("meta.xml"));
113 if (!metaXml) {
114 qWarning() << "Invalid document structure (meta.xml is missing)";
115 return;
116 }
117
118 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
119 parseMetaData(QStringLiteral("document-meta"), metaXml->data(), result);
120 }
121
122 result->addType(Type::Document);
123 if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) ||
124 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-template"))) {
126 }
127 else if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) ||
128 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-template"))) {
129 result->addType(Type::Spreadsheet);
130 }
131 else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics") ||
132 result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-template")) {
133 result->addType(Type::Image);
134 }
135
136 if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
137 return;
138 }
139
140 // for content indexing, we need content xml file
141 const auto contentXml = directory->file(QStringLiteral("content.xml"));
142 if (!contentXml) {
143 qWarning() << "Invalid document structure (content.xml is missing)";
144 return;
145 }
146
147 std::unique_ptr<QIODevice> contentIODevice{contentXml->createDevice()};
148 extractPlainText(contentIODevice.get(), result);
149}
150
151void OdfExtractor::parseMetaData(const QString &documentElementId, const QByteArray &data, ExtractionResult *result)
152{
153 QDomDocument metaData(QStringLiteral("metaData"));
154 metaData.setContent(data, true);
155
156 // parse metadata ...
158 officeNS(), documentElementId),
159 officeNS(), QStringLiteral("meta"));
160
161 QDomNode n = meta.firstChild();
162 while (!n.isNull()) {
163 QDomElement e = n.toElement();
164 if (!e.isNull()) {
165 const QString namespaceURI = e.namespaceURI();
166 const QString localName = e.localName();
167
168 // Dublin Core
169 if (namespaceURI == dcNS()) {
170 if (localName == QLatin1String("description")) {
171 result->add(Property::Description, e.text());
172 } else if (localName == QLatin1String("subject")) {
173 result->add(Property::Subject, e.text());
174 } else if (localName == QLatin1String("title")) {
175 result->add(Property::Title, e.text());
176 } else if (localName == QLatin1String("creator")) {
177 result->add(Property::Author, e.text());
178 } else if (localName == QLatin1String("language")) {
179 result->add(Property::Language, e.text());
180 }
181 }
182 // Meta Properties
183 else if (namespaceURI == metaNS()) {
184 if (localName == QLatin1String("document-statistic")) {
185 bool ok = false;
186 int pageCount = e.attributeNS(metaNS(), QStringLiteral("page-count")).toInt(&ok);
187 if (ok) {
188 result->add(Property::PageCount, pageCount);
189 }
190
191 int wordCount = e.attributeNS(metaNS(), QStringLiteral("word-count")).toInt(&ok);
192 if (ok) {
193 result->add(Property::WordCount, wordCount);
194 }
195 } else if (localName == QLatin1String("keyword")) {
196 QString keywords = e.text();
197 result->add(Property::Keywords, keywords);
198 } else if (localName == QLatin1String("generator")) {
199 result->add(Property::Generator, e.text());
200 } else if (localName == QLatin1String("creation-date")) {
202 if (!dt.isNull()) {
203 result->add(Property::CreationDate, dt);
204 }
205 }
206 }
207 }
208 n = n.nextSibling();
209 }
210}
211
212void OdfExtractor::extractPlainText(QIODevice *device, ExtractionResult *result)
213{
214 bool inOfficeBody = false;
215
216 QXmlStreamReader xml(device);
217 while (!xml.atEnd()) {
218 xml.readNext();
219
220 if (xml.isStartElement() && !inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
221 inOfficeBody = true;
222 } else if (xml.isEndElement() && inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
223 break;
224 }
225
226 if (inOfficeBody && xml.isCharacters() && !xml.isWhitespace()) {
227 const QString str = xml.text().toString();
228 result->append(str);
229 }
230
231 if (xml.hasError() || xml.isEndDocument()) {
232 break;
233 }
234 }
235}
236
237#include "moc_odfextractor.cpp"
const KArchiveFile * file(const QString &name) const
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString inputMimetype() const
The input MIME type.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
QDomElement KPIMKDAV2_EXPORT firstChildElementNS(const QDomElement &parent, const QString &namespaceUri, const QString &tagName)
@ WordCount
The number of words in a document.
Definition properties.h:145
@ Subject
Refers to the subject of the file.
Definition properties.h:127
@ Title
Refers to the Title of the content of the file.
Definition properties.h:121
@ Author
The Author field indicates the primary creator of a document.
Definition properties.h:114
@ Description
Represents the description stored in the file.
Definition properties.h:351
@ Generator
Refers to the Application used to create this file.
Definition properties.h:134
@ PageCount
The number of pages in a document.
Definition properties.h:139
@ CreationDate
The date the content of the file was created.
Definition properties.h:177
@ Language
The language the document is written in.
Definition properties.h:159
@ Keywords
The keywords used to represent the document.
Definition properties.h:183
@ Document
Any file which counts as a document.
Definition types.h:63
@ Image
Any Image file.
Definition types.h:56
@ Presentation
A Presentation file.
Definition types.h:75
@ Spreadsheet
A SpreadSheet file.
Definition types.h:69
The KFileMetaData namespace.
bool isNull() const
QString attributeNS(const QString &nsURI, const QString &localName, const QString &defValue) const
QString text() const
QDomNode firstChild() const
QDomElement firstChildElement(const QString &tagName, const QString &namespaceURI) const
bool isNull() const
QString localName() const
QString namespaceURI() const
QDomNode nextSibling() const
QDomElement toElement() const
bool endsWith(QChar c, Qt::CaseSensitivity cs) const
int toInt(bool *ok, int base) const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:48:11 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.