• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

Nepomuk-Core

  • sources
  • kde-4.12
  • kdelibs
  • nepomuk-core
  • services
  • fileindexer
  • indexer
office2007extractor.cpp
Go to the documentation of this file.
1 /*
2  <one line to give the library's name and an idea of what it does.>
3  Copyright (C) 2013 Vishesh Handa <me@vhanda.in>
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License as published by the Free Software Foundation; either
8  version 2.1 of the License, or (at your option) any later version.
9 
10  This library is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with this library; if not, write to the Free Software
17  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19 
20 
21 #include "office2007extractor.h"
22 
23 #include "nco.h"
24 #include "nie.h"
25 #include "nfo.h"
26 
27 #include <KDE/KDebug>
28 #include <KDE/KZip>
29 
30 #include <QtXml/QDomDocument>
31 #include <QtXml/QXmlStreamReader>
32 #include <Soprano/Vocabulary/NAO>
33 
34 using namespace Soprano::Vocabulary;
35 using namespace Nepomuk2::Vocabulary;
36 using namespace Nepomuk2;
37 
38 Office2007Extractor::Office2007Extractor(QObject* parent, const QVariantList& ): ExtractorPlugin(parent)
39 {
40 
41 }
42 
43 
44 QStringList Office2007Extractor::mimetypes()
45 {
46  QStringList list;
47  list << QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
48  << QLatin1String("application/vnd.openxmlformats-officedocument.presentationml.presentation")
49  << QLatin1String("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
50 
51  return list;
52 }
53 
54 SimpleResourceGraph Office2007Extractor::extract(const QUrl& resUri, const QUrl& fileUrl, const QString& mimeType)
55 {
56  Q_UNUSED(mimeType);
57 
58  KZip zip(fileUrl.toLocalFile());
59  if (!zip.open(QIODevice::ReadOnly)) {
60  qWarning() << "Document is not a valid ZIP archive";
61  return SimpleResourceGraph();
62  }
63 
64  const KArchiveDirectory *rootDir = zip.directory();
65  if (!rootDir) {
66  qWarning() << "Invalid document structure (main directory is missing)";
67  return SimpleResourceGraph();
68  }
69 
70  const QStringList rootEntries = rootDir->entries();
71  if (!rootEntries.contains("docProps")) {
72  qWarning() << "Invalid document structure (docProps is missing)";
73  return SimpleResourceGraph();
74  }
75 
76  const KArchiveEntry* docPropEntry = rootDir->entry("docProps");
77  if( !docPropEntry->isDirectory() ) {
78  qWarning() << "Invalid document structure (docProps is not a directory)";
79  return SimpleResourceGraph();
80  }
81 
82  SimpleResourceGraph graph;
83  SimpleResource fileRes( resUri );
84 
85  const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>( docPropEntry );
86  const QStringList docPropsEntries = docPropDirectory->entries();
87 
88  if( docPropsEntries.contains("core.xml") ) {
89  QDomDocument coreDoc("core");
90  const KArchiveFile *file = static_cast<const KArchiveFile*>(docPropDirectory->entry("core.xml"));
91  coreDoc.setContent(file->data());
92 
93  QDomElement docElem = coreDoc.documentElement();
94 
95  QDomElement elem = docElem.firstChildElement("dc:description");
96  if( !elem.isNull() ) {
97  QString str = elem.text();
98  if( !str.isEmpty() ) {
99  fileRes.setProperty( NAO::description(), str );
100  }
101  }
102 
103  elem = docElem.firstChildElement("dc:subject");
104  if( !elem.isNull() ) {
105  QString str = elem.text();
106  if( !str.isEmpty() ) {
107  fileRes.setProperty( NIE::subject(), str );
108  }
109  }
110 
111  elem = docElem.firstChildElement("dc:title");
112  if( !elem.isNull() ) {
113  QString str = elem.text();
114  if( !str.isEmpty() ) {
115  fileRes.setProperty( NIE::title(), str );
116  }
117  }
118 
119  elem = docElem.firstChildElement("dc:creator");
120  if( !elem.isNull() ) {
121  QString str = elem.text();
122  if( !str.isEmpty() ) {
123  SimpleResource creator;
124  creator.addType( NCO::Contact() );
125  creator.addProperty( NCO::fullname(), str );
126  graph << creator;
127 
128  fileRes.setProperty( NCO::creator(), creator );
129  }
130  }
131 
132  elem = docElem.firstChildElement("dc:langauge");
133  if( !elem.isNull() ) {
134  QString str = elem.text();
135  if( !str.isEmpty() ) {
136  fileRes.setProperty( NIE::language(), str );
137  }
138  }
139  }
140 
141  if( docPropsEntries.contains("app.xml") ) {
142  QDomDocument appDoc("app");
143  const KArchiveFile *file = static_cast<const KArchiveFile*>(docPropDirectory->entry("app.xml"));
144  appDoc.setContent(file->data());
145 
146  QDomElement docElem = appDoc.documentElement();
147 
148  // According to the ontologies only Documents can have a wordCount and pageCount
149  if( mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document") ) {
150  QDomElement elem = docElem.firstChildElement("Pages");
151  if( !elem.isNull() ) {
152  bool ok = false;
153  int pageCount = elem.text().toInt(&ok);
154  if( ok ) {
155  fileRes.setProperty( NFO::pageCount(), pageCount );
156  }
157  }
158 
159  elem = docElem.firstChildElement("Words");
160  if( !elem.isNull() ) {
161  bool ok = false;
162  int wordCount = elem.text().toInt(&ok);
163  if( ok ) {
164  fileRes.setProperty( NFO::wordCount(), wordCount );
165  }
166  }
167  }
168 
169  QDomElement elem = docElem.firstChildElement("Application");
170  if( !elem.isNull() ) {
171  QString app = elem.text();
172  if( !app.isEmpty() ) {
173  fileRes.setProperty( NIE::generator(), app );
174  }
175  }
176  }
177 
178 
179  if (rootEntries.contains("word")) {
180  const KArchiveEntry* wordEntry = rootDir->entry("word");
181  if( !wordEntry->isDirectory() ) {
182  qWarning() << "Invalid document structure (word is not a directory)";
183  return SimpleResourceGraph();
184  }
185 
186  const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>( wordEntry );
187  const QStringList wordEntries = wordDirectory->entries();
188 
189  if( wordEntries.contains("document.xml") ) {
190  QDomDocument appDoc("document");
191  const KArchiveFile *file = static_cast<const KArchiveFile*>(wordDirectory->entry("document.xml"));
192 
193  QString plainText;
194  QTextStream stream(&plainText);
195 
196  extractTextWithTag(file->createDevice(), QLatin1String("w:t"), stream);
197  if( !plainText.isEmpty() )
198  fileRes.addProperty( NIE::plainTextContent(), plainText );
199  }
200  }
201 
202  else if( rootEntries.contains("xl") ) {
203  const KArchiveEntry* xlEntry = rootDir->entry("xl");
204  if( !xlEntry->isDirectory() ) {
205  qWarning() << "Invalid document structure (xl is not a directory)";
206  return SimpleResourceGraph();
207  }
208 
209  QString plainText;
210  QTextStream stream(&plainText);
211 
212  const KArchiveDirectory* xlDirectory = dynamic_cast<const KArchiveDirectory*>( xlEntry );
213  extractTextFromFiles( xlDirectory, stream );
214  if( !plainText.isEmpty() )
215  fileRes.addProperty( NIE::plainTextContent(), plainText );
216  }
217 
218  else if( rootEntries.contains("ppt") ) {
219  const KArchiveEntry* pptEntry = rootDir->entry("ppt");
220  if( !pptEntry->isDirectory() ) {
221  qWarning() << "Invalid document structure (ppt is not a directory)";
222  return SimpleResourceGraph();
223  }
224 
225  QString plainText;
226  QTextStream stream(&plainText);
227 
228  const KArchiveDirectory* pptDirectory = dynamic_cast<const KArchiveDirectory*>( pptEntry );
229  extractTextFromFiles( pptDirectory, stream );
230  if( !plainText.isEmpty() )
231  fileRes.addProperty( NIE::plainTextContent(), plainText );
232  }
233 
234  if( fileRes.properties().isEmpty() )
235  return SimpleResourceGraph();
236 
237  graph << fileRes;
238  return graph;
239 }
240 
241 void Office2007Extractor::extractAllText(QIODevice* device, QTextStream& stream)
242 {
243  QXmlStreamReader xml( device );
244 
245  while( !xml.atEnd() ) {
246  if( stream.string()->size() >= maxPlainTextSize() )
247  return;
248 
249  xml.readNext();
250  if( xml.isCharacters() ) {
251  QString str = xml.text().toString();
252  stream << str;
253 
254  if( !str.at(str.length()-1).isSpace() )
255  stream << QLatin1Char(' ');
256  }
257 
258  if( xml.isEndDocument() || xml.hasError() )
259  break;
260  }
261 }
262 
263 void Office2007Extractor::extractTextFromFiles(const KArchiveDirectory* archiveDir, QTextStream& stream)
264 {
265  const QStringList entries = archiveDir->entries();
266  foreach(const QString& entryName, entries) {
267  const KArchiveEntry* entry = archiveDir->entry(entryName);
268  if( entry->isDirectory() ) {
269  const KArchiveDirectory* subDir = dynamic_cast<const KArchiveDirectory*>(entry);
270  extractTextFromFiles( subDir, stream );
271  continue;
272  }
273 
274  if( stream.string()->size() >= maxPlainTextSize() )
275  return;
276 
277  if( !entryName.endsWith(".xml") )
278  continue;
279 
280  const KArchiveFile* file = static_cast<const KArchiveFile*>(entry);
281  extractAllText( file->createDevice(), stream );
282  }
283 }
284 
285 void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, QTextStream& stream)
286 {
287  QXmlStreamReader xml( device );
288  int size = 0;
289 
290  while( !xml.atEnd() ) {
291  if( size >= maxPlainTextSize() )
292  break;
293 
294  xml.readNext();
295  if( xml.qualifiedName().startsWith(tag) && xml.isStartElement() ) {
296  QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
297 
298  if( !str.isEmpty() ) {
299  stream << str;
300  size += str.size();
301 
302  if( !str.at(str.length()-1).isSpace() )
303  stream << QLatin1Char(' ');
304  }
305  }
306 
307  if( xml.isEndDocument() || xml.hasError() )
308  break;
309  }
310 }
311 
312 
// Exports Nepomuk2::Office2007Extractor as a Nepomuk file extractor, using "nepomukoffice2007extractor" as the library name.
313 NEPOMUK_EXPORT_EXTRACTOR( Nepomuk2::Office2007Extractor, "nepomukoffice2007extractor" )
Nepomuk2::SimpleResource::setProperty
void setProperty(const QUrl &property, const QVariant &value)
Set a property overwriting existing values.
Definition: simpleresource.cpp:186
Nepomuk2::ExtractorPlugin
The ExtractorPlugin is the base class for all file metadata extractors.
Definition: extractorplugin.h:60
Nepomuk2::SimpleResource
Represents a snapshot of one Nepomuk resource.
Definition: simpleresource.h:46
Nepomuk2::SimpleResource::properties
PropertyHash properties() const
Definition: simpleresource.cpp:155
QObject
Nepomuk2::SimpleResource::addProperty
void addProperty(const QUrl &property, const QVariant &value)
Add a property.
Definition: simpleresource.cpp:206
Nepomuk2::SimpleResourceGraph
Definition: simpleresourcegraph.h:48
Nepomuk2::Office2007Extractor
Definition: office2007extractor.h:30
NEPOMUK_EXPORT_EXTRACTOR
#define NEPOMUK_EXPORT_EXTRACTOR(classname, libname)
Export a Nepomuk file extractor.
Definition: extractorplugin.h:163
Nepomuk2::Office2007Extractor::mimetypes
virtual QStringList mimetypes()
Provide a list of mimetypes which are supported by this plugin.
Definition: office2007extractor.cpp:44
Nepomuk2::ExtractorPlugin::maxPlainTextSize
static int maxPlainTextSize()
Virtuoso does not support streaming operators, and does not accept queries above a certain size...
Definition: extractorplugin.cpp:124
Nepomuk2::SimpleResource::addType
void addType(const QUrl &type)
A convenience method which adds a property of type rdf:type.
Definition: simpleresource.cpp:257
Nepomuk2::Office2007Extractor::extract
virtual SimpleResourceGraph extract(const QUrl &resUri, const QUrl &fileUrl, const QString &mimeType)
The main function of the plugin that is responsible for extracting the data from the file url and ret...
Definition: office2007extractor.cpp:54
office2007extractor.h
This file is part of the KDE documentation.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:48:08 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

Nepomuk-Core

Skip menu "Nepomuk-Core"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  • kjsembed
  •   WTF
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Nepomuk-Core
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal