• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

Nepomuk-Core

  • sources
  • kde-4.12
  • kdelibs
  • nepomuk-core
  • services
  • fileindexer
  • indexer
officeextractor.cpp
Go to the documentation of this file.
1 /*
2  This file is part of a Nepomuk File Extractor
3  Copyright (C) 2013 Denis Steckelmacher <steckdenis@yahoo.fr>
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License as published by the Free Software Foundation; either
8  version 2.1 of the License, or (at your option) any later version.
9 
10  This library is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with this library; if not, write to the Free Software
17  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19 
20 #include "officeextractor.h"
21 
22 #include <kstandarddirs.h>
23 #include "nie.h"
24 #include "nfo.h"
25 
26 #include <QtCore/QFile>
27 #include <QtCore/QProcess>
28 
29 using namespace Nepomuk2::Vocabulary;
30 
31 Nepomuk2::OfficeExtractor::OfficeExtractor(QObject *parent, const QVariantList &)
32 : ExtractorPlugin(parent)
33 {
34  // Find the executables of catdoc, catppt and xls2csv. If an executable cannot
35  // be found, indexing its corresponding MIME type will be disabled
36  findExe("application/msword", "catdoc", m_catdoc);
37  findExe("application/vnd.ms-excel", "xls2csv", m_xls2csv);
38  findExe("application/vnd.ms-powerpoint", "catppt", m_catppt);
39 }
40 
41 void Nepomuk2::OfficeExtractor::findExe(const QString &mimeType, const QString &name, QString &fullPath)
42 {
43  fullPath = KStandardDirs::findExe(name);
44 
45  if (!fullPath.isEmpty()) {
46  m_available_mime_types << mimeType;
47  }
48 }
49 
50 QStringList Nepomuk2::OfficeExtractor::mimetypes()
51 {
52  return m_available_mime_types;
53 }
54 
55 Nepomuk2::SimpleResourceGraph Nepomuk2::OfficeExtractor::extract(const QUrl &resUri,
56  const QUrl &fileUrl,
57  const QString &mimeType)
58 {
59  SimpleResource res(resUri);
60  QStringList args;
61  QString contents;
62 
63  args << QLatin1String("-s") << QLatin1String("cp1252"); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ?
64  args << QLatin1String("-d") << QLatin1String("utf8");
65 
66  if (mimeType == QLatin1String("application/msword")) {
67  res.addType(NFO::TextDocument());
68 
69  args << QLatin1String("-w");
70  contents = textFromFile(fileUrl, m_catdoc, args);
71 
72  // Now that we have the plain text content, count words, lines and characters
73  // (original code from plaintextextractor.cpp, authored by Vishesh Handa)
74  int characters = contents.length();
75  int lines = contents.count( QChar('\n') );
76  int words = contents.count( QRegExp("\\b\\w+\\b") );
77 
78  res.addProperty(NIE::plainTextContent(), contents);
79  res.addProperty(NFO::wordCount(), words);
80  res.addProperty(NFO::lineCount(), lines);
81  res.addProperty(NFO::characterCount(), characters);
82  } else if (mimeType == QLatin1String("application/vnd.ms-excel")) {
83  res.addType(NFO::Spreadsheet());
84 
85  args << QLatin1String("-c") << QLatin1String(" ");
86  args << QLatin1String("-b") << QLatin1String(" ");
87  args << QLatin1String("-q") << QLatin1String("0");
88  contents = textFromFile(fileUrl, m_xls2csv, args);
89  } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint")) {
90  res.addType(NFO::Presentation());
91 
92  contents = textFromFile(fileUrl, m_catppt, args);
93  }
94 
95  if (contents.isEmpty())
96  return SimpleResourceGraph();
97 
98  res.addProperty(NIE::plainTextContent(), contents);
99 
100  return SimpleResourceGraph() << res;
101 }
102 
103 QString Nepomuk2::OfficeExtractor::textFromFile(const QUrl &fileUrl, const QString &command, QStringList &arguments)
104 {
105  arguments << fileUrl.toLocalFile();
106 
107  // Start a process and read its standard output
108  QProcess process;
109 
110  process.setReadChannel(QProcess::StandardOutput);
111  process.start(command, arguments, QIODevice::ReadOnly);
112  process.waitForFinished();
113 
114  if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0)
115  return QString();
116  else
117  return QString::fromUtf8(process.readAll());
118 }
119 
// Export Nepomuk2::OfficeExtractor as a Nepomuk file extractor; the second
// macro argument is the library name.
120 NEPOMUK_EXPORT_EXTRACTOR(Nepomuk2::OfficeExtractor, "nepomukofficeextractor")
Nepomuk2::OfficeExtractor::mimetypes
virtual QStringList mimetypes()
Provide a list of mimetypes which are supported by this plugin.
Definition: officeextractor.cpp:50
Nepomuk2::ExtractorPlugin
The ExtractorPlugin is the base class for all file metadata extractors.
Definition: extractorplugin.h:60
Nepomuk2::OfficeExtractor::extract
virtual SimpleResourceGraph extract(const QUrl &resUri, const QUrl &fileUrl, const QString &mimeType)
The main function of the plugin that is responsible for extracting the data from the file url and returning a SimpleResourceGraph.
Definition: officeextractor.cpp:55
Nepomuk2::SimpleResource
Represents a snapshot of one Nepomuk resource.
Definition: simpleresource.h:46
QObject
Nepomuk2::SimpleResource::addProperty
void addProperty(const QUrl &property, const QVariant &value)
Add a property.
Definition: simpleresource.cpp:206
Nepomuk2::SimpleResourceGraph
Definition: simpleresourcegraph.h:48
Nepomuk2::OfficeExtractor::OfficeExtractor
OfficeExtractor(QObject *parent, const QVariantList &)
Definition: officeextractor.cpp:31
officeextractor.h
NEPOMUK_EXPORT_EXTRACTOR
#define NEPOMUK_EXPORT_EXTRACTOR(classname, libname)
Export a Nepomuk file extractor.
Definition: extractorplugin.h:163
Nepomuk2::SimpleResource::addType
void addType(const QUrl &type)
A convenience method which adds a property of type rdf:type.
Definition: simpleresource.cpp:257
Nepomuk2::OfficeExtractor
Definition: officeextractor.h:28
This file is part of the KDE documentation.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:48:08 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

Nepomuk-Core

Skip menu "Nepomuk-Core"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  • kjsembed
  •   WTF
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Nepomuk-Core
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal