KFileMetaData

officeextractor.cpp
1 /*
2  This file is part of a KMetaData File Extractor
3  Copyright (C) 2013 Denis Steckelmacher <[email protected]>
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License as published by the Free Software Foundation; either
8  version 2.1 of the License, or (at your option) any later version.
9 
10  This library is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with this library; if not, write to the Free Software
17  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19 
20 #include "officeextractor.h"
21 
22 #include <QRegularExpression>
23 #include <QStandardPaths>
24 
25 #include <QProcess>
26 
27 using namespace KFileMetaData;
28 
29 OfficeExtractor::OfficeExtractor(QObject* parent)
30  : ExtractorPlugin(parent)
31 {
32  // Find the executables of catdoc, catppt and xls2csv. If an executable cannot
33  // be found, indexing its corresponding MIME type will be disabled
34  findExe(QStringLiteral("application/msword"), QStringLiteral("catdoc"), m_catdoc);
35  findExe(QStringLiteral("application/vnd.ms-excel"), QStringLiteral("xls2csv"), m_xls2csv);
36  findExe(QStringLiteral("application/vnd.ms-powerpoint"), QStringLiteral("catppt"), m_catppt);
37 }
38 
39 void OfficeExtractor::findExe(const QString& mimeType, const QString& name, QString& fullPath)
40 {
41  fullPath = QStandardPaths::findExecutable(name);
42 
43  if (!fullPath.isEmpty()) {
44  m_available_mime_types << mimeType;
45  }
46 }
47 
48 QStringList OfficeExtractor::mimetypes() const
49 {
50  return m_available_mime_types;
51 }
52 
53 
54 void OfficeExtractor::extract(ExtractionResult* result)
55 {
56  QStringList args;
57  QString contents;
58 
59  args << QStringLiteral("-s") << QStringLiteral("cp1252"); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ?
60  args << QStringLiteral("-d") << QStringLiteral("utf8");
61 
62  const QString fileUrl = result->inputUrl();
63  const QString mimeType = result->inputMimetype();
64  if (mimeType == QLatin1String("application/msword")) {
65  result->addType(Type::Document);
66 
67  args << QStringLiteral("-w");
68  contents = textFromFile(fileUrl, m_catdoc, args);
69 
70  // Now that we have the plain text content, count words, lines and characters
71  // (original code from plaintextextractor.cpp, authored by Vishesh Handa)
72  int lines = contents.count(QLatin1Char('\n'));
73  int words = contents.count(QRegularExpression(QStringLiteral("\\b\\w+\\b")));
74 
75  result->add(Property::WordCount, words);
76  result->add(Property::LineCount, lines);
77  } else if (mimeType == QLatin1String("application/vnd.ms-excel")) {
78  result->addType(Type::Document);
79  result->addType(Type::Spreadsheet);
80 
81  args << QStringLiteral("-c") << QStringLiteral(" ");
82  args << QStringLiteral("-b") << QStringLiteral(" ");
83  args << QStringLiteral("-q") << QStringLiteral("0");
84  contents = textFromFile(fileUrl, m_xls2csv, args);
85  } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint")) {
86  result->addType(Type::Document);
87  result->addType(Type::Presentation);
88 
89  contents = textFromFile(fileUrl, m_catppt, args);
90  }
91 
92  if (contents.isEmpty())
93  return;
94 
95  result->append(contents);
96 
97  return;
98 }
99 
100 QString OfficeExtractor::textFromFile(const QString& fileUrl, const QString& command, QStringList& arguments)
101 {
102  arguments << fileUrl;
103 
104  // Start a process and read its standard output
105  QProcess process;
106 
107  process.setReadChannel(QProcess::StandardOutput);
108  process.start(command, arguments, QIODevice::ReadOnly);
109  process.waitForFinished();
110 
111  if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0)
112  return QString();
113  else
114  return QString::fromUtf8(process.readAll());
115 }
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
The ExtractorPlugin is the base class for all file metadata extractors.
QString findExecutable(const QString &executableName, const QStringList &paths)
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString fromUtf8(const char *str, int size)
bool isEmpty() const
QByteArray readAll()
QString inputMimetype() const
The input mimetype.
int count() const
void setReadChannel(QProcess::ProcessChannel channel)
The ExtractionResult class is where all the data extracted by the indexer is saved.
QProcess::ExitStatus exitStatus() const
int exitCode() const
void start(const QString &program, const QStringList &arguments, QIODevice::OpenMode mode)
QString inputUrl() const
The input url which the plugins will use to locate the file.
bool waitForFinished(int msecs)
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Wed May 27 2020 23:08:35 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.