KFileMetaData

officeextractor.cpp
1 /*
2  This file is part of a KMetaData File Extractor
3  SPDX-FileCopyrightText: 2013 Denis Steckelmacher <[email protected]>
4 
5  SPDX-License-Identifier: LGPL-2.1-or-later
6 */
7 
8 #include "officeextractor.h"
9 #include "kfilemetadata_debug.h"
10 
11 #include <QRegularExpression>
12 #include <QStandardPaths>
13 
14 #include <QProcess>
15 
16 using namespace KFileMetaData;
17 
18 OfficeExtractor::OfficeExtractor(QObject* parent)
19  : ExtractorPlugin(parent)
20 {
21  // Find the executables of catdoc, catppt and xls2csv. If an executable cannot
22  // be found, indexing its corresponding MIME type will be disabled
23  findExe(QStringLiteral("application/msword"), QStringLiteral("catdoc"), m_catdoc);
24  findExe(QStringLiteral("application/vnd.ms-excel"), QStringLiteral("xls2csv"), m_xls2csv);
25  findExe(QStringLiteral("application/vnd.ms-powerpoint"), QStringLiteral("catppt"), m_catppt);
26 }
27 
28 void OfficeExtractor::findExe(const QString& mimeType, const QString& name, QString& fullPath)
29 {
30  fullPath = QStandardPaths::findExecutable(name);
31 
32  if (!fullPath.isEmpty()) {
33  m_available_mime_types << mimeType;
34  }
35 }
36 
37 QStringList OfficeExtractor::mimetypes() const
38 {
39  return m_available_mime_types;
40 }
41 
42 
43 void OfficeExtractor::extract(ExtractionResult* result)
44 {
45  QStringList args;
46  QString contents;
47 
48  args << QStringLiteral("-s") << QStringLiteral("cp1252"); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ?
49  args << QStringLiteral("-d") << QStringLiteral("utf8");
50 
51  const QString fileUrl = result->inputUrl();
52  const QString mimeType = result->inputMimetype();
53  if (mimeType == QLatin1String("application/msword")) {
54  result->addType(Type::Document);
55 
56  args << QStringLiteral("-w");
57  contents = textFromFile(fileUrl, m_catdoc, args);
58 
59  // Now that we have the plain text content, count words, lines and characters
60  // (original code from plaintextextractor.cpp, authored by Vishesh Handa)
61  int lines = contents.count(QLatin1Char('\n'));
62  int words = contents.count(QRegularExpression(QStringLiteral("\\b\\w+\\b"), QRegularExpression::UseUnicodePropertiesOption));
63 
64  result->add(Property::WordCount, words);
65  result->add(Property::LineCount, lines);
66  } else if (mimeType == QLatin1String("application/vnd.ms-excel")) {
67  result->addType(Type::Document);
68  result->addType(Type::Spreadsheet);
69 
70  args << QStringLiteral("-c") << QStringLiteral(" ");
71  args << QStringLiteral("-b") << QStringLiteral(" ");
72  args << QStringLiteral("-q") << QStringLiteral("0");
73  contents = textFromFile(fileUrl, m_xls2csv, args);
74  } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint")) {
75  result->addType(Type::Document);
76  result->addType(Type::Presentation);
77 
78  contents = textFromFile(fileUrl, m_catppt, args);
79  }
80 
81  if (contents.isEmpty()) {
82  return;
83  }
84 
85  result->append(contents);
86 
87  return;
88 }
89 
90 QString OfficeExtractor::textFromFile(const QString& fileUrl, const QString& command, QStringList& arguments)
91 {
92  const QString exec = QStandardPaths::findExecutable(command);
93  if (exec.isEmpty()) {
94  qCDebug(KFILEMETADATA_LOG) << "Could not find executable in PATH:" << command;
95  return {};
96  }
97 
98  arguments << fileUrl;
99 
100  // Start a process and read its standard output
101  QProcess process;
102 
104  process.start(exec, arguments, QIODevice::ReadOnly);
105  process.waitForFinished();
106 
107  if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0) {
108  return QString();
109  } else {
110  return QString::fromUtf8(process.readAll());
111  }
112 }
void start(const QString &program, const QStringList &arguments, QIODevice::OpenMode mode)
virtual void addType(Type::Type type)=0
This function is called by the plugins.
The ExtractionResult class is where all the data extracted by the indexer is saved....
QString fromUtf8(const char *str, int size)
bool waitForFinished(int msecs)
QString inputMimetype() const
The input mimetype.
KCALUTILS_EXPORT QString mimeType()
QString inputUrl() const
The input url which the plugins will use to locate the file.
QString findExecutable(const QString &executableName, const QStringList &paths)
QProcess::ExitStatus exitStatus() const
bool isEmpty() const
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
int count() const
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
The ExtractorPlugin is the base class for all file metadata extractors. It is responsible for extract...
QByteArray readAll()
void setReadChannel(QProcess::ProcessChannel channel)
int exitCode() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Thu May 26 2022 03:46:07 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.