KFileMetaData

officeextractor.cpp
1/*
2 This file is part of a KMetaData File Extractor
3 SPDX-FileCopyrightText: 2013 Denis Steckelmacher <steckdenis@yahoo.fr>
4
5 SPDX-License-Identifier: LGPL-2.1-or-later
6*/
7
8#include "officeextractor.h"
9#include "kfilemetadata_debug.h"
10
11#include <QRegularExpression>
12#include <QStandardPaths>
13
14#include <QProcess>
15
16using namespace KFileMetaData;
17
18OfficeExtractor::OfficeExtractor(QObject* parent)
19 : ExtractorPlugin(parent)
20{
21 // Find the executables of catdoc, catppt and xls2csv. If an executable cannot
22 // be found, indexing its corresponding MIME type will be disabled
23 findExe(QStringLiteral("application/msword"), QStringLiteral("catdoc"), m_catdoc);
24 findExe(QStringLiteral("application/vnd.ms-excel"), QStringLiteral("xls2csv"), m_xls2csv);
25 findExe(QStringLiteral("application/vnd.ms-powerpoint"), QStringLiteral("catppt"), m_catppt);
26}
27
28void OfficeExtractor::findExe(const QString& mimeType, const QString& name, QString& fullPath)
29{
30 fullPath = QStandardPaths::findExecutable(name);
31
32 if (!fullPath.isEmpty()) {
33 m_available_mime_types << mimeType;
34 } else {
35 qCDebug(KFILEMETADATA_LOG) << "Could not find executable in PATH:" << name;
36 }
37}
38
39QStringList OfficeExtractor::mimetypes() const
40{
41 return m_available_mime_types;
42}
43
44
45void OfficeExtractor::extract(ExtractionResult* result)
46{
47 QStringList args;
48
49 args << QStringLiteral("-s") << QStringLiteral("cp1252"); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ?
50 args << QStringLiteral("-d") << QStringLiteral("utf8");
51
52 const bool extractPlainText = result->inputFlags() & ExtractionResult::ExtractPlainText;
53
54 const QString fileUrl = result->inputUrl();
55 const QString mimeType = result->inputMimetype();
56 if (mimeType == QLatin1String("application/msword")) {
57 result->addType(Type::Document);
58
59 if (!extractPlainText) {
60 return;
61 }
62
63 args << QStringLiteral("-w");
64 if (const auto contents = textFromFile(fileUrl, m_catdoc, args); !contents.isEmpty()) {
65 // Now that we have the plain text content, count words, lines and characters
66 // (original code from plaintextextractor.cpp, authored by Vishesh Handa)
67 int lines = contents.count(QLatin1Char('\n'));
68 int words = contents.count(QRegularExpression(QStringLiteral("\\b\\w+\\b"), QRegularExpression::UseUnicodePropertiesOption));
69
70 result->add(Property::WordCount, words);
71 result->add(Property::LineCount, lines);
72 result->append(contents);
73 }
74 } else if (mimeType == QLatin1String("application/vnd.ms-excel")) {
75 result->addType(Type::Document);
77
78 if (!extractPlainText) {
79 return;
80 }
81
82 args << QStringLiteral("-c") << QStringLiteral(" ");
83 args << QStringLiteral("-b") << QStringLiteral(" ");
84 args << QStringLiteral("-q") << QStringLiteral("0");
85 if (const auto contents = textFromFile(fileUrl, m_xls2csv, args); !contents.isEmpty()) {
86 result->append(contents);
87 }
88 } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint")) {
89 result->addType(Type::Document);
91
92 if (!extractPlainText) {
93 return;
94 }
95
96 if (const auto contents = textFromFile(fileUrl, m_catppt, args); !contents.isEmpty()) {
97 result->append(contents);
98 }
99 }
100}
101
102QString OfficeExtractor::textFromFile(const QString& fileUrl, const QString& command, QStringList& arguments)
103{
104 if (command.isEmpty()) {
105 return {};
106 }
107
108 arguments << fileUrl;
109
110 // Start a process and read its standard output
111 QProcess process;
112
114 process.start(command, arguments, QIODevice::ReadOnly);
115 process.waitForFinished();
116
117 if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0) {
118 return QString();
119 } else {
120 return QString::fromUtf8(process.readAll());
121 }
122}
123
124#include "moc_officeextractor.cpp"
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString inputMimetype() const
The input MIME type.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
KCALUTILS_EXPORT QString mimeType()
@ WordCount
The number of words in a document.
Definition properties.h:145
@ LineCount
The number of lines in a document.
Definition properties.h:151
@ Document
Any file which counts as a document.
Definition types.h:63
@ Presentation
A Presentation file.
Definition types.h:75
@ Spreadsheet
A SpreadSheet file.
Definition types.h:69
The KFileMetaData namespace.
QString name(StandardAction id)
QByteArray readAll()
int exitCode() const const
QProcess::ExitStatus exitStatus() const const
void setReadChannel(ProcessChannel channel)
void start(OpenMode mode)
bool waitForFinished(int msecs)
QString findExecutable(const QString &executableName, const QStringList &paths)
QString fromUtf8(QByteArrayView str)
bool isEmpty() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:48:11 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.