Baloo

app.cpp
1/*
2 This file is part of the KDE Baloo Project
3 SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
6*/
7
8#include "app.h"
9#include "basicindexingjob.h"
10#include "result.h"
11#include "idutils.h"
12#include "transaction.h"
13#include "baloodebug.h"
14#include "global.h"
15
16#include <QCoreApplication>
17
18#include <QTimer>
19#include <QFileInfo>
20
21#include <KFileMetaData/Extractor>
22#include <KFileMetaData/MimeUtils>
23#include <KIdleTime>
24
25#include <unistd.h> //for STDIN_FILENO
26#include <iostream>
27
28using namespace Baloo;
29
30App::App(QObject* parent)
31 : QObject(parent)
32 , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read)
33 , m_input()
34 , m_output()
35 , m_workerPipe(&m_input, &m_output)
36 , m_tr(nullptr)
37{
38 m_input.open(STDIN_FILENO, QIODevice::ReadOnly | QIODevice::Unbuffered );
39 m_output.open(STDOUT_FILENO, QIODevice::WriteOnly | QIODevice::Unbuffered );
40
41 static int s_idleTimeout = 1000 * 60 * 1; // 1 min
42 m_idleTime = KIdleTime::instance();
43 m_idleTime->addIdleTimeout(s_idleTimeout);
44 connect(m_idleTime, &KIdleTime::resumingFromIdle, this, [this]() {
45 qCInfo(BALOO) << "Busy, paced indexing";
46 m_isBusy = true;
47 });
48 connect(m_idleTime, qOverload<int, int>(&KIdleTime::timeoutReached), this, [this]() {
49 qCInfo(BALOO) << "Not busy, fast indexing";
50 m_isBusy = false;
51 });
52
53 using WorkerPipe = Baloo::Private::WorkerPipe;
54 connect(&m_notifyNewData, &QSocketNotifier::activated, &m_workerPipe, &WorkerPipe::processIdData);
55 connect(&m_workerPipe, &WorkerPipe::newDocumentIds, this, &App::slotNewBatch);
56 connect(&m_workerPipe, &WorkerPipe::inputEnd, this, &QCoreApplication::quit);
57}
58
59App::~App()
60{
61 if (m_tr) {
62 // Abort the transaction in case the parent process exited
63 m_tr->abort();
64 m_tr.reset();
65 }
66}
67
68void App::slotNewBatch(const QVector<quint64>& ids)
69{
70 m_ids = ids;
71
72 Database *db = globalDatabaseInstance();
73 if (!db->open(Database::ReadWriteDatabase)) {
74 qCCritical(BALOO) << "Failed to open the database";
75 exit(1);
76 }
77
78 Q_ASSERT(m_tr == nullptr);
79
80 if (!m_isBusy) {
81 m_idleTime->catchNextResumeEvent();
82 }
83
84 QTimer::singleShot((m_isBusy ? 500 : 0), this, [this, db] () {
85 // FIXME: The transaction is open for way too long. We should just open it for when we're
86 // committing the data not during the extraction.
87 m_tr = std::make_unique<Transaction>(db, Transaction::ReadWrite);
88 processNextFile();
89 });
90
91 /**
92 * A Single Batch seems to be triggering the SocketNotifier more than once
93 * so we disable it till the batch is done.
94 */
95 m_notifyNewData.setEnabled(false);
96}
97
98void App::processNextFile()
99{
100 if (!m_ids.isEmpty()) {
101 quint64 id = m_ids.takeFirst();
102
103 QString url = QFile::decodeName(m_tr->documentUrl(id));
104 if (url.isEmpty() || !QFile::exists(url)) {
105 m_tr->removeDocument(id);
106 QTimer::singleShot(0, this, &App::processNextFile);
107 return;
108 }
109
110 bool indexed = index(m_tr.get(), url, id);
111
112 int delay = (m_isBusy && indexed) ? 10 : 0;
113 QTimer::singleShot(delay, this, &App::processNextFile);
114
115 } else {
116 bool ok = m_tr->commit();
117 if (!ok) {
118 exit(2);
119 }
120 m_tr.reset();
121
122 // Enable the SocketNotifier for the next batch
123 m_notifyNewData.setEnabled(true);
124 m_workerPipe.batchFinished();
125 }
126}
127
128bool App::index(Transaction* tr, const QString& url, quint64 id)
129{
130 if (!m_config.shouldBeIndexed(url)) {
131 // This apparently happens when the config has changed after the document
132 // was added to the content indexing db
133 qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped";
134 tr->removeDocument(id);
135 m_workerPipe.urlFailed(url);
136 return false;
137 }
138
139 // The initial BasicIndexingJob run has been supplied with the file extension
140 // mimetype only, skip based on the "real" mimetype
142 if (!m_config.shouldMimeTypeBeIndexed(mimetype)) {
143 qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;
144 // FIXME: in case the extension based and content based mimetype differ
145 // we should update it.
146 tr->removePhaseOne(id);
147 m_workerPipe.urlFailed(url);
148 return false;
149 }
150
151 // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we
152 // have trouble processing them
153 //
154 if (mimetype.startsWith(QLatin1String("text/"))) {
155 QFileInfo fileInfo(url);
156 if (fileInfo.size() >= 10 * 1024 * 1024) {
157 qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype;
158 tr->removePhaseOne(id);
159 m_workerPipe.urlFailed(url);
160 return false;
161 }
162 }
163 qCDebug(BALOO) << "Indexing" << id << url << mimetype;
164 m_workerPipe.urlStarted(url);
165
166 // We always run the basic indexing again. This is mostly so that the proper
167 // mimetype is set and we get proper type information.
168 // The mimetype fetched in the BasicIndexingJob is fast but not accurate
169 BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);
170 basicIndexer.index();
171
172 Baloo::Document doc = basicIndexer.document();
173
174 Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText);
175 result.setDocument(doc);
176
177 const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);
178
179 for (KFileMetaData::Extractor* ex : exList) {
180 ex->extract(&result);
181 }
182
183 result.finish();
184 if (doc.id() != id) {
185 qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";
186 tr->removeDocument(id);
187 if (!tr->hasDocument(doc.id())) {
188 tr->addDocument(result.document());
189 } else {
190 tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
191 }
192 } else {
193 tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
194 }
195 tr->removePhaseOne(doc.id());
196 m_workerPipe.urlFinished(url);
197 return true;
198}
199
200#include "moc_app.cpp"
A document represents an indexed file to be stored in the Baloo engine.
Definition document.h:31
bool shouldMimeTypeBeIndexed(const QString &mimeType) const
Checks if mimeType should be indexed.
bool shouldBeIndexed(const QString &path) const
Check if file or folder path should be indexed taking into account the includeFolders(),...
QList< Extractor * > fetchExtractors(const QString &mimetype) const
void catchNextResumeEvent()
void timeoutReached(int identifier, int msec)
void resumingFromIdle()
static KIdleTime * instance()
The result class is where all the data extracted by the KFileMetaData extractors is saved to.
Definition result.h:27
Implements storage for docIds without any associated data Instantiated for:
Definition coding.cpp:11
QMimeType strictMimeType(const QString &filePath, const QMimeDatabase &db)
KIOCORE_EXPORT MimetypeJob * mimetype(const QUrl &url, JobFlags flags=DefaultFlags)
QString decodeName(const QByteArray &localFileName)
bool exists() const
QString tr(const char *sourceText, const char *disambiguation, int n)
void activated(QSocketDescriptor socket, QSocketNotifier::Type type)
void setEnabled(bool enable)
bool isEmpty() const
QFuture< ArgsType< Signal > > connect(Sender *sender, Signal signal)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri Jul 26 2024 11:52:28 by doxygen 1.11.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.