Baloo

app.cpp
1/*
2 This file is part of the KDE Baloo Project
3 SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
6*/
7
8#include "app.h"
9#include "basicindexingjob.h"
10#include "result.h"
11#include "idutils.h"
12#include "transaction.h"
13#include "baloodebug.h"
14#include "global.h"
15
16#include <QCoreApplication>
17
18#include <QTimer>
19#include <QFileInfo>
20
21#include <KFileMetaData/Extractor>
22#include <KFileMetaData/MimeUtils>
23#include <KIdleTime>
24
25#include <unistd.h> //for STDIN_FILENO
26#include <iostream>
27
28using namespace Baloo;
29
30App::App(QObject* parent)
31 : QObject(parent)
32 , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read)
33 , m_input()
34 , m_output()
35 , m_workerPipe(&m_input, &m_output)
36 , m_tr(nullptr)
37{
40
41 static int s_idleTimeout = 1000 * 60 * 1; // 1 min
42 m_idleTime = KIdleTime::instance();
43 m_idleTime->addIdleTimeout(s_idleTimeout);
44 connect(m_idleTime, &KIdleTime::resumingFromIdle, this, [this]() {
45 qCInfo(BALOO) << "Busy, paced indexing";
46 m_isBusy = true;
47 });
48 connect(m_idleTime, qOverload<int, int>(&KIdleTime::timeoutReached), this, [this]() {
49 qCInfo(BALOO) << "Not busy, fast indexing";
50 m_isBusy = false;
51 });
52
53 using WorkerPipe = Baloo::Private::WorkerPipe;
54 connect(&m_notifyNewData, &QSocketNotifier::activated, &m_workerPipe, &WorkerPipe::processIdData);
55 connect(&m_workerPipe, &WorkerPipe::newDocumentIds, this, &App::slotNewBatch);
56 connect(&m_workerPipe, &WorkerPipe::inputEnd, this, &QCoreApplication::quit);
57}
58
59void App::slotNewBatch(const QVector<quint64>& ids)
60{
61 m_ids = ids;
62
63 Database *db = globalDatabaseInstance();
64 if (!db->open(Database::ReadWriteDatabase)) {
65 qCCritical(BALOO) << "Failed to open the database";
66 exit(1);
67 }
68
69 Q_ASSERT(m_tr == nullptr);
70
71 if (!m_isBusy) {
72 m_idleTime->catchNextResumeEvent();
73 }
74
75 QTimer::singleShot((m_isBusy ? 500 : 0), this, [this, db] () {
76 // FIXME: The transaction is open for way too long. We should just open it for when we're
77 // committing the data not during the extraction.
78 m_tr = new Transaction(db, Transaction::ReadWrite);
79 processNextFile();
80 });
81
82 /**
83 * A Single Batch seems to be triggering the SocketNotifier more than once
84 * so we disable it till the batch is done.
85 */
86 m_notifyNewData.setEnabled(false);
87}
88
89void App::processNextFile()
90{
91 if (!m_ids.isEmpty()) {
92 quint64 id = m_ids.takeFirst();
93
94 QString url = QFile::decodeName(m_tr->documentUrl(id));
95 if (url.isEmpty() || !QFile::exists(url)) {
96 m_tr->removeDocument(id);
97 QTimer::singleShot(0, this, &App::processNextFile);
98 return;
99 }
100
101 bool indexed = index(m_tr, url, id);
102
103 int delay = (m_isBusy && indexed) ? 10 : 0;
104 QTimer::singleShot(delay, this, &App::processNextFile);
105
106 } else {
107 bool ok = m_tr->commit();
108 if (!ok) {
109 exit(2);
110 }
111 delete m_tr;
112 m_tr = nullptr;
113
114 // Enable the SocketNotifier for the next batch
115 m_notifyNewData.setEnabled(true);
116 m_workerPipe.batchFinished();
117 }
118}
119
120bool App::index(Transaction* tr, const QString& url, quint64 id)
121{
122 if (!m_config.shouldBeIndexed(url)) {
123 // This apparently happens when the config has changed after the document
124 // was added to the content indexing db
125 qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped";
126 tr->removeDocument(id);
127 m_workerPipe.urlFailed(url);
128 return false;
129 }
130
131 // The initial BasicIndexingJob run has been supplied with the file extension
132 // mimetype only, skip based on the "real" mimetype
133 QString mimetype = KFileMetaData::MimeUtils::strictMimeType(url, m_mimeDb).name();
134 if (!m_config.shouldMimeTypeBeIndexed(mimetype)) {
135 qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;
136 // FIXME: in case the extension based and content based mimetype differ
137 // we should update it.
138 tr->removePhaseOne(id);
139 m_workerPipe.urlFailed(url);
140 return false;
141 }
142
143 // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we
144 // have trouble processing them
145 //
146 if (mimetype.startsWith(QLatin1String("text/"))) {
147 QFileInfo fileInfo(url);
148 if (fileInfo.size() >= 10 * 1024 * 1024) {
149 qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype;
150 tr->removePhaseOne(id);
151 m_workerPipe.urlFailed(url);
152 return false;
153 }
154 }
155 qCDebug(BALOO) << "Indexing" << id << url << mimetype;
156 m_workerPipe.urlStarted(url);
157
158 // We always run the basic indexing again. This is mostly so that the proper
159 // mimetype is set and we get proper type information.
160 // The mimetype fetched in the BasicIndexingJob is fast but not accurate
161 BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);
162 basicIndexer.index();
163
164 Baloo::Document doc = basicIndexer.document();
165
166 Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText);
167 result.setDocument(doc);
168
169 const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);
170
171 for (KFileMetaData::Extractor* ex : exList) {
172 ex->extract(&result);
173 }
174
175 result.finish();
176 if (doc.id() != id) {
177 qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";
178 tr->removeDocument(id);
179 if (!tr->hasDocument(doc.id())) {
180 tr->addDocument(result.document());
181 } else {
182 tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
183 }
184 } else {
185 tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
186 }
187 tr->removePhaseOne(doc.id());
188 m_workerPipe.urlFinished(url);
189 return true;
190}
191
192#include "moc_app.cpp"
A document represents an indexed file to be stored in the Baloo engine.
Definition document.h:31
bool shouldMimeTypeBeIndexed(const QString &mimeType) const
Checks if mimeType should be indexed.
bool shouldBeIndexed(const QString &path) const
Check if file or folder path should be indexed taking into account the includeFolders(),...
QList< Extractor * > fetchExtractors(const QString &mimetype) const
void catchNextResumeEvent()
void timeoutReached(int identifier, int msec)
void resumingFromIdle()
static KIdleTime * instance()
The result class is where all the data extracted by the KFileMetaData extractors is saved to.
Definition result.h:27
Implements storage for docIds without any associated data Instantiated for:
Definition coding.cpp:11
KIOCORE_EXPORT MimetypeJob * mimetype(const QUrl &url, JobFlags flags=DefaultFlags)
QString decodeName(const QByteArray &localFileName)
bool exists() const
bool startsWith(parameter_type value) const
QString tr(const char *sourceText, const char *disambiguation, int n)
void activated(QSocketDescriptor socket, QSocketNotifier::Type type)
void setEnabled(bool enable)
bool isEmpty() const
QFuture< ArgsType< Signal > > connect(Sender *sender, Signal signal)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:20:16 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.