Baloo

app.cpp
1 /*
2  This file is part of the KDE Baloo Project
3  SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <[email protected]>
4 
5  SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
6 */
7 
8 #include "app.h"
9 #include "basicindexingjob.h"
10 #include "result.h"
11 #include "idutils.h"
12 #include "transaction.h"
13 #include "baloodebug.h"
14 #include "global.h"
15 
16 #include <QCoreApplication>
17 
18 #include <QTimer>
19 #include <QFileInfo>
20 
21 #include <KFileMetaData/Extractor>
22 #include <KFileMetaData/MimeUtils>
23 #include <KIdleTime>
24 
25 #include <unistd.h> //for STDIN_FILENO
26 #include <iostream>
27 
28 using namespace Baloo;
29 
30 App::App(QObject* parent)
31  : QObject(parent)
32  , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read)
33  , m_input()
34  , m_output()
35  , m_workerPipe(&m_input, &m_output)
36  , m_tr(nullptr)
37 {
38  m_input.open(STDIN_FILENO, QIODevice::ReadOnly | QIODevice::Unbuffered );
39  m_output.open(STDOUT_FILENO, QIODevice::WriteOnly | QIODevice::Unbuffered );
40 
41  static int s_idleTimeout = 1000 * 60 * 1; // 1 min
42  m_idleTime = KIdleTime::instance();
43  m_idleTime->addIdleTimeout(s_idleTimeout);
44  connect(m_idleTime, &KIdleTime::resumingFromIdle, this, [this]() {
45  qCInfo(BALOO) << "Busy, paced indexing";
46  m_isBusy = true;
47  });
48  connect(m_idleTime, qOverload<int, int>(&KIdleTime::timeoutReached), this, [this]() {
49  qCInfo(BALOO) << "Not busy, fast indexing";
50  m_isBusy = false;
51  });
52 
53  using WorkerPipe = Baloo::Private::WorkerPipe;
54  connect(&m_notifyNewData, &QSocketNotifier::activated, &m_workerPipe, &WorkerPipe::processIdData);
55  connect(&m_workerPipe, &WorkerPipe::newDocumentIds, this, &App::slotNewBatch);
56  connect(&m_workerPipe, &WorkerPipe::inputEnd, this, &QCoreApplication::quit);
57 }
58 
59 void App::slotNewBatch(const QVector<quint64>& ids)
60 {
61  m_ids = ids;
62 
63  Database *db = globalDatabaseInstance();
64  if (!db->open(Database::ReadWriteDatabase)) {
65  qCCritical(BALOO) << "Failed to open the database";
66  exit(1);
67  }
68 
69  Q_ASSERT(m_tr == nullptr);
70 
71  if (!m_isBusy) {
72  m_idleTime->catchNextResumeEvent();
73  }
74 
75  QTimer::singleShot((m_isBusy ? 500 : 0), this, [this, db] () {
76  // FIXME: The transaction is open for way too long. We should just open it for when we're
77  // committing the data not during the extraction.
78  m_tr = new Transaction(db, Transaction::ReadWrite);
79  processNextFile();
80  });
81 
82  /**
83  * A Single Batch seems to be triggering the SocketNotifier more than once
84  * so we disable it till the batch is done.
85  */
86  m_notifyNewData.setEnabled(false);
87 }
88 
89 void App::processNextFile()
90 {
91  if (!m_ids.isEmpty()) {
92  quint64 id = m_ids.takeFirst();
93 
94  QString url = QFile::decodeName(m_tr->documentUrl(id));
95  if (url.isEmpty() || !QFile::exists(url)) {
96  m_tr->removeDocument(id);
97  QTimer::singleShot(0, this, &App::processNextFile);
98  return;
99  }
100 
101  bool indexed = index(m_tr, url, id);
102 
103  int delay = (m_isBusy && indexed) ? 10 : 0;
104  QTimer::singleShot(delay, this, &App::processNextFile);
105 
106  } else {
107  bool ok = m_tr->commit();
108  if (!ok) {
109  exit(2);
110  }
111  delete m_tr;
112  m_tr = nullptr;
113 
114  // Enable the SocketNotifier for the next batch
115  m_notifyNewData.setEnabled(true);
116  m_workerPipe.batchFinished();
117  }
118 }
119 
120 bool App::index(Transaction* tr, const QString& url, quint64 id)
121 {
122  if (!m_config.shouldBeIndexed(url)) {
123  // This apparently happens when the config has changed after the document
124  // was added to the content indexing db
125  qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped";
126  tr->removeDocument(id);
127  m_workerPipe.urlFailed(url);
128  return false;
129  }
130 
131  // The initial BasicIndexingJob run has been supplied with the file extension
132  // mimetype only, skip based on the "real" mimetype
133  QString mimetype = KFileMetaData::MimeUtils::strictMimeType(url, m_mimeDb).name();
134  if (!m_config.shouldMimeTypeBeIndexed(mimetype)) {
135  qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;
136  // FIXME: in case the extension based and content based mimetype differ
137  // we should update it.
138  tr->removePhaseOne(id);
139  m_workerPipe.urlFailed(url);
140  return false;
141  }
142 
143  // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we
144  // have trouble processing them
145  //
146  if (mimetype.startsWith(QLatin1String("text/"))) {
147  QFileInfo fileInfo(url);
148  if (fileInfo.size() >= 10 * 1024 * 1024) {
149  qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype;
150  tr->removePhaseOne(id);
151  m_workerPipe.urlFailed(url);
152  return false;
153  }
154  }
155  qCDebug(BALOO) << "Indexing" << id << url << mimetype;
156  m_workerPipe.urlStarted(url);
157 
158  // We always run the basic indexing again. This is mostly so that the proper
159  // mimetype is set and we get proper type information.
160  // The mimetype fetched in the BasicIndexingJob is fast but not accurate
161  BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);
162  basicIndexer.index();
163 
164  Baloo::Document doc = basicIndexer.document();
165 
166  Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText);
167  result.setDocument(doc);
168 
169  const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);
170 
171  for (KFileMetaData::Extractor* ex : exList) {
172  ex->extract(&result);
173  }
174 
175  result.finish();
176  if (doc.id() != id) {
177  qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";
178  tr->removeDocument(id);
179  if (!tr->hasDocument(doc.id())) {
180  tr->addDocument(result.document());
181  } else {
182  tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
183  }
184  } else {
185  tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
186  }
187  tr->removePhaseOne(doc.id());
188  m_workerPipe.urlFinished(url);
189  return true;
190 }
191 
192 #include "moc_app.cpp"
static KIdleTime * instance()
void resumingFromIdle()
bool exists() const
KIOCORE_EXPORT MimetypeJob * mimetype(const QUrl &url, JobFlags flags=DefaultFlags)
The result class is where all the data extracted by the KFileMetaData extractors is saved to....
Definition: result.h:26
Implements storage for docIds without any associated data Instantiated for:
Definition: coding.cpp:11
bool isEmpty() const
void timeoutReached(int identifier, int msec)
A document represents an indexed file to be stored in the Baloo engine.
Definition: document.h:30
void activated(QSocketDescriptor socket, QSocketNotifier::Type type)
QString decodeName(const QByteArray &localFileName)
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Dec 11 2023 03:53:56 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.