Baloo

transaction.cpp
1 /*
2  This file is part of the KDE Baloo project.
3  SPDX-FileCopyrightText: 2015 Vishesh Handa <[email protected]>
4 
5  SPDX-License-Identifier: LGPL-2.1-or-later
6 */
7 
8 #include "transaction.h"
9 #include "documentdb.h"
10 #include "documenturldb.h"
11 #include "documentiddb.h"
12 #include "positiondb.h"
13 #include "documentdatadb.h"
14 
15 #include "document.h"
16 #include "enginequery.h"
17 
18 #include "andpostingiterator.h"
19 #include "orpostingiterator.h"
20 #include "phraseanditerator.h"
21 
22 #include "idutils.h"
23 #include "database.h"
24 #include "databasesize.h"
25 
26 #include "enginedebug.h"
27 
28 #include <QFile>
29 #include <QFileInfo>
30 
31 #include <iostream>
32 
33 using namespace Baloo;
34 
35 Transaction::Transaction(const Database& db, Transaction::TransactionType type)
36  : m_dbis(db.m_dbis)
37  , m_env(db.m_env)
38 {
39  init(type);
40 }
41 
42 void Transaction::reset(TransactionType type)
43 {
44  if (m_txn) {
45  qWarning(ENGINE) << "Resetting a Transaction without calling abort/commit";
46  abort();
47  }
48  init(type);
49 }
50 
51 void Transaction::init(TransactionType type)
52 {
53  uint flags = type == ReadOnly ? MDB_RDONLY : 0;
54  int rc = mdb_txn_begin(m_env, nullptr, flags, &m_txn);
55  if (rc) {
56  qCDebug(ENGINE) << "Transaction" << mdb_strerror(rc);
57  return;
58  }
59 
60  if (type == ReadWrite) {
61  m_writeTrans = std::make_unique<WriteTransaction>(m_dbis, m_txn);
62  }
63 }
64 
65 Transaction::Transaction(Database* db, Transaction::TransactionType type)
66  : Transaction(*db, type)
67 {
68 }
69 
70 Transaction::~Transaction()
71 {
72  if (m_writeTrans) {
73  qWarning(ENGINE) << "Closing an active WriteTransaction without calling abort/commit";
74  }
75 
76  if (m_txn) {
77  abort();
78  }
79 }
80 
81 bool Transaction::hasDocument(quint64 id) const
82 {
83  Q_ASSERT(id > 0);
84 
85  DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
86  return docUrlDb.contains(id);
87 }
88 
89 bool Transaction::inPhaseOne(quint64 id) const
90 {
91  Q_ASSERT(id > 0);
92  DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
93  return contentIndexingDb.contains(id);
94 }
95 
96 bool Transaction::hasFailed(quint64 id) const
97 {
98  Q_ASSERT(id > 0);
99  DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
100  return failedIdDb.contains(id);
101 }
102 
103 QVector<quint64> Transaction::failedIds(quint64 limit) const
104 {
105  DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
106  return failedIdDb.fetchItems(limit);
107 }
108 
109 QByteArray Transaction::documentUrl(quint64 id) const
110 {
111  Q_ASSERT(m_txn);
112  Q_ASSERT(id > 0);
113 
114  DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
115  return docUrlDb.get(id);
116 }
117 
118 quint64 Transaction::documentId(const QByteArray& path) const
119 {
120  Q_ASSERT(m_txn);
121  Q_ASSERT(!path.isEmpty());
122 
123  DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
124  QList<QByteArray> li = path.split('/');
125 
126  quint64 parentId = 0;
127  for (const QByteArray& fileName : li) {
128  if (fileName.isEmpty()) {
129  continue;
130  }
131 
132  parentId = docUrlDb.getId(parentId, fileName);
133  if (!parentId) {
134  return 0;
135  }
136  }
137 
138  return parentId;
139 }
140 
141 DocumentTimeDB::TimeInfo Transaction::documentTimeInfo(quint64 id) const
142 {
143  Q_ASSERT(m_txn);
144 
145  DocumentTimeDB docTimeDb(m_dbis.docTimeDbi, m_txn);
146  return docTimeDb.get(id);
147 }
148 
149 QByteArray Transaction::documentData(quint64 id) const
150 {
151  Q_ASSERT(m_txn);
152  Q_ASSERT(id > 0);
153 
154  DocumentDataDB docDataDb(m_dbis.docDataDbi, m_txn);
155  return docDataDb.get(id);
156 }
157 
158 QVector<quint64> Transaction::fetchPhaseOneIds(int size) const
159 {
160  Q_ASSERT(m_txn);
161  Q_ASSERT(size > 0);
162 
163  DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
164  return contentIndexingDb.fetchItems(size);
165 }
166 
167 QVector<QByteArray> Transaction::fetchTermsStartingWith(const QByteArray& term) const
168 {
169  Q_ASSERT(term.size() > 0);
170 
171  PostingDB postingDb(m_dbis.postingDbi, m_txn);
172  return postingDb.fetchTermsStartingWith(term);
173 }
174 
175 uint Transaction::phaseOneSize() const
176 {
177  Q_ASSERT(m_txn);
178 
179  DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
180  return contentIndexingDb.size();
181 }
182 
183 uint Transaction::size() const
184 {
185  Q_ASSERT(m_txn);
186 
187  DocumentDB docTermsDb(m_dbis.docTermsDbi, m_txn);
188  return docTermsDb.size();
189 }
190 
191 //
192 // Write Operations
193 //
194 void Transaction::setPhaseOne(quint64 id)
195 {
196  Q_ASSERT(m_txn);
197  Q_ASSERT(id > 0);
198  Q_ASSERT(m_writeTrans);
199 
200  DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
201  contentIndexingDb.put(id);
202 }
203 
204 void Transaction::removePhaseOne(quint64 id)
205 {
206  Q_ASSERT(m_txn);
207  Q_ASSERT(id > 0);
208  Q_ASSERT(m_writeTrans);
209 
210  DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
211  contentIndexingDb.del(id);
212 }
213 
214 void Transaction::addFailed(quint64 id)
215 {
216  Q_ASSERT(m_txn);
217  Q_ASSERT(id > 0);
218  Q_ASSERT(m_writeTrans);
219 
220  DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
221  failedIdDb.put(id);
222 }
223 
224 void Transaction::addDocument(const Document& doc)
225 {
226  Q_ASSERT(m_txn);
227  Q_ASSERT(doc.id() > 0);
228  if (!m_writeTrans) {
229  qCWarning(ENGINE) << "m_writeTrans is null";
230  return;
231  }
232 
233  m_writeTrans->addDocument(doc);
234 }
235 
236 void Transaction::removeDocument(quint64 id)
237 {
238  Q_ASSERT(m_txn);
239  Q_ASSERT(id > 0);
240  if (!m_writeTrans) {
241  qCWarning(ENGINE) << "m_writeTrans is null";
242  return;
243  }
244 
245  m_writeTrans->removeDocument(id);
246 }
247 
248 void Transaction::removeRecursively(quint64 id)
249 {
250  Q_ASSERT(m_txn);
251  Q_ASSERT(id > 0);
252  if (!m_writeTrans) {
253  qCWarning(ENGINE) << "m_writeTrans is null";
254  return;
255  }
256 
257  m_writeTrans->removeRecursively(id);
258 }
259 
260 void Transaction::replaceDocument(const Document& doc, DocumentOperations operations)
261 {
262  Q_ASSERT(m_txn);
263  Q_ASSERT(doc.id() > 0);
264  Q_ASSERT(m_writeTrans);
265  if (!hasDocument(doc.id())) {
266  qCDebug(ENGINE) << "Transaction::replaceDocument" << "Document does not exist";
267  }
268 
269  if (!m_writeTrans) {
270  qCWarning(ENGINE) << "m_writeTrans is null";
271  return;
272  }
273 
274  m_writeTrans->replaceDocument(doc, operations);
275 }
276 
277 bool Transaction::commit()
278 {
279  Q_ASSERT(m_txn);
280  if (!m_writeTrans) {
281  qCWarning(ENGINE) << "m_writeTrans is null";
282  return false;
283  }
284 
285  m_writeTrans->commit();
286  m_writeTrans.reset();
287 
288  int rc = mdb_txn_commit(m_txn);
289  m_txn = nullptr;
290 
291  if (rc) {
292  qCWarning(ENGINE) << "Transaction::commit" << mdb_strerror(rc);
293  return false;
294  }
295 
296  return true;
297 }
298 
299 void Transaction::abort()
300 {
301  Q_ASSERT(m_txn);
302 
303  mdb_txn_abort(m_txn);
304  m_txn = nullptr;
305 
306  m_writeTrans.reset();
307 }
308 
309 //
310 // Queries
311 //
312 
313 PostingIterator* Transaction::postingIterator(const EngineQuery& query) const
314 {
315  PostingDB postingDb(m_dbis.postingDbi, m_txn);
316  PositionDB positionDb(m_dbis.positionDBi, m_txn);
317 
318  if (query.leaf()) {
319  if (query.op() == EngineQuery::Equal) {
320  return postingDb.iter(query.term());
321  } else if (query.op() == EngineQuery::StartsWith) {
322  return postingDb.prefixIter(query.term());
323  } else {
324  Q_ASSERT(0);
325  }
326  }
327 
328  const auto subQueries = query.subQueries();
329  if (subQueries.isEmpty()) {
330  return nullptr;
331  }
332 
333  Q_ASSERT(query.op() == EngineQuery::Phrase);
334  if (query.op() == EngineQuery::Phrase) {
335  if (subQueries.size() == 1) {
336  qCDebug(ENGINE) << "Degenerated Phrase with 1 Term:" << query;
337  return postingIterator(subQueries[0]);
338  }
340  vec.reserve(subQueries.size());
341  for (const EngineQuery& q : subQueries) {
342  if (!q.leaf()) {
343  qCDebug(ENGINE) << "Transaction::toPostingIterator" << "Phrase subqueries must be leafs";
344  continue;
345  }
346  auto termMatch = positionDb.iter(q.term());
347  if (!termMatch) {
348  return nullptr;
349  }
350  vec << termMatch;
351  }
352 
353  return new PhraseAndIterator(vec);
354  }
355 
356  return nullptr;
357 }
358 
359 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, qlonglong value, PostingDB::Comparator com) const
360 {
361  PostingDB postingDb(m_dbis.postingDbi, m_txn);
362  return postingDb.compIter(prefix, value, com);
363 }
364 
365 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, double value, PostingDB::Comparator com) const
366 {
367  PostingDB postingDb(m_dbis.postingDbi, m_txn);
368  return postingDb.compIter(prefix, value, com);
369 }
370 
371 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, const QByteArray& value, PostingDB::Comparator com) const
372 {
373  PostingDB postingDb(m_dbis.postingDbi, m_txn);
374  return postingDb.compIter(prefix, value, com);
375 }
376 
377 PostingIterator* Transaction::mTimeRangeIter(quint32 beginTime, quint32 endTime) const
378 {
379  MTimeDB mTimeDb(m_dbis.mtimeDbi, m_txn);
380  return mTimeDb.iterRange(beginTime, endTime);
381 }
382 
383 PostingIterator* Transaction::docUrlIter(quint64 id) const
384 {
385  DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
386  return docUrlDb.iter(id);
387 }
388 
389 //
390 // Introspection
391 //
392 
393 QVector<QByteArray> Transaction::documentTerms(quint64 docId) const
394 {
395  Q_ASSERT(docId);
396 
397  DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
398  return documentTermsDB.get(docId);
399 }
400 
401 QVector<QByteArray> Transaction::documentFileNameTerms(quint64 docId) const
402 {
403  Q_ASSERT(docId);
404 
405  DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
406  return documentFileNameTermsDB.get(docId);
407 }
408 
409 QVector<QByteArray> Transaction::documentXattrTerms(quint64 docId) const
410 {
411  Q_ASSERT(docId);
412 
413  DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
414  return documentXattrTermsDB.get(docId);
415 }
416 
417 //
418 // File Size
419 //
420 static size_t dbiSize(MDB_txn* txn, MDB_dbi dbi)
421 {
422  MDB_stat stat;
423  mdb_stat(txn, dbi, &stat);
424 
425  return (stat.ms_branch_pages + stat.ms_leaf_pages + stat.ms_overflow_pages) * stat.ms_psize;
426 }
427 
428 DatabaseSize Transaction::dbSize()
429 {
430  DatabaseSize dbSize;
431  dbSize.postingDb = dbiSize(m_txn, m_dbis.postingDbi);
432  dbSize.positionDb = dbiSize(m_txn, m_dbis.positionDBi);
433  dbSize.docTerms = dbiSize(m_txn, m_dbis.docTermsDbi);
434  dbSize.docFilenameTerms = dbiSize(m_txn, m_dbis.docFilenameTermsDbi);
435  dbSize.docXattrTerms = dbiSize(m_txn, m_dbis.docXattrTermsDbi);
436 
437  dbSize.idTree = dbiSize(m_txn, m_dbis.idTreeDbi);
438  dbSize.idFilename = dbiSize(m_txn, m_dbis.idFilenameDbi);
439 
440  dbSize.docTime = dbiSize(m_txn, m_dbis.docTimeDbi);
441  dbSize.docData = dbiSize(m_txn, m_dbis.docDataDbi);
442 
443  dbSize.contentIndexingIds = dbiSize(m_txn, m_dbis.contentIndexingDbi);
444  dbSize.failedIds = dbiSize(m_txn, m_dbis.failedIdDbi);
445 
446  dbSize.mtimeDb = dbiSize(m_txn, m_dbis.mtimeDbi);
447 
448  dbSize.expectedSize = dbSize.postingDb + dbSize.positionDb + dbSize.docTerms + dbSize.docFilenameTerms
449  + dbSize.docXattrTerms + dbSize.idTree + dbSize.idFilename + dbSize.docTime
450  + dbSize.docData + dbSize.contentIndexingIds + dbSize.failedIds + dbSize.mtimeDb;
451 
452  MDB_envinfo info;
453  mdb_env_info(m_env, &info);
454  dbSize.actualSize = info.me_last_pgno * 4096; // TODO: separate page size
455 
456  return dbSize;
457 }
458 
459 //
460 // Debugging
461 //
462 void Transaction::checkFsTree()
463 {
464  DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
465  DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
466  DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
467  DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
468  PostingDB postingDb(m_dbis.postingDbi, m_txn);
469 
470  const auto map = postingDb.toTestMap();
471 
472  QSet<quint64> allIds;
473  for (const auto& list : map) {
474  for (quint64 id : list) {
475  allIds << id;
476  }
477  }
478 
479  std::cout << "Total Document IDs: " << allIds.size() << std::endl;
480 
481  int count = 0;
482  for (quint64 id: std::as_const(allIds)) {
483  QByteArray url = docUrlDb.get(id);
484  if (url.isEmpty()) {
485  auto terms = documentTermsDB.get(id);
486  auto fileNameTerms = documentFileNameTermsDB.get(id);
487  auto xAttrTerms = documentXattrTermsDB.get(id);
488 
489  // Lets reverse engineer the terms
490  QList<QByteArray> newTerms;
492  while (it.hasNext()) {
493  it.next();
494  if (it.value().contains(id)) {
495  newTerms << it.key();
496  }
497  }
498 
499  std::cout << "Missing filePath for " << id << std::endl;
500  std::cout << "\tPostingDB Terms: ";
501  for (const QByteArray& term : std::as_const(newTerms)) {
502  std::cout << qPrintable(QString::fromUtf8(term)) << " ";
503  }
504  std::cout << std::endl;
505 
506  std::cout << "\tDocumentTermsDB: ";
507  for (const QByteArray& term : terms) {
508  std::cout << qPrintable(QString::fromUtf8(term)) << " ";
509  }
510  std::cout << std::endl;
511 
512  std::cout << "\tFileNameTermsDB: ";
513  for (const QByteArray& term : fileNameTerms) {
514  std::cout << qPrintable(QString::fromUtf8(term)) << " ";
515  }
516  std::cout << std::endl;
517 
518  std::cout << "\tXAttrTermsDB: ";
519  for (const QByteArray& term : xAttrTerms) {
520  std::cout << qPrintable(QString::fromUtf8(term)) << " ";
521  }
522  std::cout << std::endl;
523 
524  count++;
525  } else if (!QFileInfo::exists(QString::fromUtf8(url))) {
526  std::cout << "FilePath " << qPrintable(QString::fromUtf8(url)) << " for " << id << " does not exist"<< std::endl;
527  count++;
528  }
529  }
530 
531  std::cout << "Invalid Entries: " << count << " (" << count * 100.0 / allIds.size() << "%)" << std::endl;
532 }
533 
534 void Transaction::checkTermsDbinPostingDb()
535 {
536  DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
537  DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
538  DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
539  PostingDB postingDb(m_dbis.postingDbi, m_txn);
540 
541  // Iterate over each document, and fetch all terms
542  // check if each term maps to its own id in the posting db
543 
544  const auto map = postingDb.toTestMap();
545 
546  QSet<quint64> allIds;
547  for (const auto& list : map) {
548  for (quint64 id : list) {
549  allIds << id;
550  }
551  }
552 
553  std::cout << "PostingDB check .." << std::endl;
554  for (quint64 id : std::as_const(allIds)) {
555  QVector<QByteArray> terms = documentTermsDB.get(id);
556  terms += documentXattrTermsDB.get(id);
557  terms += documentFileNameTermsDB.get(id);
558 
559  for (const QByteArray& term : std::as_const(terms)) {
560  PostingList plist = postingDb.get(term);
561  if (!plist.contains(id)) {
562  std::cout << id << " is missing term " << qPrintable(QString::fromUtf8(term)) << std::endl;
563  }
564  }
565  }
566 }
567 
568 void Transaction::checkPostingDbinTermsDb()
569 {
570  DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
571  DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
572  DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
573  PostingDB postingDb(m_dbis.postingDbi, m_txn);
574 
575  QMap<QByteArray, PostingList> map = postingDb.toTestMap();
577 
578  std::cout << "DocumentTermsDB check .." << std::endl;
579  while (it.hasNext()) {
580  it.next();
581 
582  const QByteArray& term = it.key();
583  const PostingList& list = it.value();
584  for (quint64 id : list) {
585  if (documentTermsDB.get(id).contains(term)) {
586  continue;
587  }
588  if (documentFileNameTermsDB.get(id).contains(term)) {
589  continue;
590  }
591  if (documentXattrTermsDB.get(id).contains(term)) {
592  continue;
593  }
594  std::cout << id << " is missing " << qPrintable(QString::fromUtf8(term)) << " from document terms db" << std::endl;
595  }
596  }
597 }
std::optional< QSqlQuery > query(const QString &queryStatement)
QString fromUtf8(const char *str, int size)
Type type(const QSqlDatabase &db)
QStringList split(const QString &sep, QString::SplitBehavior behavior, Qt::CaseSensitivity cs) const const
KIOFILEWIDGETS_EXPORT QStringList list(const QString &fileClass)
bool exists() const const
int size() const const
A PostingIterator is an abstract base class which can be used to iterate over all the "postings" or "...
The MTime DB maps the file mtime to its id.
Definition: mtimedb.h:24
Implements storage for docIds without any associated data Instantiated for:
Definition: coding.cpp:11
bool isEmpty() const const
void reserve(int size)
Implements storage for a set of s for the given docId Instantiated for:
Definition: documentdb.h:25
bool isEmpty() const const
QString path(const QString &relativePath)
void init(KXmlGuiWindow *window, KGameDifficulty *difficulty=nullptr)
Q_SCRIPTABLE Q_NOREPLY void abort()
The PostingDB is the main database that maps <term> -> <id1> <id2> <id2> ...
Definition: postingdb.h:27
int size() const const
QFuture< void > map(Sequence &sequence, MapFunctor function)
A document represents an indexed file to be stored in the Baloo engine.
Definition: document.h:30
T value(int i) const const
int stat(const QString &path, KDE_struct_stat *buf)
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Wed Nov 29 2023 03:56:26 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.