Baloo

termgenerator.cpp
1 /*
2  This file is part of the KDE Baloo project.
3  SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <[email protected]>
4 
5  SPDX-License-Identifier: LGPL-2.1-or-later
6 */
7 
8 #include "termgenerator.h"
9 
10 #include <QTextBoundaryFinder>
11 
12 using namespace Baloo;
13 
14 namespace {
15 
16 QString normalizeTerm(const QString &str)
17 {
18  // Remove all accents. It is important to call toLower after normalization,
19  // since some exotic unicode symbols can remain uppercase
20  const QString denormalized = str.normalized(QString::NormalizationForm_KD).toLower();
21 
22  QString cleanString;
23  cleanString.reserve(denormalized.size());
24  for (const auto& c : denormalized) {
25  if (!c.isMark()) {
26  cleanString.append(c);
27  }
28  }
29 
30  return cleanString.normalized(QString::NormalizationForm_KC);
31 }
32 
33 void appendTerm(QByteArrayList &list, const QString &term)
34 {
35  if (!term.isEmpty()) {
36  // Truncate the string to avoid arbitrarily long terms
37  list << QStringView(term).left(TermGenerator::maxTermSize).toUtf8();
38  }
39 }
40 
41 }
42 
43 TermGenerator::TermGenerator(Document& doc)
44  : m_doc(doc)
45  , m_position(1)
46 {
47 }
48 
49 void TermGenerator::indexText(const QString& text)
50 {
51  indexText(text, QByteArray());
52 }
53 
54 QByteArrayList TermGenerator::termList(const QString& text_)
55 {
56  QString text(text_);
57  text.replace(QLatin1Char('_'), QLatin1Char(' '));
58 
59  int start = 0;
60 
61  auto isSkipChar = [] (const QChar& c) {
62  return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate());
63  };
64 
67  for (; bf.position() != -1; bf.toNextBoundary()) {
68  int end = bf.position();
69  while (start < end && isSkipChar(text[start])) {
70  start++;
71  }
72  if (end == start) {
73  continue;
74  }
75 
76  // Typically we commit a term when we have an EndOfItem, starting
77  // from the last StartOfItem, everything between last EndOfItem and
78  // StartOfItem is just whitespace and punctuation. Unfortunately,
79  // most CJK characters do not trigger a StartOfItem and thus no
80  // EndOfItem, so everything in front of a StartOfItem has to be
81  // committed as well
82  bool commit = bf.boundaryReasons() & (QTextBoundaryFinder::EndOfItem | QTextBoundaryFinder::StartOfItem);
83 
84  // Also commit term if end-of-text is reached or when we find
85  // any punctuation
86  if (!commit & (end == text.size() || isSkipChar(text[end]))) {
87  commit = true;
88  }
89 
90  if (commit) {
91  const QString term = normalizeTerm(text.mid(start, end - start));
92  appendTerm(list, term);
93  start = end;
94  }
95  }
96  return list;
97 }
98 
99 void TermGenerator::indexText(const QString& text, const QByteArray& prefix)
100 {
101  const QByteArrayList terms = termList(text);
102  if (terms.size() == 1) {
103  QByteArray finalArr = prefix + terms[0];
104  m_doc.addTerm(finalArr);
105  return;
106  }
107  for (const QByteArray& term : terms) {
108  QByteArray finalArr = prefix + term;
109 
110  m_doc.addPositionTerm(finalArr, m_position);
111  m_position++;
112  }
113  m_position++;
114 }
115 
116 void TermGenerator::indexFileNameText(const QString& text)
117 {
118  const QByteArray prefix = QByteArrayLiteral("F");
119  const QByteArrayList terms = termList(text);
120  if (terms.size() == 1) {
121  QByteArray finalArr = prefix + terms[0];
122  m_doc.addFileNameTerm(finalArr);
123  return;
124  }
125  for (const QByteArray& term : terms) {
126  QByteArray finalArr = prefix + term;
127 
128  m_doc.addFileNamePositionTerm(finalArr, m_position);
129  m_position++;
130  }
131  m_position++;
132 }
133 
134 void TermGenerator::indexXattrText(const QString& text, const QByteArray& prefix)
135 {
136  const QByteArrayList terms = termList(text);
137  if (terms.size() == 1) {
138  QByteArray finalArr = prefix + terms[0];
139  m_doc.addXattrTerm(finalArr);
140  return;
141  }
142  for (const QByteArray& term : terms) {
143  QByteArray finalArr = prefix + term;
144 
145  m_doc.addXattrPositionTerm(finalArr, m_position);
146  m_position++;
147  }
148  m_position++;
149 }
150 
151 int TermGenerator::position() const
152 {
153  return m_position;
154 }
155 
156 void TermGenerator::setPosition(int position)
157 {
158  m_position = position;
159 }
NormalizationForm_KD
int size() const
Q_SCRIPTABLE Q_NOREPLY void start()
void reserve(int size)
KIOFILEWIDGETS_EXPORT QStringList list(const QString &fileClass)
QStringView left(qsizetype length) const
QByteArray toUtf8() const
QString normalized(QString::NormalizationForm mode, QChar::UnicodeVersion version) const
int size() const
Implements storage for docIds without any associated data Instantiated for:
Definition: coding.cpp:11
bool isEmpty() const
QString & replace(int position, int n, QChar after)
QString toLower() const
QString mid(int position, int n) const
A document represents an indexed file to be stored in the Baloo engine.
Definition: document.h:30
const QList< QKeySequence > & end()
QString & append(QChar ch)
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Wed Nov 29 2023 03:56:26 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.