Baloo

termgenerator.cpp
1/*
2 This file is part of the KDE Baloo project.
3 SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <vhanda@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-or-later
6*/
7
8#include "termgenerator.h"
9
10#include <QTextBoundaryFinder>
11
12using namespace Baloo;
13
14namespace {
15
16QString normalizeTerm(const QString &str)
17{
18 // Remove all accents. It is important to call toLower after normalization,
19 // since some exotic unicode symbols can remain uppercase
20 const QString denormalized = str.normalized(QString::NormalizationForm_KD).toLower();
21
22 QString cleanString;
23 cleanString.reserve(denormalized.size());
24 for (const auto& c : denormalized) {
25 if (!c.isMark()) {
26 cleanString.append(c);
27 }
28 }
29
30 return cleanString.normalized(QString::NormalizationForm_KC);
31}
32
33void appendTerm(QByteArrayList &list, const QString &term)
34{
35 if (!term.isEmpty()) {
36 // Truncate the string to avoid arbitrarily long terms
37 list << QStringView(term).left(TermGenerator::maxTermSize).toUtf8();
38 }
39}
40
41}
42
43TermGenerator::TermGenerator(Document& doc)
44 : m_doc(doc)
45 , m_position(1)
46{
47}
48
49void TermGenerator::indexText(const QString& text)
50{
51 indexText(text, QByteArray());
52}
53
54QByteArrayList TermGenerator::termList(const QString& text_)
55{
56 QString text(text_);
57 text.replace(QLatin1Char('_'), QLatin1Char(' '));
58
59 int start = 0;
60
61 auto isSkipChar = [] (const QChar& c) {
62 return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate());
63 };
64
67 for (; bf.position() != -1; bf.toNextBoundary()) {
68 int end = bf.position();
69 while (start < end && isSkipChar(text[start])) {
70 start++;
71 }
72 if (end == start) {
73 continue;
74 }
75
76 // Typically we commit a term when we have an EndOfItem, starting
77 // from the last StartOfItem, everything between last EndOfItem and
78 // StartOfItem is just whitespace and punctuation. Unfortunately,
79 // most CJK characters do not trigger a StartOfItem and thus no
80 // EndOfItem, so everything in front of a StartOfItem has to be
81 // committed as well
82 bool commit = bf.boundaryReasons() & (QTextBoundaryFinder::EndOfItem | QTextBoundaryFinder::StartOfItem);
83
84 // Also commit term if end-of-text is reached or when we find
85 // any punctuation
86 if (!commit & (end == text.size() || isSkipChar(text[end]))) {
87 commit = true;
88 }
89
90 if (commit) {
91 const QString term = normalizeTerm(text.mid(start, end - start));
92 appendTerm(list, term);
93 start = end;
94 }
95 }
96 return list;
97}
98
99void TermGenerator::indexText(const QString& text, const QByteArray& prefix)
100{
101 const QByteArrayList terms = termList(text);
102 if (terms.size() == 1) {
103 QByteArray finalArr = prefix + terms[0];
104 m_doc.addTerm(finalArr);
105 return;
106 }
107 for (const QByteArray& term : terms) {
108 QByteArray finalArr = prefix + term;
109
110 m_doc.addPositionTerm(finalArr, m_position);
111 m_position++;
112 }
113 m_position++;
114}
115
116void TermGenerator::indexFileNameText(const QString& text)
117{
118 const QByteArray prefix = QByteArrayLiteral("F");
119 const QByteArrayList terms = termList(text);
120 if (terms.size() == 1) {
121 QByteArray finalArr = prefix + terms[0];
122 m_doc.addFileNameTerm(finalArr);
123 return;
124 }
125 for (const QByteArray& term : terms) {
126 QByteArray finalArr = prefix + term;
127
128 m_doc.addFileNamePositionTerm(finalArr, m_position);
129 m_position++;
130 }
131 m_position++;
132}
133
134void TermGenerator::indexXattrText(const QString& text, const QByteArray& prefix)
135{
136 const QByteArrayList terms = termList(text);
137 if (terms.size() == 1) {
138 QByteArray finalArr = prefix + terms[0];
139 m_doc.addXattrTerm(finalArr);
140 return;
141 }
142 for (const QByteArray& term : terms) {
143 QByteArray finalArr = prefix + term;
144
145 m_doc.addXattrPositionTerm(finalArr, m_position);
146 m_position++;
147 }
148 m_position++;
149}
150
151int TermGenerator::position() const
152{
153 return m_position;
154}
155
156void TermGenerator::setPosition(int position)
157{
158 m_position = position;
159}
Q_SCRIPTABLE Q_NOREPLY void start()
Implements storage for docIds without any associated data Instantiated for:
Definition coding.cpp:11
KIOCORE_EXPORT QStringList list(const QString &fileClass)
const QList< QKeySequence > & end()
qsizetype size() const const
NormalizationForm_KD
QString & append(QChar ch)
bool isEmpty() const const
QString mid(qsizetype position, qsizetype n) const const
QString normalized(NormalizationForm mode, QChar::UnicodeVersion version) const const
QString & replace(QChar before, QChar after, Qt::CaseSensitivity cs)
void reserve(qsizetype size)
qsizetype size() const const
QString toLower() const const
QStringView left(qsizetype length) const const
QByteArray toUtf8() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:51:40 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.