Sonnet

tokenizer.cpp
1 /* This file is part of the KDE libraries
2 
3  SPDX-FileCopyrightText: 2004 Zack Rusin <[email protected]>
4  SPDX-FileCopyrightText: 2006 Jacob R Rideout <[email protected]>
5  SPDX-FileCopyrightText: 2009 Jakub Stachowski <[email protected]>
6 
7  SPDX-License-Identifier: LGPL-2.0-or-later
8 */
9 
10 #include <QList>
11 #include <QString>
12 
13 #include "textbreaks_p.h"
14 #include "tokenizer_p.h"
15 
16 namespace Sonnet
17 {
18 class BreakTokenizerPrivate
19 {
20 public:
21  enum Type {
22  Words,
23  Sentences,
24  };
25 
26  BreakTokenizerPrivate(Type s)
27  : breakFinder(new TextBreaks)
28  , itemPosition(-1)
29  , cacheValid(false)
30  , type(s)
31  {
32  }
33 
34  ~BreakTokenizerPrivate()
35  {
36  delete breakFinder;
37  }
38 
39  TextBreaks::Positions breaks() const;
40  void invalidate();
41  void shiftBreaks(int from, int offset);
42  void replace(int pos, int len, const QString &newWord);
43 
44  TextBreaks *const breakFinder;
45  QString buffer;
46 
47  int itemPosition = -1;
48  mutable bool cacheValid;
49  Token last;
50  const Type type;
51  bool inAddress = false;
52  bool ignoreUppercase = false;
53 
54  bool hasNext() const;
55  Token next();
56  void setBuffer(const QString &b)
57  {
58  invalidate();
59  buffer = b;
60  }
61 
62 private:
63  void regenerateCache() const;
64  mutable TextBreaks::Positions cachedBreaks;
65 };
66 
67 void BreakTokenizerPrivate::invalidate()
68 {
69  cacheValid = false;
70  itemPosition = -1;
71 }
72 
73 bool BreakTokenizerPrivate::hasNext() const
74 {
75  if (itemPosition >= (breaks().size() - 1)) {
76  return false;
77  }
78 
79  return true;
80 }
81 
82 TextBreaks::Positions BreakTokenizerPrivate::breaks() const
83 {
84  if (!cacheValid) {
85  regenerateCache();
86  }
87 
88  return cachedBreaks;
89 }
90 
91 void BreakTokenizerPrivate::shiftBreaks(int from, int offset)
92 {
93  for (int i = 0; i < cachedBreaks.size(); i++) {
94  if (cachedBreaks[i].start > from) {
95  cachedBreaks[i].start = cachedBreaks[i].start - offset;
96  }
97  }
98 }
99 
100 void BreakTokenizerPrivate::regenerateCache() const
101 {
102  if (!breakFinder || buffer.isEmpty()) {
103  cachedBreaks = TextBreaks::Positions();
104  }
105 
106  if (breakFinder) {
107  breakFinder->setText(buffer);
108 
109  if (type == Sentences) {
110  cachedBreaks = breakFinder->sentenceBreaks();
111  } else if (type == Words) {
112  cachedBreaks = breakFinder->wordBreaks();
113  }
114  }
115 
116  cacheValid = true;
117 }
118 
119 Token BreakTokenizerPrivate::next()
120 {
121  Token block;
122 
123  if (!hasNext()) {
124  last = block;
125  return block;
126  }
127 
128  itemPosition++;
129 
130  const TextBreaks::Position &textBreak = this->breaks().at(itemPosition);
131  QStringView token = QStringView(buffer).mid(textBreak.start, textBreak.length);
132  last = {token, textBreak.start};
133  return last;
134 }
135 
136 void BreakTokenizerPrivate::replace(int pos, int len, const QString &newWord)
137 {
138  buffer.replace(pos, len, newWord);
139  int offset = len - newWord.length();
140  if (cacheValid) {
141  shiftBreaks(pos, offset);
142  }
143 }
144 
145 /*-----------------------------------------------------------*/
146 
147 WordTokenizer::WordTokenizer(const QString &buffer)
148  : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Words))
149 {
150  setBuffer(buffer);
151 }
152 
153 WordTokenizer::~WordTokenizer()
154 {
155  delete d;
156 }
157 
158 bool WordTokenizer::hasNext() const
159 {
160  return d->hasNext();
161 }
162 
163 void WordTokenizer::setBuffer(const QString &buffer)
164 {
165  d->setBuffer(buffer);
166 }
167 
168 Token WordTokenizer::next()
169 {
170  Token n = d->next();
171 
172  // end of address of url?
173  if (d->inAddress && n.position() > 0 && d->buffer[n.position() - 1].isSpace()) {
174  d->inAddress = false;
175  }
176 
177  // check if this word starts an email address of url
178  if (!d->inAddress || hasNext()) {
179  const int pos = n.position() + n.length();
180  if ((pos < d->buffer.length()) && d->buffer[pos] == QLatin1Char('@')) {
181  d->inAddress = true;
182  }
183  if ((pos + 2 < d->buffer.length()) && d->buffer[pos] == QLatin1Char(':') && d->buffer[pos + 1] == QLatin1Char('/')
184  && d->buffer[pos + 2] == QLatin1Char('/')) {
185  d->inAddress = true;
186  }
187  }
188  return n;
189 }
190 
191 QString WordTokenizer::buffer() const
192 {
193  return d->buffer;
194 }
195 
196 bool WordTokenizer::isUppercase(QStringView word) const
197 {
198  for (int i = 0; i < word.length(); ++i) {
199  if (word.at(i).isLetter() && !word.at(i).isUpper()) {
200  return false;
201  }
202  }
203  return true;
204 }
205 
206 void WordTokenizer::setIgnoreUppercase(bool val)
207 {
208  d->ignoreUppercase = val;
209 }
210 
211 void WordTokenizer::replace(int pos, int len, const QString &newWord)
212 {
213  d->replace(pos, len, newWord);
214 }
215 
216 bool WordTokenizer::isSpellcheckable() const
217 {
218  if (d->last.isNull() || d->last.isEmpty()) {
219  return false;
220  }
221  if (!d->last.at(0).isLetter()) {
222  return false;
223  }
224  if (d->inAddress) {
225  return false;
226  }
227  if (d->ignoreUppercase && isUppercase(d->last.token)) {
228  return false;
229  }
230  return true;
231 }
232 
233 /* --------------------------------------------------------------------*/
234 
235 SentenceTokenizer::SentenceTokenizer(const QString &buffer)
236  : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Sentences))
237 {
238  setBuffer(buffer);
239 }
240 
241 SentenceTokenizer::~SentenceTokenizer()
242 {
243  delete d;
244 }
245 
246 bool SentenceTokenizer::hasNext() const
247 {
248  return d->hasNext();
249 }
250 
251 void SentenceTokenizer::setBuffer(const QString &buffer)
252 {
253  d->setBuffer(buffer);
254 }
255 
256 Token SentenceTokenizer::next()
257 {
258  return d->next();
259 }
260 
261 QString SentenceTokenizer::buffer() const
262 {
263  return d->buffer;
264 }
265 
266 void SentenceTokenizer::replace(int pos, int len, const QString &newWord)
267 {
268  d->replace(pos, len, newWord);
269 }
270 }
QChar at(qsizetype n) const const
Type type(const QSqlDatabase &db)
QStringView mid(qsizetype start) const const
bool isLetter() const const
Q_SCRIPTABLE Q_NOREPLY void start()
int length() const const
bool isUpper() const const
QAction * replace(const QObject *recvr, const char *slot, QObject *parent)
QString & replace(int position, int n, QChar after)
The sonnet namespace.
int length() const const
QAction * next(const QObject *recvr, const char *slot, QObject *parent)
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Sat Sep 23 2023 03:58:23 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.