KHtml

khtml_filter.cpp
1 /* This file is part of the KDE project
2 
3  Copyright (C) 2005 Ivor Hewitt <[email protected]>
4  Copyright (C) 2008 Maksim Orlovich <[email protected]>
5  Copyright (C) 2008 Vyacheslav Tokarev <[email protected]>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Library General Public
9  License as published by the Free Software Foundation; either
10  version 2 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Library General Public License for more details.
16 
17  You should have received a copy of the GNU Library General Public License
18  along with this library; see the file COPYING.LIB. If not, write to
19  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20  Boston, MA 02110-1301, USA.
21 */
22 
23 #include "khtml_filter_p.h"
24 #include "khtml_debug.h"
25 
26 // rolling hash parameters
27 #define HASH_P (1997)
28 #define HASH_Q (17509)
29 // HASH_MOD = (HASH_P^7) % HASH_Q
30 #define HASH_MOD (523)
31 
32 namespace khtml
33 {
34 
35 // We only want a subset of features of wildcards -- just the
36 // star, so we escape the rest before passing to QRegExp.
37 // The \ is escaped due to a QRegExp bug.
38 // ### we really should rather parse it ourselves, in order to
39 // handle adblock-special things like | and ^ properly.
40 static QRegExp fromAdBlockWildcard(const QString &wcStr)
41 {
42  QRegExp rx;
44 
45  QString out;
46  for (int p = 0; p < wcStr.length(); ++p) {
47  QChar c = wcStr[p];
48  if (c == QLatin1Char('?')) {
49  out += QLatin1String("[?]");
50  } else if (c == QLatin1Char('[')) {
51  out += QLatin1String("[[]");
52  } else if (c == QLatin1Char('\\')) {
53  out += QLatin1String("[\\]");
54  } else {
55  out += c;
56  }
57  }
58 
59  rx.setPattern(out);
60  return rx;
61 }
62 
63 void FilterSet::addFilter(const QString &filterStr)
64 {
65  QString filter = filterStr.trimmed();
66  if (filter.isEmpty()) {
67  return;
68  }
69 
70  /** ignore special lines starting with "[", "!", or "#" or contain "#" (comments or features are not supported by KHTML's AdBlock */
71  const QChar firstChar = filter.at(0);
72  if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#'))) {
73  return;
74  }
75 
76  // Strip leading @@
77  if (filter.startsWith(QLatin1String("@@"))) {
78  filter.remove(0, 2);
79  }
80 
81  // Strip options, we ignore them for now.
82  bool hadOptions = false;
83  int dollar = filter.lastIndexOf(QLatin1Char('$'));
84  if (dollar != -1) {
85  // Is it adblock's options delimiter or the special '$' char in a regular expression?
86  if (!filter.startsWith(QLatin1Char('/')) || !filter.endsWith(QLatin1Char('/'))) {
87  filter = filter.mid(0, dollar);
88  hadOptions = true;
89  }
90  }
91 
92  // Is it a regexp filter?
93  if (filter.length() > 2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) {
94  // Ignore regexp that had options as we do not support them
95  if (hadOptions) {
96  return;
97  }
98  QString inside = filter.mid(1, filter.length() - 2);
99  QRegExp rx(inside);
100  reFilters.append(rx);
101  //qCDebug(KHTML_LOG) << "R:" << inside;
102  return;
103  }
104  // Nope, a wildcard one.
105 
106  // Disregard the rule if only one char is left or it contains unsupported adblock features ('|', "||", '^')
107  if (filter.length() < 2 || filter.contains(QLatin1Char('|')) || filter.contains(QLatin1Char('^'))) {
108  return;
109  }
110 
111  // Strip wildcards at the ends
112  int first = 0;
113  int last = filter.length() - 1;
114 
115  while (first < filter.length() && filter.at(first) == QLatin1Char('*')) {
116  ++first;
117  }
118 
119  while (last >= 0 && filter.at(last) == QLatin1Char('*')) {
120  --last;
121  }
122 
123  if (first > last) {
124  filter = QLatin1String("*"); // erm... Well, they asked for it.
125  } else {
126  filter = filter.mid(first, last - first + 1);
127  }
128 
129  // Now, do we still have any wildcard stuff left?
130  int aPos = filter.indexOf('*');
131  if (aPos != -1) {
132  // check if we can use RK first (and then check full RE for the rest) for better performance
133  if (aPos > 7) {
134  QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*'));
135  // We pad the final r.e. with * so we can check for an exact match
136  stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx);
137  } else {
138  QRegExp rx = fromAdBlockWildcard(filter);
139  reFilters.append(rx);
140  }
141  } else {
142  // Fast path
143  stringFiltersMatcher.addString(filter);
144  }
145 
146 }
147 
148 bool FilterSet::isUrlMatched(const QString &url)
149 {
150  if (stringFiltersMatcher.isMatched(url)) {
151  return true;
152  }
153 
154  for (int c = 0; c < reFilters.size(); ++c) {
155  if (url.contains(reFilters[c])) {
156  return true;
157  }
158  }
159 
160  return false;
161 }
162 
163 QString FilterSet::urlMatchedBy(const QString &url)
164 {
165  QString by;
166 
167  if (stringFiltersMatcher.isMatched(url, &by)) {
168  return by;
169  }
170 
171  for (int c = 0; c < reFilters.size(); ++c) {
172  if (url.contains(reFilters[c])) {
173  by = reFilters[c].pattern();
174  break;
175  }
176  }
177 
178  return by;
179 }
180 
181 void FilterSet::clear()
182 {
183  reFilters.clear();
184  stringFiltersMatcher.clear();
185 }
186 
187 void StringsMatcher::addString(const QString &pattern)
188 {
189  if (pattern.length() < 8) {
190  // handle short string differently
191  shortStringFilters.append(pattern);
192  } else {
193  // use modified Rabin-Karp's algorithm with 8-length string hash
194  // i.e. store hash of first 8 chars in the HashMap for fast look-up
195  stringFilters.append(pattern);
196  int ind = stringFilters.size() - 1;
197  int current = 0;
198 
199  // compute hash using rolling hash
200  // hash for string: x0,x1,x2...xn-1 will be:
201  // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q
202  // where p and q some wisely-chosen integers
203  /*for (int k = 0; k < 8; ++k)*/
204  int len = pattern.length();
205  for (int k = len - 8; k < len; ++k) {
206  current = (current * HASH_P + pattern[k].unicode()) % HASH_Q;
207  }
208 
209  // insert computed hash value into HashMap
210  WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
211  if (it == stringFiltersHash.end()) {
213  list.append(ind);
214  stringFiltersHash.add(current + 1, list);
215  fastLookUp.setBit(current);
216  } else {
217  it->second.append(ind);
218  }
219  }
220 }
221 
222 void StringsMatcher::addWildedString(const QString &prefix, const QRegExp &rx)
223 {
224  rePrefixes.append(prefix);
225  reFilters.append(rx);
226  int index = -rePrefixes.size();
227 
228  int current = 0;
229  for (int k = 0; k < 8; ++k) {
230  current = (current * HASH_P + prefix[k].unicode()) % HASH_Q;
231  }
232 
233  // insert computed hash value into HashMap
234  WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
235  if (it == stringFiltersHash.end()) {
236  QVector<int> list;
237  list.append(index);
238  stringFiltersHash.add(current + 1, list);
239  fastLookUp.setBit(current);
240  } else {
241  it->second.append(index);
242  }
243 }
244 
245 bool StringsMatcher::isMatched(const QString &str, QString *by) const
246 {
247  // check short strings first
248  for (int i = 0; i < shortStringFilters.size(); ++i) {
249  if (str.contains(shortStringFilters[i])) {
250  if (by != nullptr) {
251  *by = shortStringFilters[i];
252  }
253  return true;
254  }
255  }
256 
257  int len = str.length();
258  int k;
259 
260  int current = 0;
261  int next = 0;
262  // compute hash for first 8 characters
263  for (k = 0; k < 8 && k < len; ++k) {
264  current = (current * HASH_P + str[k].unicode()) % HASH_Q;
265  }
266 
267  WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end();
268  // main Rabin-Karp's algorithm loop
269  for (k = 7; k < len; ++k, current = next) {
270  // roll the hash if not at the end
271  // (calculate hash for the next iteration)
272  if (k + 1 < len) {
273  next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q;
274  }
275 
276  if (!fastLookUp.testBit(current)) {
277  continue;
278  }
279 
280  // look-up the hash in the HashMap and check all strings
281  WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1);
282 
283  // check possible strings
284  if (it != hashEnd) {
285  for (int j = 0; j < it->second.size(); ++j) {
286  int index = it->second[j];
287  // check if we got simple string or REs prefix
288  if (index >= 0) {
289  int flen = stringFilters[index].length();
290  if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1, flen)) {
291  if (by != nullptr) {
292  *by = stringFilters[index];
293  }
294  return true;
295  }
296  } else {
297  index = -index - 1;
298  int flen = rePrefixes[index].length();
299  if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen)) {
300  int remStart = k - 7 + flen;
301  QString remainder = QString::fromRawData(str.unicode() + remStart,
302  str.length() - remStart);
303  if (reFilters[index].exactMatch(remainder)) {
304  if (by != nullptr) {
305  *by = rePrefixes[index] + reFilters[index].pattern();
306  }
307  return true;
308  }
309  }
310  }
311  }
312  }
313  }
314 
315  return false;
316 }
317 
318 void StringsMatcher::clear()
319 {
320  stringFilters.clear();
321  shortStringFilters.clear();
322  reFilters.clear();
323  rePrefixes.clear();
324  stringFiltersHash.clear();
325  fastLookUp.resize(HASH_Q);
326  fastLookUp.fill(0, 0, HASH_Q);
327 }
328 
329 }
330 
void setPatternSyntax(QRegExp::PatternSyntax syntax)
int indexOf(QChar ch, int from, Qt::CaseSensitivity cs) const const
void append(const T &value)
This file is part of the HTML rendering engine for KDE.
MESSAGECORE_EXPORT KMime::Content * next(KMime::Content *node, bool allowChildren=true)
QString & remove(int position, int n)
int lastIndexOf(QChar ch, int from, Qt::CaseSensitivity cs) const const
void clear()
void setPattern(const QString &pattern)
QString fromRawData(const QChar *unicode, int size)
bool isEmpty() const const
QString trimmed() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QFuture< void > filter(Sequence &sequence, KeepFunctor filterFunction)
bool contains(QChar ch, Qt::CaseSensitivity cs) const const
QStringRef midRef(int position, int n) const const
const QChar * unicode() const const
QString mid(int position, int n) const const
const QChar at(int position) const const
int length() const const
KIOFILEWIDGETS_EXPORT QStringList list(const QString &fileClass)
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Sat Oct 16 2021 22:47:56 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.