KHtml

xpath/tokenizer.cpp
1 /*
2  * tokenizer.cc - Copyright 2005 Maksim Orlovich <[email protected]>
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "tokenizer.h"
26 
27 #include "xml/dom_stringimpl.h"
28 #include "xml/dom3_xpathimpl.h"
29 #include "dom/dom3_xpath.h"
30 
31 #include <cstdio>
32 
33 using namespace std;
34 
35 using namespace DOM;
36 using namespace DOM::XPath;
37 using namespace khtml;
38 using namespace khtml::XPath;
39 
40 namespace khtml
41 {
42 namespace XPath
43 {
44 
45 struct AxisNameMapping {
46  const char *name;
47  Step::AxisType type;
48 };
49 
50 static AxisNameMapping axisNames[] = {
51  { "ancestor", Step::AncestorAxis },
52  { "ancestor-or-self", Step::AncestorOrSelfAxis },
53  { "attribute", Step::AttributeAxis },
54  { "child", Step::ChildAxis },
55  { "descendant", Step::DescendantAxis },
56  { "descendant-or-self", Step::DescendantOrSelfAxis },
57  { "following", Step::FollowingAxis },
58  { "following-sibling", Step::FollowingSiblingAxis },
59  { "namespace", Step::NamespaceAxis },
60  { "parent", Step::ParentAxis },
61  { "preceding", Step::PrecedingAxis },
62  { "preceding-sibling", Step::PrecedingSiblingAxis },
63  { "self", Step::SelfAxis }
64 };
65 static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]);
66 
// Names that denote a node type test when followed by '('.
// The list is terminated by a null pointer rather than carrying a count.
static const char *const nodeTypeNames[] = {
    "comment", "text", "processing-instruction", "node",
    nullptr // sentinel
};
74 
75 QHash<QString, Step::AxisType> *Tokenizer::s_axisNamesDict = nullptr;
76 QSet<QString> *Tokenizer::s_nodeTypeNamesDict = nullptr;
77 
78 Tokenizer &Tokenizer::self()
79 {
80  static Tokenizer instance;
81  return instance;
82 }
83 
84 Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
85 {
86  //### might need to add some special cases from the XML spec.
87 
88  if (aChar.unicode() == '_') {
89  return NameStart;
90  }
91 
92  if (aChar.unicode() == '.' || aChar.unicode() == '-') {
93  return NameCont;
94  }
95 
96  switch (aChar.category()) {
97  case QChar::Letter_Lowercase: //Ll
98  case QChar::Letter_Uppercase: //Lu
99  case QChar::Letter_Other: //Lo
100  case QChar::Letter_Titlecase: //Lt
101  case QChar::Number_Letter: //Nl
102  return NameStart;
103 
105  case QChar::Mark_Enclosing: //Me
106  case QChar::Mark_NonSpacing: //Mn
107  case QChar::Letter_Modifier: //Lm
108  case QChar::Number_DecimalDigit: //Nd
109  return NameCont;
110 
111  default:
112  return NotPartOfName;
113  }
114 }
115 
116 bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
117 {
118  if (!s_axisNamesDict) {
119  s_axisNamesDict = new QHash<QString, Step::AxisType>;
120  for (unsigned int p = 0; p < axisNamesCount; ++p)
121  s_axisNamesDict->insert(QLatin1String(axisNames[p].name),
122  axisNames[p].type);
123  }
124 
125  QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name);
126  if (it != s_axisNamesDict->constEnd()) {
127  *type = *it;
128  }
129  return it != s_axisNamesDict->constEnd();
130 }
131 
132 bool Tokenizer::isNodeTypeName(QString name)
133 {
134  if (!s_nodeTypeNamesDict) {
135  s_nodeTypeNamesDict = new QSet<QString>;
136  for (int p = 0; nodeTypeNames[p]; ++p) {
137  s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p]));
138  }
139  }
140  return s_nodeTypeNamesDict->contains(name);
141 }
142 
143 /* Returns whether the last parsed token matches the [32] Operator rule
144  * (check https://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
145  * the tokens.
146  */
147 bool Tokenizer::isOperatorContext()
148 {
149  if (m_nextPos == 0) {
150  return false;
151  }
152 
153  switch (m_lastTokenType) {
154  case AND: case OR: case MULOP:
155  case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
156  case EQOP: case RELOP:
157  case '@': case AXISNAME: case '(': case '[':
158  return false;
159  default:
160  return true;
161  }
162 }
163 
164 void Tokenizer::skipWS()
165 {
166  while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace()) {
167  ++m_nextPos;
168  }
169 }
170 
171 Token Tokenizer::makeTokenAndAdvance(int code, int advance)
172 {
173  m_nextPos += advance;
174  return Token(code);
175 }
176 
177 Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
178 {
179  m_nextPos += advance;
180  return Token(code, val);
181 }
182 
183 //Returns next char if it's there and interesting, 0 otherwise
184 char Tokenizer::peekAheadHelper()
185 {
186  if (m_nextPos + 1 >= m_data.length()) {
187  return 0;
188  }
189  QChar next = m_data[m_nextPos + 1];
190  if (next.row() != 0) {
191  return 0;
192  } else {
193  return next.cell();
194  }
195 }
196 
197 char Tokenizer::peekCurHelper()
198 {
199  if (m_nextPos >= m_data.length()) {
200  return 0;
201  }
202  QChar next = m_data[m_nextPos];
203  if (next.row() != 0) {
204  return 0;
205  } else {
206  return next.cell();
207  }
208 }
209 
210 Token Tokenizer::lexString()
211 {
212  QChar delimiter = m_data[m_nextPos];
213  int startPos = m_nextPos + 1;
214 
215  for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
216  if (m_data[m_nextPos] == delimiter) {
217  QString value = m_data.mid(startPos, m_nextPos - startPos);
218  ++m_nextPos; //Consume the char;
219  return Token(LITERAL, value);
220  }
221  }
222 
223  //Ouch, went off the end -- report error
224  return Token(ERROR);
225 }
226 
227 Token Tokenizer::lexNumber()
228 {
229  int startPos = m_nextPos;
230  bool seenDot = false;
231 
232  //Go until end or a non-digits character
233  for (; m_nextPos < m_data.length(); ++m_nextPos) {
234  QChar aChar = m_data[m_nextPos];
235  if (aChar.row() != 0) {
236  break;
237  }
238 
239  if (aChar.cell() < '0' || aChar.cell() > '9') {
240  if (aChar.cell() == '.' && !seenDot) {
241  seenDot = true;
242  } else {
243  break;
244  }
245  }
246  }
247 
248  QString value = m_data.mid(startPos, m_nextPos - startPos);
249  return Token(NUMBER, value);
250 }
251 
252 Token Tokenizer::lexNCName()
253 {
254  int startPos = m_nextPos;
255  if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart) {
256  //Keep going until we get a character that's not good for names.
257  for (; m_nextPos < m_data.length(); ++m_nextPos) {
258  if (charCat(m_data[m_nextPos]) == NotPartOfName) {
259  break;
260  }
261  }
262 
263  QString value = m_data.mid(startPos, m_nextPos - startPos);
264  return Token(value);
265  } else {
266  return makeTokenAndAdvance(ERROR);
267  }
268 }
269 
270 Token Tokenizer::lexQName()
271 {
272  Token t1 = lexNCName();
273  if (t1.type == ERROR) {
274  return t1;
275  }
276  skipWS();
277  //If the next character is :, what we just got it the prefix, if not,
278  //it's the whole thing
279  if (peekAheadHelper() != ':') {
280  return t1;
281  }
282 
283  Token t2 = lexNCName();
284  if (t2.type == ERROR) {
285  return t2;
286  }
287 
288  return Token(t1.value + ":" + t2.value);
289 }
290 
291 Token Tokenizer::nextTokenInternal()
292 {
293  skipWS();
294 
295  if (m_nextPos >= m_data.length()) {
296  return Token(0);
297  }
298 
299  char code = peekCurHelper();
300  switch (code) {
301  case '(': case ')': case '[': case ']':
302  case '@': case ',': case '|':
303  return makeTokenAndAdvance(code);
304  case '\'':
305  case '\"':
306  return lexString();
307  case '0': case '1': case '2': case '3': case '4':
308  case '5': case '6': case '7': case '8': case '9':
309  return lexNumber();
310  case '.': {
311  char next = peekAheadHelper();
312  if (next == '.') {
313  return makeTokenAndAdvance(DOTDOT, 2);
314  } else if (next >= '0' && next <= '9') {
315  return lexNumber();
316  } else {
317  return makeTokenAndAdvance('.');
318  }
319  }
320  case '/':
321  if (peekAheadHelper() == '/') {
322  return makeTokenAndAdvance(SLASHSLASH, 2);
323  } else {
324  return makeTokenAndAdvance('/');
325  }
326  case '+':
327  return makeTokenAndAdvance(PLUS);
328  case '-':
329  return makeTokenAndAdvance(MINUS);
330  case '=':
331  return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);
332  case '!':
333  if (peekAheadHelper() == '=') {
334  return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);
335  } else {
336  return Token(ERROR);
337  }
338  case '<':
339  if (peekAheadHelper() == '=') {
340  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);
341  } else {
342  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
343  }
344  case '>':
345  if (peekAheadHelper() == '=') {
346  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);
347  } else {
348  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
349  }
350  case '*':
351  if (isOperatorContext()) {
352  return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
353  } else {
354  ++m_nextPos;
355  return Token(NAMETEST, "*");
356  }
357  case '$': {//$ QName
358  m_nextPos++;
359  Token par = lexQName();
360  if (par.type == ERROR) {
361  return par;
362  } else {
363  return Token(VARIABLEREFERENCE, par.value);
364  }
365  }
366  }
367 
368  Token t1 = lexNCName();
369  if (t1.type == ERROR) {
370  return t1;
371  }
372 
373  skipWS();
374 
375  //If we're in an operator context, check for any operator names
376  if (isOperatorContext()) {
377  if (t1.value == QLatin1String("and")) { //### hash?
378  return Token(AND);
379  }
380  if (t1.value == QLatin1String("or")) {
381  return Token(OR);
382  }
383  if (t1.value == QLatin1String("mod")) {
384  return Token(MULOP, NumericOp::OP_Mod);
385  }
386  if (t1.value == QLatin1String("div")) {
387  return Token(MULOP, NumericOp::OP_Div);
388  }
389  }
390 
391  //See whether we are at a :
392  if (peekCurHelper() == ':') {
393  m_nextPos++;
394  //Any chance it's an axis name?
395  if (peekCurHelper() == ':') {
396  m_nextPos++;
397 
398  //It might be an axis name.
399  Step::AxisType axisType;
400  if (isAxisName(t1.value, &axisType)) {
401  return Token(AXISNAME, axisType);
402  }
403  //Ugh, :: is only valid in axis names -> error
404  return Token(ERROR);
405  }
406 
407  //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
408  skipWS();
409  if (peekCurHelper() == '*') {
410  m_nextPos++;
411  return Token(NAMETEST, t1.value + ":*");
412  }
413 
414  //Make a full qname..
415  Token t2 = lexNCName();
416  if (t2.type == ERROR) {
417  return t2;
418  }
419 
420  t1.value = t1.value + ':' + t2.value;
421  }
422 
423  skipWS();
424  if (peekCurHelper() == '(') {
425  //note: we don't swallow the ( here!
426 
427  //either node type of function name
428  if (isNodeTypeName(t1.value)) {
429  if (t1.value == "processing-instruction") {
430  return Token(PI, t1.value);
431  } else {
432  return Token(NODETYPE, t1.value);
433  }
434  }
435  //must be a function name.
436  return Token(FUNCTIONNAME, t1.value);
437  }
438 
439  //At this point, it must be NAMETEST
440  return Token(NAMETEST, t1.value);
441 }
442 
443 Token Tokenizer::nextToken()
444 {
445  Token toRet = nextTokenInternal();
446  m_lastTokenType = toRet.type;
447  return toRet;
448 }
449 
450 Tokenizer::Tokenizer()
451 {
452  reset(QString());
453 }
454 
455 Tokenizer::~Tokenizer()
456 {
457  delete s_axisNamesDict;
458  delete s_nodeTypeNamesDict;
459 }
460 
461 void Tokenizer::reset(QString data)
462 {
463  m_nextPos = 0;
464  m_data = data;
465  m_lastTokenType = 0;
466 }
467 
468 int khtmlxpathyylex()
469 {
470  Token tok = Tokenizer::self().nextToken();
471  if (tok.hasString) {
472  khtmlxpathyylval.str = new DOMString(tok.value);
473  } else if (tok.intValue) {
474  khtmlxpathyylval.num = tok.intValue;
475  }
476  return tok.type;
477 }
478 
479 void initTokenizer(const DOM::DOMString &string)
480 {
481  Tokenizer::self().reset(string.string());
482 }
483 
484 } // namespace XPath
485 } // namespace khtml
486 
QHash::iterator insert(const Key &key, const T &value)
QString name(const QVariant &location)
Letter_Lowercase
This file is part of the HTML rendering engine for KDE.
MESSAGECORE_EXPORT KMime::Content * next(KMime::Content *node, bool allowChildren=true)
QHash::const_iterator constFind(const Key &key) const const
uchar cell() const const
QSet::iterator insert(const T &value)
QChar::Category category() const const
QHash::const_iterator constEnd() const const
Type type(const QSqlDatabase &db)
This class implements the basic string we use in the DOM.
Definition: dom_string.h:44
ushort unicode() const const
This library provides a full-featured HTML parser and widget.
uchar row() const const
QString mid(int position, int n) const const
KGuiItem reset()
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Tue Oct 26 2021 22:48:10 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.