• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

KHTML

  • sources
  • kde-4.14
  • kdelibs
  • khtml
  • xpath
tokenizer.cpp
Go to the documentation of this file.
1 /*
2  * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "tokenizer.h"
26 
27 #include "xml/dom_stringimpl.h"
28 #include "xml/dom3_xpathimpl.h"
29 #include "dom/dom3_xpath.h"
30 
31 #include <cstdio>
32 
33 using namespace std;
34 
35 using namespace DOM;
36 using namespace DOM::XPath;
37 using namespace khtml;
38 using namespace khtml::XPath;
39 
40 namespace khtml {
41 namespace XPath {
42 
43 struct AxisNameMapping
44 {
45  const char *name;
46  Step::AxisType type;
47 };
48 
49 static AxisNameMapping axisNames[] = {
50  { "ancestor", Step::AncestorAxis },
51  { "ancestor-or-self", Step::AncestorOrSelfAxis },
52  { "attribute", Step::AttributeAxis },
53  { "child", Step::ChildAxis },
54  { "descendant", Step::DescendantAxis },
55  { "descendant-or-self", Step::DescendantOrSelfAxis },
56  { "following", Step::FollowingAxis },
57  { "following-sibling", Step::FollowingSiblingAxis },
58  { "namespace", Step::NamespaceAxis },
59  { "parent", Step::ParentAxis },
60  { "preceding", Step::PrecedingAxis },
61  { "preceding-sibling", Step::PrecedingSiblingAxis },
62  { "self", Step::SelfAxis }
63 };
64 static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]);
65 
// Names that are node types rather than function names when they are
// followed by '('. The list ends with a null sentinel, which is what
// isNodeTypeName() uses to find the end of the array.
static const char* const nodeTypeNames[] = {
    "comment", "text", "processing-instruction", "node",
    0 // sentinel
};
73 
74 QHash<QString, Step::AxisType>* Tokenizer::s_axisNamesDict = 0;
75 QSet<QString>* Tokenizer::s_nodeTypeNamesDict = 0;
76 
77 Tokenizer &Tokenizer::self()
78 {
79  static Tokenizer instance;
80  return instance;
81 }
82 
83 Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
84 {
85  //### might need to add some special cases from the XML spec.
86 
87  if (aChar.unicode() == '_')
88  return NameStart;
89 
90  if (aChar.unicode() == '.' || aChar.unicode() == '-')
91  return NameCont;
92 
93  switch (aChar.category()) {
94  case QChar::Letter_Lowercase: //Ll
95  case QChar::Letter_Uppercase: //Lu
96  case QChar::Letter_Other: //Lo
97  case QChar::Letter_Titlecase: //Lt
98  case QChar::Number_Letter: //Nl
99  return NameStart;
100 
101  case QChar::Mark_SpacingCombining: //Mc
102  case QChar::Mark_Enclosing: //Me
103  case QChar::Mark_NonSpacing: //Mn
104  case QChar::Letter_Modifier: //Lm
105  case QChar::Number_DecimalDigit: //Nd
106  return NameCont;
107 
108  default:
109  return NotPartOfName;
110  }
111 }
112 
113 bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
114 {
115  if (!s_axisNamesDict) {
116  s_axisNamesDict = new QHash<QString, Step::AxisType>;
117  for (unsigned int p = 0; p < axisNamesCount; ++p)
118  s_axisNamesDict->insert(QLatin1String(axisNames[p].name),
119  axisNames[p].type);
120  }
121 
122  QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name);
123  if ( it != s_axisNamesDict->constEnd() ) {
124  *type = *it;
125  }
126  return it != s_axisNamesDict->constEnd();
127 }
128 
129 bool Tokenizer::isNodeTypeName(QString name)
130 {
131  if (!s_nodeTypeNamesDict) {
132  s_nodeTypeNamesDict = new QSet<QString>;
133  for (int p = 0; nodeTypeNames[p]; ++p)
134  s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p]));
135  }
136  return s_nodeTypeNamesDict->contains(name);
137 }
138 
139 /* Returns whether the last parsed token matches the [32] Operator rule
140  * (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
141  * the tokens.
142  */
143 bool Tokenizer::isOperatorContext()
144 {
145  if ( m_nextPos == 0 ) {
146  return false;
147  }
148 
149  switch ( m_lastTokenType ) {
150  case AND: case OR: case MULOP:
151  case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
152  case EQOP: case RELOP:
153  case '@': case AXISNAME: case '(': case '[':
154  return false;
155  default:
156  return true;
157  }
158 }
159 
160 void Tokenizer::skipWS()
161 {
162  while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace())
163  ++m_nextPos;
164 }
165 
166 Token Tokenizer::makeTokenAndAdvance(int code, int advance)
167 {
168  m_nextPos += advance;
169  return Token(code);
170 }
171 
172 Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
173 {
174  m_nextPos += advance;
175  return Token(code, val);
176 }
177 
178 //Returns next char if it's there and interesting, 0 otherwise
179 char Tokenizer::peekAheadHelper()
180 {
181  if (m_nextPos + 1 >= m_data.length())
182  return 0;
183  QChar next = m_data[m_nextPos + 1];
184  if (next.row() != 0)
185  return 0;
186  else
187  return next.cell();
188 }
189 
190 char Tokenizer::peekCurHelper()
191 {
192  if (m_nextPos >= m_data.length())
193  return 0;
194  QChar next = m_data[m_nextPos];
195  if (next.row() != 0)
196  return 0;
197  else
198  return next.cell();
199 }
200 
201 Token Tokenizer::lexString()
202 {
203  QChar delimiter = m_data[m_nextPos];
204  int startPos = m_nextPos + 1;
205 
206  for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
207  if (m_data[m_nextPos] == delimiter) {
208  QString value = m_data.mid(startPos, m_nextPos - startPos);
209  ++m_nextPos; //Consume the char;
210  return Token(LITERAL, value);
211  }
212  }
213 
214  //Ouch, went off the end -- report error
215  return Token(ERROR);
216 }
217 
218 Token Tokenizer::lexNumber()
219 {
220  int startPos = m_nextPos;
221  bool seenDot = false;
222 
223  //Go until end or a non-digits character
224  for (; m_nextPos < m_data.length(); ++m_nextPos) {
225  QChar aChar = m_data[m_nextPos];
226  if (aChar.row() != 0) break;
227 
228  if (aChar.cell() < '0' || aChar.cell() > '9') {
229  if (aChar.cell() == '.' && !seenDot)
230  seenDot = true;
231  else
232  break;
233  }
234  }
235 
236  QString value = m_data.mid(startPos, m_nextPos - startPos);
237  return Token(NUMBER, value);
238 }
239 
240 Token Tokenizer::lexNCName()
241 {
242  int startPos = m_nextPos;
243  if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart)
244  {
245  //Keep going until we get a character that's not good for names.
246  for (; m_nextPos < m_data.length(); ++m_nextPos) {
247  if (charCat(m_data[m_nextPos]) == NotPartOfName)
248  break;
249  }
250 
251  QString value = m_data.mid(startPos, m_nextPos - startPos);
252  return Token(value);
253  }
254  else
255  return makeTokenAndAdvance(ERROR);
256 }
257 
258 Token Tokenizer::lexQName()
259 {
260  Token t1 = lexNCName();
261  if (t1.type == ERROR) return t1;
262  skipWS();
263  //If the next character is :, what we just got it the prefix, if not,
264  //it's the whole thing
265  if (peekAheadHelper() != ':')
266  return t1;
267 
268  Token t2 = lexNCName();
269  if (t2.type == ERROR) return t2;
270 
271  return Token(t1.value + ":" + t2.value);
272 }
273 
274 Token Tokenizer::nextTokenInternal()
275 {
276  skipWS();
277 
278  if (m_nextPos >= m_data.length()) {
279  return Token(0);
280  }
281 
282  char code = peekCurHelper();
283  switch (code) {
284  case '(': case ')': case '[': case ']':
285  case '@': case ',': case '|':
286  return makeTokenAndAdvance(code);
287  case '\'':
288  case '\"':
289  return lexString();
290  case '0': case '1': case '2': case '3': case '4':
291  case '5': case '6': case '7': case '8': case '9':
292  return lexNumber();
293  case '.': {
294  char next = peekAheadHelper();
295  if (next == '.')
296  return makeTokenAndAdvance(DOTDOT, 2);
297  else if (next >= '0' && next <= '9')
298  return lexNumber();
299  else
300  return makeTokenAndAdvance('.');
301  }
302  case '/':
303  if (peekAheadHelper() == '/')
304  return makeTokenAndAdvance(SLASHSLASH, 2);
305  else
306  return makeTokenAndAdvance('/');
307  case '+':
308  return makeTokenAndAdvance(PLUS);
309  case '-':
310  return makeTokenAndAdvance(MINUS);
311  case '=':
312  return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);
313  case '!':
314  if (peekAheadHelper() == '=')
315  return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);
316  else {
317  return Token(ERROR);
318  }
319  case '<':
320  if (peekAheadHelper() == '=')
321  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);
322  else
323  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
324  case '>':
325  if (peekAheadHelper() == '=')
326  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);
327  else
328  return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
329  case '*':
330  if (isOperatorContext())
331  return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
332  else {
333  ++m_nextPos;
334  return Token(NAMETEST, "*");
335  }
336  case '$': {//$ QName
337  m_nextPos++;
338  Token par = lexQName();
339  if (par.type == ERROR)
340  return par;
341  else
342  return Token(VARIABLEREFERENCE, par.value);
343  }
344  }
345 
346  Token t1 = lexNCName();
347  if (t1.type == ERROR) return t1;
348 
349  skipWS();
350 
351  //If we're in an operator context, check for any operator names
352  if (isOperatorContext()) {
353  if (t1.value == QLatin1String("and")) //### hash?
354  return Token(AND);
355  if (t1.value == QLatin1String("or"))
356  return Token(OR);
357  if (t1.value == QLatin1String("mod"))
358  return Token(MULOP, NumericOp::OP_Mod);
359  if (t1.value == QLatin1String("div"))
360  return Token(MULOP, NumericOp::OP_Div);
361  }
362 
363  //See whether we are at a :
364  if (peekCurHelper() == ':') {
365  m_nextPos++;
366  //Any chance it's an axis name?
367  if (peekCurHelper() == ':') {
368  m_nextPos++;
369 
370  //It might be an axis name.
371  Step::AxisType axisType;
372  if (isAxisName(t1.value, &axisType))
373  return Token(AXISNAME, axisType);
374  //Ugh, :: is only valid in axis names -> error
375  return Token(ERROR);
376  }
377 
378  //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
379  skipWS();
380  if (peekCurHelper() == '*') {
381  m_nextPos++;
382  return Token(NAMETEST, t1.value + ":*");
383  }
384 
385  //Make a full qname..
386  Token t2 = lexNCName();
387  if (t2.type == ERROR) return t2;
388 
389  t1.value = t1.value + ':' + t2.value;
390  }
391 
392  skipWS();
393  if (peekCurHelper() == '(') {
394  //note: we don't swallow the ( here!
395 
396  //either node type of function name
397  if (isNodeTypeName(t1.value)) {
398  if (t1.value == "processing-instruction")
399  return Token(PI, t1.value);
400  else
401  return Token(NODETYPE, t1.value);
402  }
403  //must be a function name.
404  return Token(FUNCTIONNAME, t1.value);
405  }
406 
407  //At this point, it must be NAMETEST
408  return Token(NAMETEST, t1.value);
409 }
410 
411 Token Tokenizer::nextToken()
412 {
413  Token toRet = nextTokenInternal();
414  m_lastTokenType = toRet.type;
415  return toRet;
416 }
417 
418 Tokenizer::Tokenizer()
419 {
420  reset(QString());
421 }
422 
423 Tokenizer::~Tokenizer()
424 {
425  delete s_axisNamesDict;
426  delete s_nodeTypeNamesDict;
427 }
428 
429 void Tokenizer::reset(QString data)
430 {
431  m_nextPos = 0;
432  m_data = data;
433  m_lastTokenType = 0;
434 }
435 
436 int khtmlxpathyylex()
437 {
438  Token tok = Tokenizer::self().nextToken();
439  if (tok.hasString) {
440  khtmlxpathyylval.str = new DOMString(tok.value);
441  } else if (tok.intValue) {
442  khtmlxpathyylval.num = tok.intValue;
443  }
444  return tok.type;
445 }
446 
447 void initTokenizer(const DOM::DOMString& string)
448 {
449  Tokenizer::self().reset(string.string());
450 }
451 
452 } // namespace XPath
453 } // namespace khtml
454 
455 // kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;
EQOP
Definition: parser.cpp:144
NAMETEST
Definition: parser.cpp:160
AND
Definition: parser.cpp:149
QHash::insert
iterator insert(const Key &key, const T &value)
ERROR
Definition: parser.cpp:161
QChar
MULOP
Definition: parser.cpp:146
OR
Definition: parser.cpp:150
khtml::XPath::Token::value
QString value
Definition: tokenizer.h:46
name
const char * name(StandardAction id)
khtml::XPath::Token
Definition: tokenizer.h:43
DOTDOT
Definition: parser.cpp:158
FUNCTIONNAME
Definition: parser.cpp:154
QHash::constFind
const_iterator constFind(const Key &key) const
initTokenizer
void initTokenizer(QString s)
QChar::cell
uchar cell() const
QSet::insert
const_iterator insert(const T &value)
khtml::XPath::axisNames
static AxisNameMapping axisNames[]
Definition: tokenizer.cpp:49
QChar::category
Category category() const
khtml::XPath::Token::intValue
int intValue
Definition: tokenizer.h:47
VARIABLEREFERENCE
Definition: parser.cpp:156
QHash::constEnd
const_iterator constEnd() const
QHash
khtml::XPath::Token::hasString
bool hasString
Definition: tokenizer.h:48
NUMBER
Definition: parser.cpp:157
AXISNAME
Definition: parser.cpp:151
DOM::DOMString
This class implements the basic string we use in the DOM.
Definition: dom_string.h:43
QSet< QString >
SLASHSLASH
Definition: parser.cpp:159
QString
QChar::unicode
ushort unicode() const
khtmlxpathyylval
YYSTYPE khtmlxpathyylval
next
KAction * next(const QObject *recvr, const char *slot, QObject *parent)
dom3_xpath.h
khtml::XPath::khtmlxpathyylex
int khtmlxpathyylex()
Definition: tokenizer.cpp:436
PI
Definition: parser.cpp:153
RELOP
Definition: parser.cpp:145
QChar::row
uchar row() const
QString::mid
QString mid(int position, int n) const
self
static KJavaAppletServer * self
Definition: kjavaappletserver.cpp:133
QLatin1String
khtml::XPath::Tokenizer
Definition: tokenizer.h:56
PLUS
Definition: parser.cpp:148
khtml::XPath::nodeTypeNames
static const char *const nodeTypeNames[]
Definition: tokenizer.cpp:66
tokenizer.h
reset
KGuiItem reset()
LITERAL
Definition: parser.cpp:155
khtml::XPath::Step::AxisType
AxisType
Definition: step.h:45
NODETYPE
Definition: parser.cpp:152
khtml::XPath::Token::type
int type
Definition: tokenizer.h:45
MINUS
Definition: parser.cpp:147
khtml::XPath::axisNamesCount
static unsigned int axisNamesCount
Definition: tokenizer.cpp:64
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Mon Jun 22 2020 13:26:19 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KHTML

Skip menu "KHTML"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal