KHtml

htmltokenizer.cpp
1 /*
2  This file is part of the KDE libraries
3 
4  Copyright (C) 1997 Martin Jones ([email protected])
5  (C) 1997 Torben Weis ([email protected])
6  (C) 1998 Waldo Bastian ([email protected])
7  (C) 1999 Lars Knoll ([email protected])
8  (C) 1999 Antti Koivisto ([email protected])
9  (C) 2001-2003 Dirk Mueller ([email protected])
10  (C) 2004-2008 Apple Computer, Inc.
11  (C) 2006-2008 Germain Garand ([email protected])
12 
13  This library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU Library General Public
15  License as published by the Free Software Foundation; either
16  version 2 of the License, or (at your option) any later version.
17 
18  This library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  Library General Public License for more details.
22 
23  You should have received a copy of the GNU Library General Public License
24  along with this library; see the file COPYING.LIB. If not, write to
25  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26  Boston, MA 02110-1301, USA.
27 */
28 //----------------------------------------------------------------------------
29 //
30 // KDE HTML Widget - Tokenizers
31 
32 // #define TOKEN_DEBUG 1
33 // #define TOKEN_DEBUG 2
34 
35 #include "htmltokenizer.h"
36 #include "html_documentimpl.h"
37 #include "htmlparser.h"
38 #include "dtd.h"
39 
40 #include <misc/loader.h>
41 
42 #include <khtmlview.h>
43 #include <khtml_part.h>
44 #include <xml/dom_docimpl.h>
45 #include <ecma/kjs_proxy.h>
46 #include <kcharsets.h>
47 #include <ctype.h>
48 #include <assert.h>
49 #include <QVariant>
50 #include "khtml_debug.h"
51 #include <stdlib.h>
52 
53 #include "kentities_p.h"
54 #include "htmlprospectivetokenizer.h"
55 
56 #define PROSPECTIVE_TOKENIZER_ENABLED 1
57 
58 using namespace khtml;
59 
60 static const QChar commentStart [] = { '<', '!', '-', '-', QChar::Null };
61 static const char doctypeStart [] = "<!doctype";
62 static const char publicStart [] = "public";
63 static const char systemStart [] = "system";
64 
// Closing-tag prefixes for the elements whose content is tokenized in
// "raw content" mode (see HTMLTokenizer::parseRawContent()).
static constexpr char scriptEnd[] = "</script";
static constexpr char xmpEnd[] = "</xmp";
static constexpr char styleEnd[] = "</style";
static constexpr char textareaEnd[] = "</textarea";
static constexpr char titleEnd[] = "</title";
70 
// Tokenizer pacing constants. Release builds (NDEBUG) use a larger chunk
// size and shorter yield delays than debug builds.
// NOTE(review): the delays are presumably milliseconds -- confirm against
// the timer code that consumes m_tokenizerYieldDelay.
#ifdef NDEBUG
static const int sTokenizerChunkSize = 4096;
static const int sTokenizerFastYieldDelay = 180;
static const int sTokenizerYieldDelay = 450;
#else
static const int sTokenizerChunkSize = 2048;
static const int sTokenizerFastYieldDelay = 220;
static const int sTokenizerYieldDelay = 650;
#endif
80 
// malloc()-family helpers for raw QChar arrays. Memory obtained through the
// ALLOC/REALLOC macros must be released with KHTML_DELETE_QCHAR_VEC, never
// with delete[].
#define KHTML_ALLOC_QCHAR_VEC( N ) static_cast<QChar *>(malloc(sizeof(QChar) * (N)))
#define KHTML_REALLOC_QCHAR_VEC(P, N ) static_cast<QChar *>(realloc((P), sizeof(QChar) * (N)))
#define KHTML_DELETE_QCHAR_VEC( P ) free(static_cast<void *>(P))
84 
// fixUpChar(x): remap the MS Windows extensions to Latin-1.
//
// Code points 0x80-0x9F are C1 controls in Latin-1, but Windows code page
// 1252 places printable characters there. Strictly speaking the remapping
// should only apply to pages declared as "windows-1252" or "cp1252";
// in practice a huge number of pages rely on it without saying so, so it is
// applied unconditionally. Pages using a non-Latin-1 Windows code page are
// not helped by this table.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
// http://www.bbsinc.com/iso8859.html
// http://www.obviously.com/
//
// There may be better equivalents for some of the entries.
//
// The argument is evaluated more than once and must offer .unicode() and
// assignment from an integer code point. Code points without a mapping
// (including 0x81, 0x8D, 0x8F, 0x90 and 0x9D) are left untouched.
#if 0
#define fixUpChar(x)
#else
#define fixUpChar(x) \
    switch ((x).unicode()) { \
    case 0x80: (x) = 0x20AC; break; /* euro sign */ \
    case 0x82: (x) = 0x201A; break; /* single low-9 quotation mark */ \
    case 0x83: (x) = 0x0192; break; /* latin small f with hook */ \
    case 0x84: (x) = 0x201E; break; /* double low-9 quotation mark */ \
    case 0x85: (x) = 0x2026; break; /* horizontal ellipsis */ \
    case 0x86: (x) = 0x2020; break; /* dagger */ \
    case 0x87: (x) = 0x2021; break; /* double dagger */ \
    case 0x88: (x) = 0x02C6; break; /* modifier letter circumflex */ \
    case 0x89: (x) = 0x2030; break; /* per mille sign */ \
    case 0x8A: (x) = 0x0160; break; /* S with caron */ \
    case 0x8B: (x) = 0x2039; break; /* single left angle quotation mark */ \
    case 0x8C: (x) = 0x0152; break; /* ligature OE */ \
    case 0x8E: (x) = 0x017D; break; /* Z with caron */ \
    case 0x91: (x) = 0x2018; break; /* left single quotation mark */ \
    case 0x92: (x) = 0x2019; break; /* right single quotation mark */ \
    case 0x93: (x) = 0x201C; break; /* left double quotation mark */ \
    case 0x94: (x) = 0x201D; break; /* right double quotation mark */ \
    case 0x95: (x) = 0x2022; break; /* bullet */ \
    case 0x96: (x) = 0x2013; break; /* en dash */ \
    case 0x97: (x) = 0x2014; break; /* em dash */ \
    case 0x98: (x) = 0x02DC; break; /* small tilde */ \
    case 0x99: (x) = 0x2122; break; /* trade mark sign */ \
    case 0x9A: (x) = 0x0161; break; /* s with caron */ \
    case 0x9B: (x) = 0x203A; break; /* single right angle quotation mark */ \
    case 0x9C: (x) = 0x0153; break; /* ligature oe */ \
    case 0x9E: (x) = 0x017E; break; /* z with caron */ \
    case 0x9F: (x) = 0x0178; break; /* Y with diaeresis */ \
    default: break; \
    }
#endif
133 // ----------------------------------------------------------------------------
134 
135 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view)
136 {
137  view = _view;
138  buffer = nullptr;
139  rawContent = nullptr;
140  rawContentSize = rawContentMaxSize = rawContentResync = rawContentSinceLastEntity = 0;
142  parser = new KHTMLParser(_view, _doc);
143  m_executingScript = 0;
144  m_externalScriptsTimerId = 0;
145  m_tokenizerYieldDelay = sTokenizerFastYieldDelay;
146  m_yieldTimer = 0;
147  m_prospectiveTokenizer = nullptr;
148  onHold = false;
149  m_documentTokenizer = true;
150  m_hasScriptsWaitingForStylesheets = false;
151 
152  reset();
153 }
154 
155 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i)
156 {
157  view = nullptr;
158  buffer = nullptr;
159  rawContent = nullptr;
160  rawContentSize = rawContentMaxSize = rawContentResync = rawContentSinceLastEntity = 0;
162  parser = new KHTMLParser(i, _doc);
163  m_executingScript = 0;
164  m_externalScriptsTimerId = 0;
165  m_tokenizerYieldDelay = sTokenizerFastYieldDelay;
166  m_yieldTimer = 0;
167  m_prospectiveTokenizer = nullptr;
168  onHold = false;
169  m_documentTokenizer = false;
170  m_hasScriptsWaitingForStylesheets = false;
171 
172  reset();
173 }
174 
175 void HTMLTokenizer::setNormalYieldDelay()
176 {
177  m_tokenizerYieldDelay = sTokenizerYieldDelay;
178 }
179 
180 void HTMLTokenizer::reset()
181 {
182  assert(m_executingScript == 0);
183  Q_ASSERT(onHold == false);
184  m_abort = false;
185 
186  while (!cachedScript.isEmpty()) {
187  cachedScript.dequeue()->deref(this);
188  }
189 
190  if (buffer) {
191  KHTML_DELETE_QCHAR_VEC(buffer);
192  }
193  buffer = dest = nullptr;
194  size = 0;
195 
196  if (rawContent) {
197  KHTML_DELETE_QCHAR_VEC(rawContent);
198  }
199  rawContent = nullptr;
200  rawContentSize = rawContentMaxSize = rawContentResync = 0;
201 
202  if (m_yieldTimer > 0) {
203  killTimer(m_yieldTimer);
204  m_yieldTimer = 0;
205  }
206 
207  if (m_externalScriptsTimerId > 0) {
208  killTimer(m_externalScriptsTimerId);
209  m_externalScriptsTimerId = 0;
210  }
211  currToken.reset();
212  doctypeToken.reset();
213  javascript = false;
214 }
215 
216 void HTMLTokenizer::begin()
217 {
218  m_executingScript = 0;
219  onHold = false;
220  reset();
221  size = 254;
222  buffer = KHTML_ALLOC_QCHAR_VEC(255);
223  dest = buffer;
224  tag = NoTag;
225  pending = NonePending;
226  discard = NoneDiscard;
227  pre = false;
228  prePos = 0;
229  plaintext = false;
230  xmp = false;
231  processingInstruction = false;
232  script = false;
233  escaped = false;
234  style = false;
235  skipLF = false;
236  select = false;
237  comment = false;
238  doctype = false;
239  doctypeComment = NoDoctypeComment;
240  doctypeAllowComment = false;
241  server = false;
242  textarea = false;
243  title = false;
244  startTag = false;
245  tquote = NoQuote;
246  searchCount = 0;
247  doctypeSearchCount = 0;
248  doctypeSecondarySearchCount = 0;
249  Entity = NoEntity;
250  noMoreData = false;
251  brokenComments = false;
252  brokenServer = false;
253  lineno = 0;
254  scriptStartLineno = 0;
255  tagStartLineno = 0;
256 }
257 
258 void HTMLTokenizer::processListing(TokenizerString list)
259 {
260  bool old_pre = pre;
261 
262  // This function adds the listing 'list' as
263  // preformatted text-tokens to the token-collection
264  // thereby converting TABs.
265  if (!style) {
266  pre = true;
267  }
268  prePos = 0;
269 
270  while (!list.isEmpty()) {
271  checkBuffer(3 * TAB_SIZE);
272 
273  if (skipLF && (list->unicode() != '\n')) {
274  skipLF = false;
275  }
276 
277  if (skipLF) {
278  skipLF = false;
279  ++list;
280  } else if ((list->unicode() == '\n') || (list->unicode() == '\r')) {
281  if (discard == LFDiscard) {
282  // Ignore this LF
283  discard = NoneDiscard; // We have discarded 1 LF
284  } else {
285  // Process this LF
286  if (pending) {
287  addPending();
288  }
289 
290  // we used to do it not at all and we want to have
291  // it fixed for textarea. So here we are
292  if (textarea) {
293  prePos++;
294  *dest++ = *list;
295  } else {
296  pending = LFPending;
297  }
298  }
299  /* Check for MS-DOS CRLF sequence */
300  if (list->unicode() == '\r') {
301  skipLF = true;
302  }
303  ++list;
304  } else if ((list->unicode() == ' ') || (list->unicode() == '\t')) {
305  if (pending) {
306  addPending();
307  }
308  if (*list == ' ') {
309  pending = SpacePending;
310  } else {
311  pending = TabPending;
312  }
313 
314  ++list;
315  } else {
316  discard = NoneDiscard;
317  if (pending) {
318  addPending();
319  }
320 
321  prePos++;
322  *dest++ = *list;
323  ++list;
324  }
325 
326  }
327 
328  if ((pending == SpacePending) || (pending == TabPending)) {
329  addPending();
330  } else {
331  pending = NonePending;
332  }
333 
334  prePos = 0;
335  pre = old_pre;
336 }
337 
338 void HTMLTokenizer::parseRawContent(TokenizerString &src)
339 {
340  // The 'raw content' mode is a very lax tokenizing mode
341  // that will absorb anything but the exact closing tag
342  // that made us enter this mode, *except* if it inside a comment.
343  //
344  // Any other tag or comment will be passed verbatim to the parser as part
345  // of the content. It is used for script, style, and a few others.
346  //
347  assert(textarea || title || !Entity);
348  assert(!tag);
349  assert(xmp + textarea + title + style + script == 1);
350  if (script) {
351  scriptStartLineno = lineno + src.lineCount();
352  }
353 
354  if (comment) {
355  parseComment(src);
356  }
357 
358  while (!src.isEmpty()) {
359  checkRawContentBuffer();
360  unsigned char ch = src->toLatin1();
361  if (!rawContentResync && !brokenComments && !xmp && ch == '-' &&
362  rawContentSize >= 3 && ((!textarea && !title) || rawContentSinceLastEntity >= 3) && !src.escaped() &&
363  QString::fromRawData(rawContent + rawContentSize - 3, 3) == "<!-") {
364  comment = true;
365  rawContent[ rawContentSize++ ] = ch;
366  ++src;
367  parseComment(src);
368  continue;
369  }
370  if (rawContentResync && !tquote && (ch == '>')) {
371  ++src;
372  rawContentSize = rawContentResync - 1;
373  rawContentResync = 0;
374  rawContent[ rawContentSize ] = rawContent[ rawContentSize + 1 ] = 0;
375  if (script) {
376  scriptHandler();
377  } else {
378  processListing(TokenizerString(rawContent, rawContentSize));
379  processToken();
380  if (style) {
381  currToken.tid = ID_STYLE + ID_CLOSE_TAG;
382  } else if (textarea) {
383  currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG;
384  } else if (title) {
385  currToken.tid = ID_TITLE + ID_CLOSE_TAG;
386  } else if (xmp) {
387  currToken.tid = ID_XMP + ID_CLOSE_TAG;
388  }
389  processToken();
390  script = style = textarea = title = xmp = false;
391  tquote = NoQuote;
392  rawContentSize = rawContentResync = 0;
393  }
394  return;
395  }
396  // possible end of tagname, lets check.
397  if (!rawContentResync && !escaped && !src.escaped() && (ch == '>' || ch == '/' || ch <= ' ') && ch &&
398  rawContentSize >= searchStopperLen && ((!textarea && !title) || rawContentSinceLastEntity >= searchStopperLen) &&
399  QString::compare(QString::fromRawData(rawContent + rawContentSize - searchStopperLen, searchStopperLen),
400  QLatin1String(searchStopper), Qt::CaseInsensitive) == 0) {
401 
402  // the purpose of rawContentResync is to look for an end tag that could possibly be of the form:
403  // </endtag junk="more junk>\"><>" >
404  // IOW, once the '</endtag' sequence has been found, the rest of the tag must still be validated,
405  // so this micro-tokenizer switches to rawContentResync state until '>' is finally found.
406  rawContentResync = rawContentSize - searchStopperLen + 1;
407  tquote = NoQuote;
408  continue;
409  }
410  if (rawContentResync && !escaped) {
411  if (ch == '\"') {
412  tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
413  } else if (ch == '\'') {
414  tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
415  } else if (tquote != NoQuote && (ch == '\r' || ch == '\n')) {
416  tquote = NoQuote;
417  }
418  }
419  escaped = (!escaped && ch == '\\');
420  if (!rawContentResync && (textarea || title) && !src.escaped() && ch == '&') {
421  QChar *rawContentDest = rawContent + rawContentSize;
422  ++src;
423  parseEntity(src, rawContentDest, true);
424  rawContentSize = rawContentDest - rawContent;
425  } else {
426  rawContent[ rawContentSize++ ] = *src;
427  ++src;
428  ++rawContentSinceLastEntity;
429  }
430  }
431 }
432 
433 void HTMLTokenizer::scriptHandler()
434 {
435  QString currentScriptSrc = scriptSrc;
436  scriptSrc.clear();
437 
438  processListing(TokenizerString(rawContent, rawContentSize));
439  QString exScript(buffer, dest - buffer);
440 
441  processToken();
442  currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
443  processToken();
444 
445  // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
446  bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
447  bool effectiveScript = !parser->skipMode() && !followingFrameset;
448  bool deferredScript = false;
449 
450  if (effectiveScript) {
451  CachedScript *cs = nullptr;
452 
453  // forget what we just got, load from src url instead
454  if (!currentScriptSrc.isEmpty() && javascript) {
455  const QString completeScriptUrl = parser->doc()->completeURL(currentScriptSrc);
456  cs = parser->doc()->docLoader()->requestScript(completeScriptUrl, scriptSrcCharset);
457  }
458 
459  if (cs) {
460  cachedScript.enqueue(cs);
461  pendingQueue.push(src);
462  int scriptCount = cachedScript.count();
463  setSrc(TokenizerString());
464  rawContentSize = rawContentResync = 0;
465  cs->ref(this);
466  if (cachedScript.count() == scriptCount) {
467  deferredScript = true;
468  }
469  } else if (currentScriptSrc.isNull()/*no src attribute*/ && view && javascript) {
470  pendingQueue.push(src);
471  setSrc(TokenizerString());
472  rawContentSize = rawContentResync = 0;
473  scriptExecution(exScript, QString(), tagStartLineno /*scriptStartLineno*/);
474  } else {
475  // script was filtered or disallowed
476  effectiveScript = false;
477  }
478  }
479 
480  script = false;
481  rawContentSize = rawContentResync = 0;
482 
483  if (!effectiveScript) {
484  return;
485  }
486 
487  if (!m_executingScript && cachedScript.isEmpty()) {
488  src.append(pendingQueue.pop());
489  } else if (cachedScript.isEmpty()) {
490  write(pendingQueue.pop(), false);
491  } else if (!deferredScript && pendingQueue.count() > 1) {
492  TokenizerString t = pendingQueue.pop();
493  pendingQueue.top().prepend(t);
494  }
495 #if PROSPECTIVE_TOKENIZER_ENABLED
496  if (!cachedScript.isEmpty() && !m_executingScript) {
497  if (!m_prospectiveTokenizer) {
498  m_prospectiveTokenizer = new ProspectiveTokenizer(parser->docPtr());
499  }
500  if (!m_prospectiveTokenizer->inProgress() && !pendingQueue.isEmpty()) {
501  m_prospectiveTokenizer->begin();
502  m_prospectiveTokenizer->write(pendingQueue.top());
503  }
504  }
505 #endif
506 
507 }
508 
509 void HTMLTokenizer::scriptExecution(const QString &str, const QString &scriptURL,
510  int baseLine)
511 {
512  bool oldscript = script;
513  m_executingScript++;
514  script = false;
515  QString url;
516  if (scriptURL.isNull() && view) {
517  url = static_cast<DocumentImpl *>(view->part()->document().handle())->URL().url();
518  } else {
519  url = scriptURL;
520  }
521 
522  if (view) {
523  view->part()->executeScript(url, baseLine, Node(), str);
524  }
525  m_executingScript--;
526  script = oldscript;
527 }
528 
529 void HTMLTokenizer::parseComment(TokenizerString &src)
530 {
531  checkRawContentBuffer(src.length());
532  while (src.length()) {
533  rawContent[ rawContentSize++ ] = *src;
534 
535 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
536  qDebug("comment is now: *%s*", src.toString().left(16).toLatin1().constData());
537 #endif
538 
539  if (src->unicode() == '>') {
540  bool handleBrokenComments = brokenComments && !(script || style);
541  bool scriptEnd = false;
542  if (rawContentSize > 2 && rawContent[rawContentSize - 3] == '-' &&
543  rawContent[rawContentSize - 2] == '-') {
544  scriptEnd = true;
545  }
546 
547  if (handleBrokenComments || scriptEnd) {
548  ++src;
549  if (!(title || script || xmp || textarea || style)) {
550  checkRawContentBuffer();
551  rawContent[ rawContentSize ] = 0;
552  rawContent[ rawContentSize + 1 ] = 0;
553  currToken.tid = ID_COMMENT;
554  int size = scriptEnd ? rawContentSize - 3 : rawContentSize - 1;
555  processListing(TokenizerString(rawContent, size));
556  processToken();
557  currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
558  processToken();
559  rawContentSize = 0;
560  }
561  comment = false;
562  return; // Finished parsing comment
563  }
564  }
565  ++src;
566  }
567 }
568 
569 void HTMLTokenizer::parseDoctypeComment(TokenizerString &src)
570 {
571  while (!src.isEmpty()) {
572  QChar c = *src;
573  switch (doctypeComment) {
574  case DoctypeCommentHalfBegin: {
575  if (c != '-') {
576  // Ooops, it's not comment
577  doctypeComment = DoctypeCommentBogus;
578  return;
579  } else {
580  // Doctype comment begins
581  doctypeComment = DoctypeComment;
582  ++src;
583  }
584  break;
585  }
586  case DoctypeComment: {
587  if (c == '-') {
588  // Perhaps this is end of comment
589  doctypeComment = DoctypeCommentHalfEnd;
590  ++src;
591  } else {
592  // Keep scanning for '--'
593  ++src;
594  }
595  break;
596  }
597  case DoctypeCommentHalfEnd: {
598  if (c == '-') {
599  // Doctype comment ends
600  doctypeComment = DoctypeCommentEnd;
601  return;
602  } else {
603  // It's not '--'
604  ++src;
605  doctypeComment = DoctypeComment;
606  }
607  break;
608  }
609  default: {
610  assert(!"Undefined doctype comment state");
611  break;
612  }
613  }
614  }
615 }
616 
617 void HTMLTokenizer::parseDoctype(TokenizerString &src)
618 {
619  while (!src.isEmpty() && doctype) {
620  QChar c;
621  bool isWhitespace = false;
622  int dontAdvance = 0;
623  if (doctypeComment == DoctypeCommentEnd) {
624  doctypeComment = NoDoctypeComment;
625  isWhitespace = true;
626  } else if (doctypeComment == DoctypeCommentBogus) {
627  doctypeComment = NoDoctypeComment;
628  c = '-';
629  dontAdvance++;
630  } else {
631  c = *src;
632  if (doctypeAllowComment) {
633  if (!doctypeComment && c == '-') {
634  doctypeComment = DoctypeCommentHalfBegin;
635  ++src;
636  }
637  if (doctypeComment) {
638  parseDoctypeComment(src);
639  continue;
640  }
641  isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
642  }
643  }
644 
645  switch (doctypeToken.state) {
646  case DoctypeBegin: {
647  doctypeToken.state = DoctypeBeforeName;
648  if (isWhitespace) {
649  // nothing
650  }
651  break;
652  }
653  case DoctypeBeforeName: {
654  if (c == '>') {
655  // Malformed. Just exit.
656  doctype = false;
657  } else if (isWhitespace) {
658  // nothing
659  } else {
660  dontAdvance++;
661  doctypeToken.state = DoctypeName;
662  }
663  break;
664  }
665  case DoctypeName: {
666  if (c == '>') {
667  // Valid doctype. Emit it.
668  doctype = false;
669  processDoctypeToken();
670  } else if (isWhitespace) {
671  doctypeSearchCount = 0; // Used now to scan for PUBLIC
672  doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
673  doctypeToken.state = DoctypeAfterName;
674  } else {
675  doctypeToken.name.append(c);
676  }
677  break;
678  }
679  case DoctypeAfterName: {
680  if (c == '>') {
681  // Valid doctype. Emit it.
682  doctype = false;
683  processDoctypeToken();
684  } else if (c == '[') {
685  if (doctypeSearchCount > 0 || doctypeSecondarySearchCount > 0) { // is there any public/system indicator before?
686  doctypeSearchCount = doctypeSecondarySearchCount = 0;
687  doctypeToken.state = DoctypeBogus;
688  }
689  // Found internal subset
690  doctypeToken.state = DoctypeInternalSubset;
691  doctypeAllowComment = false;
692  } else if (!isWhitespace) {
693  if (c.toLower() == publicStart[doctypeSearchCount]) {
694  doctypeSearchCount++;
695  if (doctypeSearchCount == 6)
696  // Found 'PUBLIC' sequence
697  {
698  doctypeToken.state = DoctypeBeforePublicID;
699  }
700  } else if (doctypeSearchCount > 0) {
701  doctypeSearchCount = 0;
702  doctypeToken.state = DoctypeBogus;
703  } else if (c.toLower() == systemStart[doctypeSecondarySearchCount]) {
704  doctypeSecondarySearchCount++;
705  if (doctypeSecondarySearchCount == 6)
706  // Found 'SYSTEM' sequence
707  {
708  doctypeToken.state = DoctypeBeforeSystemID;
709  }
710  } else {
711  doctypeSecondarySearchCount = 0;
712  doctypeToken.state = DoctypeBogus;
713  }
714  } else {
715  // Whitespace keeps us in the after name state
716  }
717  break;
718  }
719  case DoctypeBeforePublicID: {
720  if (c == '\"' || c == '\'') {
721  tquote = c == '\"' ? DoubleQuote : SingleQuote;
722  doctypeToken.state = DoctypePublicID;
723  doctypeAllowComment = false;
724  } else if (c == '>') {
725  // Considered bogus. Don't process the doctype.
726  doctype = false;
727  } else if (isWhitespace) {
728  // nothing
729  } else {
730  doctypeToken.state = DoctypeBogus;
731  }
732  break;
733  }
734  case DoctypePublicID: {
735  if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
736  doctypeToken.state = DoctypeAfterPublicID;
737  doctypeAllowComment = true;
738  } else if (c == '>') {
739  // Considered bogus. Don't process the doctype.
740  doctype = false;
741  } else {
742  doctypeToken.publicID.append(c);
743  }
744  break;
745  }
746  case DoctypeAfterPublicID: {
747  if (c == '\"' || c == '\'') {
748  tquote = c == '\"' ? DoubleQuote : SingleQuote;
749  doctypeToken.state = DoctypeSystemID;
750  } else if (c == '>') {
751  // Valid doctype. Emit it now.
752  doctype = false;
753  processDoctypeToken();
754  } else if (isWhitespace) {
755  // nothing
756  } else if (c == '[') {
757  // Found internal subset
758  doctypeToken.state = DoctypeInternalSubset;
759  doctypeAllowComment = false;
760  } else {
761  doctypeToken.state = DoctypeBogus;
762  }
763  break;
764  }
765  case DoctypeBeforeSystemID: {
766  if (c == '\"' || c == '\'') {
767  tquote = c == '\"' ? DoubleQuote : SingleQuote;
768  doctypeToken.state = DoctypeSystemID;
769  doctypeAllowComment = false;
770  } else if (c == '>') {
771  // Considered bogus. Don't process the doctype.
772  doctype = false;
773  } else if (isWhitespace) {
774  // nothing
775  } else {
776  doctypeToken.state = DoctypeBogus;
777  }
778  break;
779  }
780  case DoctypeSystemID: {
781  if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
782  doctypeToken.state = DoctypeAfterSystemID;
783  doctypeAllowComment = true;
784  } else if (c == '>') {
785  // Considered bogus. Don't process the doctype.
786  doctype = false;
787  } else {
788  doctypeToken.systemID.append(c);
789  }
790  break;
791  }
792  case DoctypeAfterSystemID: {
793  if (c == '>') {
794  // Valid doctype. Emit it now.
795  doctype = false;
796  processDoctypeToken();
797  } else if (isWhitespace) {
798  // nothing
799  } else if (c == '[') {
800  // Found internal subset
801  doctypeToken.state = DoctypeInternalSubset;
802  doctypeAllowComment = false;
803  } else {
804  doctypeToken.state = DoctypeBogus;
805  }
806  break;
807  }
808  case DoctypeInternalSubset: {
809  if (c == ']') {
810  // Done
811  doctypeToken.state = DoctypeAfterInternalSubset;
812  doctypeAllowComment = true;
813  } else {
814  doctypeToken.internalSubset.append(c);
815  }
816  break;
817  }
818  case DoctypeAfterInternalSubset: {
819  if (c == '>') {
820  // Valid doctype. Emit it now.
821  doctype = false;
822  processDoctypeToken();
823  } else if (isWhitespace) {
824  // nothing
825  } else {
826  doctypeToken.state = DoctypeBogus;
827  }
828  break;
829  }
830  case DoctypeBogus: {
831  if (c == '>') {
832  // Done with the bogus doctype.
833  doctype = false;
834  } else {
835  // Just keep scanning for '>'
836  }
837  break;
838  }
839  default:
840  break;
841  }
842  if (!dontAdvance) {
843  ++src;
844  } else if (dontAdvance == 1) {
845  continue;
846  } else { // double dontAdvance++, do workaround
847  doctypeComment = DoctypeCommentBogus;
848  }
849  }
850 }
851 
852 void HTMLTokenizer::parseServer(TokenizerString &src)
853 {
854  checkRawContentBuffer(src.length());
855  while (!src.isEmpty()) {
856  rawContent[ rawContentSize++ ] = *src;
857  if (src->unicode() == '>' &&
858  rawContentSize > 1 && rawContent[rawContentSize - 2] == '%') {
859  ++src;
860  server = false;
861  rawContentSize = 0;
862  return; // Finished parsing server include
863  }
864  ++src;
865  }
866 }
867 
868 void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
869 {
870  char oldchar = 0;
871  while (!src.isEmpty()) {
872  unsigned char chbegin = src->toLatin1();
873  if (chbegin == '\'') {
874  tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
875  } else if (chbegin == '\"') {
876  tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
877  }
878  // Look for '?>'
879  // some crappy sites omit the "?" before it, so
880  // we look for an unquoted '>' instead. (IE compatible)
881  else if (chbegin == '>' && (!tquote || oldchar == '?')) {
882  // We got a '?>' sequence
883  processingInstruction = false;
884  ++src;
885  discard = LFDiscard;
886  return; // Finished parsing comment!
887  }
888  ++src;
889  oldchar = chbegin;
890  }
891 }
892 
893 void HTMLTokenizer::parseText(TokenizerString &src)
894 {
895  while (!src.isEmpty()) {
896  // do we need to enlarge the buffer?
897  checkBuffer();
898 
899  // ascii is okay because we only do ascii comparisons
900  unsigned char chbegin = src->toLatin1();
901 
902  if (skipLF && (chbegin != '\n')) {
903  skipLF = false;
904  }
905 
906  if (skipLF) {
907  skipLF = false;
908  ++src;
909  } else if ((chbegin == '\n') || (chbegin == '\r')) {
910  if (chbegin == '\r') {
911  skipLF = true;
912  }
913 
914  *dest++ = '\n';
915  ++src;
916  } else {
917  *dest++ = *src;
918  ++src;
919  }
920  }
921 }
922 
923 void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
924 {
925  if (start) {
926  cBufferPos = 0;
927  entityLen = 0;
928  Entity = SearchEntity;
929  }
930 
931  while (!src.isEmpty()) {
932  ushort cc = src->unicode();
933  switch (Entity) {
934  case NoEntity:
935  return;
936 
937  break;
938  case SearchEntity:
939  if (cc == '#') {
940  cBuffer[cBufferPos++] = cc;
941  ++src;
942  Entity = NumericSearch;
943  } else {
944  Entity = EntityName;
945  }
946 
947  break;
948 
949  case NumericSearch:
950  if (cc == 'x' || cc == 'X') {
951  cBuffer[cBufferPos++] = cc;
952  ++src;
953  Entity = Hexadecimal;
954  } else if (cc >= '0' && cc <= '9') {
955  Entity = Decimal;
956  } else {
957  Entity = SearchSemicolon;
958  }
959 
960  break;
961 
962  case Hexadecimal: {
963  int uc = EntityChar.unicode();
964  int ll = qMin<uint>(src.length(), 8);
965  while (ll--) {
966  QChar csrc(src->toLower());
967  cc = csrc.cell();
968 
969  if (csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
970  break;
971  }
972  uc = uc * 16 + (cc - (cc < 'a' ? '0' : 'a' - 10));
973  cBuffer[cBufferPos++] = cc;
974  ++src;
975  }
976  EntityChar = QChar(uc);
977  Entity = SearchSemicolon;
978  break;
979  }
980  case Decimal: {
981  int uc = EntityChar.unicode();
982  int ll = qMin(src.length(), 9 - cBufferPos);
983  while (ll--) {
984  cc = src->cell();
985 
986  if (src->row() || !(cc >= '0' && cc <= '9')) {
987  Entity = SearchSemicolon;
988  break;
989  }
990 
991  uc = uc * 10 + (cc - '0');
992  cBuffer[cBufferPos++] = cc;
993  ++src;
994  }
995  EntityChar = QChar(uc);
996  if (cBufferPos == 9) {
997  Entity = SearchSemicolon;
998  }
999  break;
1000  }
1001  case EntityName: {
1002  int ll = qMin(src.length(), 9 - cBufferPos);
1003  while (ll--) {
1004  QChar csrc = *src;
1005  cc = csrc.cell();
1006 
1007  if (csrc.row() || !((cc >= 'a' && cc <= 'z') ||
1008  (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
1009  Entity = SearchSemicolon;
1010  break;
1011  }
1012 
1013  cBuffer[cBufferPos++] = cc;
1014  ++src;
1015 
1016  // be IE compatible and interpret even unterminated entities
1017  // outside tags. like "foo &nbspstuff bla".
1018  if (tag == NoTag) {
1019  int code;
1020  const bool found = kde_findEntity(cBuffer, cBufferPos, &code);
1021  if (found && code < 256) {
1022  EntityChar = code;
1023  entityLen = cBufferPos;
1024  }
1025  }
1026  }
1027  if (cBufferPos == 9) {
1028  Entity = SearchSemicolon;
1029  }
1030  if (Entity == SearchSemicolon) {
1031  if (cBufferPos > 1) {
1032  int code;
1033  const bool found = kde_findEntity(cBuffer, cBufferPos, &code);
1034  // IE only accepts unterminated entities < 256,
1035  // Gecko accepts them all, but only outside tags
1036  if (found && (tag == NoTag || code < 256 || *src == ';')) {
1037  EntityChar = code;
1038  entityLen = cBufferPos;
1039  }
1040  }
1041  }
1042  break;
1043  }
1044  case SearchSemicolon:
1045 #ifdef TOKEN_DEBUG
1046  qCDebug(KHTML_LOG) << "ENTITY " << EntityChar.unicode();
1047 #endif
1048  fixUpChar(EntityChar);
1049 
1050  if (*src == ';') {
1051  ++src;
1052  }
1053 
1054  if (!EntityChar.isNull()) {
1055  checkBuffer();
1056  if (entityLen > 0 && entityLen < cBufferPos) {
1057  int rem = cBufferPos - entityLen;
1058  src.prepend(TokenizerString(QString::fromLatin1(cBuffer + entityLen, rem)));
1059  }
1060  src.push(EntityChar);
1061  rawContentSinceLastEntity = -1;
1062  } else {
1063 #ifdef TOKEN_DEBUG
1064  qCDebug(KHTML_LOG) << "unknown entity!";
1065 #endif
1066  checkBuffer(11);
1067  // ignore the sequence, add it to the buffer as plaintext
1068  *dest++ = '&';
1069  for (unsigned int i = 0; i < cBufferPos; i++) {
1070  dest[i] = cBuffer[i];
1071  }
1072  dest += cBufferPos;
1073  rawContentSinceLastEntity += cBufferPos + 1;
1074  if (pre) {
1075  prePos += cBufferPos + 1;
1076  }
1077  }
1078 
1079  Entity = NoEntity;
1080  EntityChar = QChar::Null;
1081  return;
1082  };
1083  }
1084 }
1085 
// Tag-parsing state machine.
//
// Consumes characters from src and advances the member `tag` through the
// states handled below (TagName, SearchAttribute, AttributeName, SearchEqual,
// SearchValue, QuotedValue, Value, SearchEnd). All progress is kept in
// members (tag, tquote, cBuffer/cBufferPos, dest, attrName, currToken, ...),
// and the outer loop simply ends when src is empty, so a later call picks up
// in whatever state this call stopped in.
//
// While in TagName the function also watches for the '<!--' sequence
// (commentStart/searchCount) and for the sequence stored in doctypeStart
// (doctypeSearchCount) and hands over to parseComment() / parseDoctype().
// In SearchEnd the finished token is passed to processToken(); for SCRIPT,
// STYLE, TEXTAREA, TITLE and XMP start tags parsing then continues in
// parseRawContent() with the matching end-tag stopper.
//
// Must not be entered while an entity is being parsed (asserted below).
1086 void HTMLTokenizer::parseTag(TokenizerString &src)
1087 {
1088  assert(!Entity);
1089  checkRawContentBuffer(src.length());
1090
1091  while (!src.isEmpty()) {
1092  checkBuffer();
1093 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1094  uint l = 0;
1095  while (l < src.length() && (src.toString()[l]).toLatin1() != '>') {
1096  l++;
1097  }
1098  qDebug("src is now: *%s*, tquote: %d", src.toString().left(l).toLatin1().constData(), tquote);
1099 #endif
1100  switch (tag) {
1101  case NoTag:
1102  return;
// TagName: collect the tag name into cBuffer. Before that, keep matching
// the input against commentStart and doctypeStart for as long as the
// respective search counters are non-zero.
1103  case TagName: {
1104 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1105  qDebug("TagName");
1106 #endif
1107  if (searchCount > 0) {
1108  if (*src == commentStart[searchCount]) {
1109  searchCount++;
1110  if (searchCount == 2) {
1111  doctypeSearchCount++; // A '!' is also part of doctype, so we are moving through that still as well
1112  } else {
1113  doctypeSearchCount = 0;
1114  }
1115
1116  if (searchCount == 4) {
1117 #ifdef TOKEN_DEBUG
1118  qCDebug(KHTML_LOG) << "Found comment";
1119 #endif
1120  // Found '<!--' sequence
1121  ++src;
1122  dest = buffer; // ignore the previous part of this tag
1123  tag = NoTag;
1124
1125  comment = true;
1126  parseComment(src);
1127  return; // Finished parsing tag!
1128  }
1129  // cuts off the high part (only the low byte is stored), which is okay
1130  cBuffer[cBufferPos++] = src->cell();
1131  ++src;
1132  break;
1133  } else {
1134  searchCount = 0; // Stop looking for '<!--' sequence
1135  }
1136  }
1137
1138  if (doctypeSearchCount > 0) {
1139  if ((*src).toLower() == doctypeStart[doctypeSearchCount]) {
1140  doctypeSearchCount++;
1141  cBuffer[cBufferPos++] = src->cell();
1142  ++src;
1143  if (doctypeSearchCount == 9) {
1144  // Found '<!DOCTYPE' sequence
1145  tag = NoTag;
1146  doctypeAllowComment = true;
1147  doctypeComment = NoDoctypeComment;
1148  doctypeToken.reset();
1149  doctype = true;
1150
1151  parseDoctype(src);
1152  return;
1153  }
1154  break;
1155  } else {
1156  doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1157  }
1158  }
1159
// Copy name characters until whitespace/control (<= ' ') or '>' is seen,
// never writing more than CBUFLEN characters into cBuffer.
1160  bool finish = false;
1161  unsigned int ll = qMin(src.length(), CBUFLEN - cBufferPos);
1162  while (ll--) {
1163  ushort curchar = src->unicode();
1164  if (curchar <= ' ' || curchar == '>') {
1165  finish = true;
1166  break;
1167  }
1168  // this is a nasty performance trick. will work for the A-Z
1169  // characters, but not for others. if it contains one,
1170  // we fail anyway
1171  char cc = curchar;
1172  cBuffer[cBufferPos++] = cc | 0x20;
1173  ++src;
1174  }
1175
1176  // Disadvantage: we add the possible rest of the tag
1177  // as attribute names. ### judge if this causes problems
1178  if (finish || CBUFLEN == cBufferPos) {
1179  bool beginTag;
1180  char *ptr = cBuffer;
1181  unsigned int len = cBufferPos;
1182  cBuffer[cBufferPos] = '\0';
1183  if ((cBufferPos > 0) && (*ptr == '/')) {
1184  // End Tag
1185  beginTag = false;
1186  ptr++;
1187  len--;
1188  } else
1189  // Start Tag
1190  {
1191  beginTag = true;
1192  }
1193  // Accept empty xml tags like <br/>
1194  if (len > 1 && ptr[len - 1] == '/') {
1195  ptr[--len] = '\0';
1196  // if it is like <br/> and not like <input/ value=foo>, take it as flat
1197  if (*src == '>') {
1198  currToken.flat = true;
1199  }
1200  }
1201
1202  uint tagID = 0;
1203  if (!tagID) {
1204  DOMString tagName(ptr);
1205  if (Element::khtmlValidQualifiedName(tagName)) {
1206  safeLocalName = LocalName::fromString(tagName, IDS_NormalizeLower);
1207  tagID = safeLocalName.id();
1208  }
1209 #ifdef TOKEN_DEBUG
1210  QByteArray tmp(ptr, len + 1);
1211  qCDebug(KHTML_LOG) << "Unknown tag: \"" << tmp.data() << "\"";
1212 #endif
1213  }
1214  if (tagID) {
1215 #ifdef TOKEN_DEBUG
1216  QByteArray tmp(ptr, len + 1);
1217  qCDebug(KHTML_LOG) << "found tag id=" << tagID << ": " << tmp.data();
1218 #endif
// End tags are encoded as the element id offset by ID_CLOSE_TAG.
1219  currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
1220  }
1221  dest = buffer;
1222  tag = SearchAttribute;
1223  cBufferPos = 0;
1224  }
1225  break;
1226  }
// SearchAttribute: skip characters <= ' ' and pick the next state from the
// first other character: '<' or '>' ends the tag, a quote that follows
// whitespace starts a value with an empty attribute name, anything else
// starts an attribute name.
1227  case SearchAttribute: {
1228 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1229  qDebug("SearchAttribute");
1230 #endif
1231  bool atespace = false;
1232  ushort curchar;
1233  while (!src.isEmpty()) {
1234  curchar = src->unicode();
1235  if (curchar > ' ') {
1236  if (curchar == '<' || curchar == '>') {
1237  tag = SearchEnd;
1238  } else if (atespace && (curchar == '\'' || curchar == '"')) {
1239  tag = SearchValue;
1240  *dest++ = 0;
1241  attrName = DOMString("");
1242  } else {
1243  tag = AttributeName;
1244  }
1245
1246  cBufferPos = 0;
1247  break;
1248  }
1249  atespace = true;
1250  ++src;
1251  }
1252  break;
1253  }
// AttributeName: collect the name into cBuffer (A-Z lower-cased) until a
// character <= ' ', '=' or '>' is reached, then resolve it to an id. Ids
// above ATTR_LAST_ATTR are treated as unknown (0); for unknown names the
// text is kept in attrName. The id is stored in the first slot of buffer.
1254  case AttributeName: {
1255 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1256  qDebug("AttributeName");
1257 #endif
1258  ushort curchar;
1259  int ll = qMin(src.length(), CBUFLEN - cBufferPos);
1260
1261  while (ll--) {
1262  curchar = src->unicode();
1263  if (curchar <= '>') {
1264  if (curchar <= ' ' || curchar == '=' || curchar == '>') {
1265  unsigned int a;
1266  cBuffer[cBufferPos] = '\0';
1267  a = LocalName::fromString(DOMString(cBuffer), IDS_NormalizeLower).id(); // ### still deep copy?
1268  if (a > ATTR_LAST_ATTR) {
1269  a = 0;
1270  }
1271
1272  if (!a) {
1273  // did we just get /> or e.g checked/>
1274  if (curchar == '>' && cBufferPos >= 1 && cBuffer[cBufferPos - 1] == '/') {
1275  currToken.flat = true;
// Temporarily cut the trailing '/' to look the name up again; it is put
// back a few lines below.
1276  cBuffer[cBufferPos - 1] = '\0';
1277  if (cBufferPos > 1) {
1278  a = LocalName::fromString(DOMString(cBuffer), IDS_NormalizeLower).id();
1279  }
1280  if (a > ATTR_LAST_ATTR) {
1281  a = 0;
1282  }
1283  cBuffer[cBufferPos - 1] = '/';
1284  }
1285  if (!a) {
1286  attrName = DOMString(cBuffer, cBufferPos);
1287  }
1288  }
1289
1290  dest = buffer;
1291  *dest++ = a;
1292 #ifdef TOKEN_DEBUG
1293  if (!a || (cBufferPos && *cBuffer == '!')) {
1294  qCDebug(KHTML_LOG) << "Unknown attribute: *" << QByteArray(cBuffer, cBufferPos + 1).data() << "*";
1295  } else {
1296  qCDebug(KHTML_LOG) << "Known attribute: " << QByteArray(cBuffer, cBufferPos + 1).data();
1297  }
1298 #endif
1299
1300  tag = SearchEqual;
1301  break;
1302  }
1303  }
1304  cBuffer[cBufferPos++] =
1305  (curchar >= 'A' && curchar <= 'Z') ? curchar | 0x20 : curchar;
1306  ++src;
1307  }
// cBuffer is full: stop here and treat what was collected as an unknown
// attribute name (id 0).
1308  if (cBufferPos == CBUFLEN) {
1309  cBuffer[cBufferPos] = '\0';
1310  attrName = DOMString(cBuffer, cBufferPos);
1311  dest = buffer;
1312  *dest++ = 0;
1313  tag = SearchEqual;
1314  }
1315  break;
1316  }
// SearchEqual: after an attribute name, skip characters <= ' '. '=' leads
// to the value; a quote after whitespace starts a value with an empty
// name; any other character means the attribute had no value, so it is
// added with an empty string.
1317  case SearchEqual: {
1318 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1319  qDebug("SearchEqual");
1320 #endif
1321  ushort curchar;
1322  bool atespace = false;
1323  while (!src.isEmpty()) {
1324  curchar = src->unicode();
1325  if (curchar > ' ') {
1326  if (curchar == '=') {
1327 #ifdef TOKEN_DEBUG
1328  qCDebug(KHTML_LOG) << "found equal";
1329 #endif
1330  tag = SearchValue;
1331  ++src;
1332  } else if (atespace && (curchar == '\'' || curchar == '"')) {
1333  tag = SearchValue;
1334  *dest++ = 0;
1335  attrName = DOMString("");
1336  } else {
1337  DOMString v("");
1338  currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1339  dest = buffer;
1340  tag = SearchAttribute;
1341  }
1342  break;
1343  }
1344  atespace = true;
1345  ++src;
1346  }
1347  break;
1348  }
// SearchValue: skip characters <= ' '; an opening quote selects
// QuotedValue (and records which quote in tquote), anything else selects
// the unquoted Value state.
1349  case SearchValue: {
1350  ushort curchar;
1351  while (!src.isEmpty()) {
1352  curchar = src->unicode();
1353  if (curchar > ' ') {
1354  if ((curchar == '\'' || curchar == '\"')) {
1355  tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1356  tag = QuotedValue;
1357  ++src;
1358  } else {
1359  tag = Value;
1360  }
1361
1362  break;
1363  }
1364  ++src;
1365  }
1366  break;
1367  }
// QuotedValue: copy characters to dest until the matching closing quote.
// '&' hands over to parseEntity(). The value handed to addAttribute starts
// at buffer + 1 because buffer[0] holds the attribute id.
1368  case QuotedValue: {
1369 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1370  qDebug("QuotedValue");
1371 #endif
1372  ushort curchar;
1373  while (!src.isEmpty()) {
1374  checkBuffer();
1375
1376  curchar = src->unicode();
1377  if (curchar <= '\'' && !src.escaped()) {
1378  // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1379  if (curchar == '&') {
1380  ++src;
1381  parseEntity(src, dest, true);
1382  break;
1383  } else if ((tquote == SingleQuote && curchar == '\'') ||
1384  (tquote == DoubleQuote && curchar == '\"')) {
1385  // some <input type=hidden> rely on trailing spaces. argh
1386  while (dest > buffer + 1 && (*(dest - 1) == '\n' || *(dest - 1) == '\r')) {
1387  dest--; // remove trailing newlines
1388  }
1389  DOMString v(buffer + 1, dest - buffer - 1);
1390  currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1391
1392  dest = buffer;
1393  tag = SearchAttribute;
1394  tquote = NoQuote;
1395  ++src;
1396  break;
1397  }
1398  }
1399  *dest++ = *src;
1400  ++src;
1401  }
1402  break;
1403  }
// Value: unquoted attribute value; copy characters to dest until one of
// the delimiters tested below is reached.
1404  case Value: {
1405 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1406  qDebug("Value");
1407 #endif
1408  ushort curchar;
1409  while (!src.isEmpty()) {
1410  checkBuffer();
1411  curchar = src->unicode();
1412  if (curchar <= '>' && !src.escaped()) {
1413  // parse Entities
1414  if (curchar == '&') {
1415  ++src;
1416  parseEntity(src, dest, true);
1417  break;
1418  }
1419  // no quotes. Every space means end of value
1420  // '/' does not delimit in IE!
1421  // HTML5: must not contain any literal space characters, any U+0022 QUOTATION MARK (") characters,
1422  // U+0027 APOSTROPHE (') characters, U+003D EQUALS SIGN (=) characters, U+003C LESS-THAN SIGN (<) characters,
1423  // U+003E GREATER-THAN SIGN (>) characters, or U+0060 GRAVE ACCENT (`) characters, and must not be the empty string.
1424  // Real life: images.google.com uses URLs including form arguments (foo=bar)
1425  // in unquoted parameters --- with an html5 <!doctype html> DTD.
1426  // Real life takes priority, so we accept at least =
1427  if (curchar <= ' ' || curchar == '>' || curchar == '\'' || curchar == '"' || curchar == '<' || /*curchar == '=' ||*/ curchar == '`') {
1428  DOMString v(buffer + 1, dest - buffer - 1);
1429  currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1430  dest = buffer;
1431  tag = SearchAttribute;
1432  break;
1433  }
1434  }
1435
1436  *dest++ = *src;
1437  ++src;
1438  }
1439  break;
1440  }
// SearchEnd: skip to the closing '>' (or a '<' that starts the next tag),
// then emit the token and set up any follow-on mode for the element.
1441  case SearchEnd: {
1442 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1443  qDebug("SearchEnd");
1444 #endif
1445  while (!src.isEmpty()) {
1446  if (*src == '<' || *src == '>') {
1447  break;
1448  }
1449
1450  if (*src == '/') {
1451  currToken.flat = true;
1452  }
1453
1454  ++src;
1455  }
// NOTE(review): the condition below evaluates *src although src.isEmpty()
// is true -- confirm that dereferencing an empty TokenizerString is safe.
1456  if (src.isEmpty() && *src != '<' && *src != '>') {
1457  break;
1458  }
1459
1460  searchCount = 0; // Stop looking for '<!--' sequence
1461  tag = NoTag;
1462  tquote = NoQuote;
1463  if (*src == '>') {
1464  ++src;
1465  }
1466
1467  if (!currToken.tid) { //stop if tag is unknown
1468  return;
1469  }
1470
1471  uint tagID = currToken.tid;
1472 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1473  qCDebug(KHTML_LOG) << "appending Tag: " << tagID;
1474 #endif
1475  // When parsing HTML flat tags like <div /> should
1476  // be ignored, the only exception is SCRIPT, and
1477  // tags with forbidden end-tags
1478  if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
1479  DOM::endTagRequirement(tagID) != DOM::FORBIDDEN &&
1480  parser->doc()->htmlMode() != DocumentImpl::XHtml) {
1481  currToken.flat = false;
1482  }
1483
1484  bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
1485  HTMLScriptElementImpl *prevScriptElem = nullptr;
1486
1487  if (tagID >= ID_CLOSE_TAG) {
1488  tagID -= ID_CLOSE_TAG;
1489  } else if (tagID == ID_SCRIPT) {
// Remember the script element that was current before this token is
// processed, and capture the src/charset attributes of the new <script>.
1490  prevScriptElem = parser->currentScriptElement();
1491  DOMStringImpl *a = nullptr;
1492  scriptSrc.clear(); scriptSrcCharset.clear();
1493  if (currToken.attrs && /* potentially have a ATTR_SRC ? */
1494  view && /* are we a regular tokenizer or just for innerHTML ? */
1495  parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
1496  ) {
1497  if ((a = currToken.attrs->getValue(ATTR_SRC))) {
1498  scriptSrc = DOMString(a).trimSpaces().string();
1499  }
1500  if ((a = currToken.attrs->getValue(ATTR_CHARSET))) {
1501  scriptSrcCharset = DOMString(a).string().trimmed();
1502  }
1503  if (scriptSrcCharset.isEmpty() && view) {
1504  scriptSrcCharset = parser->doc()->view()->part()->encoding();
1505  }
1506  }
1507  javascript = true;
1508  }
1509
1510  processToken();
1511
// Keep `javascript` set only if processing the token produced a new
// current script element and that element reports isValidScript().
1512  if (javascript) {
1513  HTMLScriptElementImpl *sc = parser->currentScriptElement();
1514  javascript = (sc && sc != prevScriptElem) ? sc->isValidScript() : false;
1515  }
1516
1517  if (parser->selectMode() && beginTag) {
1518  discard = AllDiscard;
1519  }
1520
// Per-element follow-up. searchStopperLen is the length of the end-tag
// prefix assigned to searchStopper.
1521  switch (tagID) {
1522  case ID_LISTING:
1523  case ID_PRE:
1524  pre = beginTag;
1525  if (beginTag) {
1526  discard = LFDiscard;
1527  }
1528  prePos = 0;
1529  break;
1530  case ID_BR:
1531  prePos = 0;
1532  break;
1533  case ID_SCRIPT:
1534  if (beginTag) {
1535  searchStopper = scriptEnd;
1536  searchStopperLen = 8;
1537  script = true;
1538  parseRawContent(src);
1539  } else if (tagID < ID_CLOSE_TAG) { // Handle <script src="foo"/>
1540  script = true;
1541  scriptHandler();
1542  }
1543  break;
1544  case ID_STYLE:
1545  if (beginTag) {
1546  searchStopper = styleEnd;
1547  searchStopperLen = 7;
1548  style = true;
1549  parseRawContent(src);
1550  }
1551  break;
1552  case ID_TEXTAREA:
1553  if (beginTag) {
1554  searchStopper = textareaEnd;
1555  searchStopperLen = 10;
1556  textarea = true;
1557  discard = NoneDiscard;
1558  rawContentSinceLastEntity = 0;
1559  parseRawContent(src);
1560  }
1561  break;
1562  case ID_TITLE:
1563  if (beginTag) {
1564  searchStopper = titleEnd;
1565  searchStopperLen = 7;
1566  title = true;
1567  rawContentSinceLastEntity = 0;
1568  parseRawContent(src);
1569  }
1570  break;
1571  case ID_XMP:
1572  if (beginTag) {
1573  searchStopper = xmpEnd;
1574  searchStopperLen = 5;
1575  xmp = true;
1576  parseRawContent(src);
1577  }
1578  break;
1579  case ID_SELECT:
1580  select = beginTag;
1581  break;
1582  case ID_PLAINTEXT:
1583  plaintext = beginTag;
1584  break;
1585  }
1586  return; // Finished parsing tag!
1587  }
1588  } // end switch
1589  }
1590  return;
1591 }
1592 
1593 void HTMLTokenizer::addPending()
1594 {
1595  if (select && !(comment || script)) {
1596  *dest++ = ' ';
1597  } else {
1598  switch (pending) {
1599  case LFPending: *dest++ = QLatin1Char('\n'); prePos = 0; break;
1600  case SpacePending: *dest++ = QLatin1Char(' '); ++prePos; break;
1601  case TabPending: {
1602  // Don't expand tabs inside <textarea> or script
1603  int p = TAB_SIZE - (prePos % TAB_SIZE);
1604  if (textarea || script) {
1605  *dest++ = QLatin1Char('\t');
1606  } else {
1607  for (int x = 0; x < p; x++) {
1608  *dest++ = QLatin1Char(' ');
1609  }
1610  }
1611  prePos += p;
1612  break;
1613  }
1614  case NonePending:
1615  assert(0);
1616  }
1617  }
1618 
1619  pending = NonePending;
1620 }
1621 
1622 inline bool HTMLTokenizer::continueProcessing(int &processedCount)
1623 {
1624  // We don't want to be checking elapsed time with every character, so we only check after we've
1625  // processed a certain number of characters. We also do not do suspension if we're
1626  // parsing something like innerHTML.
1627  if (!m_executingScript && processedCount > sTokenizerChunkSize && cachedScript.isEmpty()) {
1628  processedCount = 0;
1629  if (m_time.elapsed() > m_tokenizerYieldDelay && m_documentTokenizer) {
1630  m_yieldTimer = startTimer(0);
1631  m_tokenizerYieldDelay = sTokenizerFastYieldDelay;
1632  return false;
1633  }
1634  }
1635  processedCount++;
1636  return true;
1637 }
1638 
1639 #include "khtmlpart_p.h"
// Feeds `str` into the tokenizer.
//
// Nothing happens if the output buffer has already been released. While a
// script is executing (and appendData is set) or external scripts are still
// outstanding (cachedScript is not empty) the data is only queued in
// pendingQueue. Otherwise it is appended to src (or becomes src) and the
// dispatch loop below runs until src is empty, m_abort is set or
// continueProcessing() asks for a yield.
//
// In the loop, the first member flag that is set (Entity, plaintext, script,
// style, ..., tag, startTag) selects the specialised parse function;
// plain text characters and whitespace are handled inline.
//
// appendData selects which end of pendingQueue queued data is appended to
// (bottom when true, top when false) and whether the prospective tokenizer
// is fed/ended. NOTE(review): presumably true means data from the loader and
// false means script-generated data -- confirm against callers.
1640 void HTMLTokenizer::write(const TokenizerString &str, bool appendData)
1641 {
1642 #ifdef TOKEN_DEBUG
1643  qCDebug(KHTML_LOG) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")";
1644 #endif
1645  if (!buffer) {
1646  return;
1647  }
1648
1649  if ((m_executingScript && appendData) || cachedScript.count()) {
1650  // don't parse; we will do this later
1651  if (pendingQueue.isEmpty()) {
1652  pendingQueue.push(str);
1653  } else if (appendData) {
1654  pendingQueue.bottom().append(str);
1655  } else {
1656  pendingQueue.top().append(str);
1657  }
1658 #if PROSPECTIVE_TOKENIZER_ENABLED
1659  if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData) {
1660  m_prospectiveTokenizer->write(str);
1661  }
1662 #endif
1663  return;
1664  }
1665
1666 #if PROSPECTIVE_TOKENIZER_ENABLED
1667  if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData) {
1668  m_prospectiveTokenizer->end();
1669  }
1670 #endif
1671
// While on hold, only accumulate the input; no tokenizing happens.
1672  if (onHold) {
1673  src.append(str);
1674  return;
1675  }
1676
1677  if (!src.isEmpty()) {
1678  src.append(str);
1679  } else {
1680  setSrc(str);
1681  }
1682
1683  // Once a timer is set, it has control of when the tokenizer continues.
1684  if (m_yieldTimer > 0) {
1685  return;
1686  }
1687
1688  int processedCount = 0;
1689  m_time.start();
1690
1691  while (!src.isEmpty()) {
1692  if (m_abort || !continueProcessing(processedCount)) {
1693  break;
1694  }
1695  // do we need to enlarge the buffer?
1696  checkBuffer();
1697
1698  ushort cc = src->unicode();
1699
// skipLF is set after a '\r' (see below); it only stays set if the very
// next character is '\n', in which case that '\n' is dropped.
1700  if (skipLF && (cc != '\n')) {
1701  skipLF = false;
1702  }
1703
1704  if (skipLF) {
1705  skipLF = false;
1706  ++src;
1707  } else if (Entity) {
1708  parseEntity(src, dest);
1709  } else if (plaintext) {
1710  parseText(src);
1711  } else if (script) {
1712  parseRawContent(src);
1713  } else if (style) {
1714  parseRawContent(src);
1715  } else if (xmp) {
1716  parseRawContent(src);
1717  } else if (textarea) {
1718  parseRawContent(src);
1719  } else if (title) {
1720  parseRawContent(src);
1721  } else if (comment) {
1722  parseComment(src);
1723  } else if (doctypeComment && doctypeComment != DoctypeCommentEnd && doctypeComment != DoctypeCommentBogus) {
1724  parseDoctypeComment(src);
1725  } else if (doctype) {
1726  parseDoctype(src);
1727  } else if (server) {
1728  parseServer(src);
1729  } else if (processingInstruction) {
1730  parseProcessingInstruction(src);
1731  } else if (tag) {
1732  parseTag(src);
1733  } else if (startTag) {
// The previous character was '<' (startTag is set in the '<' branch
// further down); cc is the first character after it.
1734  startTag = false;
1735
1736  switch (cc) {
1737  case '/':
1738  break;
1739  case '!': {
1740  // <!-- comment --> or <!DOCTYPE ...>
1741  searchCount = 1; // Look for '<!--' sequence to start comment...
1742  doctypeSearchCount = 1; // ... or for '<!DOCTYPE' sequence to start doctype
1743  break;
1744  }
1745  case '?': {
1746  // xml processing instruction
1747  processingInstruction = true;
1748  tquote = NoQuote;
1749  parseProcessingInstruction(src);
1750  continue;
1751  }
1752  case '%':
1753  if (!brokenServer) {
1754  // <% server stuff, handle as comment %>
1755  server = true;
1756  tquote = NoQuote;
1757  parseServer(src);
1758  continue;
1759  }
1760  // else fall through
1761  default: {
1762  if (((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1763  // Start of a Start-Tag
1764  } else {
1765  // Invalid tag
1766  // Add as is
1767  if (pending) {
1768  addPending();
1769  }
1770  *dest = '<';
1771  dest++;
1772  continue;
1773  }
1774  }
1775  }; // end case
1776
1777  // According to SGML any LF immediately after a starttag, or
1778  // immediately before an endtag should be ignored.
1779  // ### Gecko and MSIE though only ignores LF immediately after
1780  // starttags and only for PRE elements -- asj (28/06-2005)
1781  if (pending) {
1782  if (!select) {
1783  addPending();
1784  } else {
1785  pending = NonePending;
1786  }
1787  }
1788
1789  // Cancel unused discards
1790  discard = NoneDiscard;
1791  // if (!endTag) discard = LFDiscard;
1792
// Flush the text collected so far as its own token before the tag starts.
1793  processToken();
1794
1795  cBufferPos = 0;
1796  tag = TagName;
1797  parseTag(src);
1798  } else if (cc == '&' && !src.escaped()) {
1799  ++src;
1800  if (pending) {
1801  addPending();
1802  }
1803  discard = NoneDiscard;
1804  parseEntity(src, dest, true);
1805  } else if (cc == '<' && !src.escaped()) {
1806  tagStartLineno = lineno + src.lineCount();
1807  ++src;
1808  discard = NoneDiscard;
1809  startTag = true;
1810  } else if ((cc == '\n') || (cc == '\r')) {
1811  if (discard == SpaceDiscard) {
1812  discard = NoneDiscard;
1813  }
1814
1815  if (discard == LFDiscard) {
1816  // Ignore one LF
1817  discard = NoneDiscard;
1818  } else if (discard == AllDiscard) {
1819  // Ignore
1820  } else {
1821  if (select && !script) {
1822  pending = LFPending;
1823  } else {
1824  if (pending) {
1825  addPending();
1826  }
1827  pending = LFPending;
1828  }
1829  }
1830
1831  /* Check for MS-DOS CRLF sequence */
1832  if (cc == '\r') {
1833  skipLF = true;
1834  }
1835  ++src;
1836  } else if ((cc == ' ') || (cc == '\t')) {
1837  if (discard == LFDiscard) {
1838  discard = NoneDiscard;
1839  }
1840
1841  if (discard == SpaceDiscard) {
1842  // Ignore one space
1843  discard = NoneDiscard;
1844  } else if (discard == AllDiscard) {
1845  // Ignore
1846  } else {
1847  if (select && !script) {
1848  if (!pending) {
1849  pending = SpacePending;
1850  }
1851  } else {
1852  if (pending) {
1853  addPending();
1854  }
1855  if (cc == ' ') {
1856  pending = SpacePending;
1857  } else {
1858  pending = TabPending;
1859  }
1860  }
1861  }
1862
1863  ++src;
1864  } else {
// Ordinary text character: flush pending whitespace, then copy it.
1865  if (pending) {
1866  addPending();
1867  }
1868
1869  discard = NoneDiscard;
1870  if (pre) {
1871  prePos++;
1872  }
1873  *dest = *src;
1874  fixUpChar(*dest);
1875  ++dest;
1876  ++src;
1877  }
1878  }
1879
1880  if (noMoreData && cachedScript.isEmpty() && !m_executingScript && m_yieldTimer <= 0) {
1881  end(); // this actually causes us to be deleted
1882  }
1883 }
1884 
1885 void HTMLTokenizer::timerEvent(QTimerEvent *e)
1886 {
1887  if (e->timerId() == m_yieldTimer) {
1888  killTimer(m_yieldTimer);
1889  m_yieldTimer = 0;
1890  write(TokenizerString(), true);
1891  } else if (e->timerId() == m_externalScriptsTimerId) {
1892  if (view && view->hasLayoutPending()) {
1893  // all stylesheets are loaded but the style modifications
1894  // they triggered have yet to be applied, BBIAB
1895  return;
1896  }
1897  killTimer(m_externalScriptsTimerId);
1898  m_externalScriptsTimerId = 0;
1899  notifyFinished(nullptr);
1900  }
1901 }
1902 
1903 void HTMLTokenizer::end()
1904 {
1905  if (buffer) {
1906  // parseTag is using the buffer for different matters
1907  if (!tag) {
1908  processToken();
1909  }
1910 
1911  if (buffer) {
1912  KHTML_DELETE_QCHAR_VEC(buffer);
1913  }
1914 
1915  if (rawContent) {
1916  KHTML_DELETE_QCHAR_VEC(rawContent);
1917  }
1918 
1919  rawContent = nullptr;
1920  rawContentSize = rawContentMaxSize = rawContentResync = 0;
1921  buffer = nullptr;
1922  }
1923  emit finishedParsing();
1924 }
1925 
1926 void HTMLTokenizer::finish()
1927 {
1928  // The purpose of this iteration is to recover from 'raw content' tokenizing mode.
1929  // In this mode, any error such as the lack of a closing tag (for the considered element) or of a closing comment,
1930  // would result in the entire document being absorbed in one node.
1931  // When it happens, we simply put back in the input buffer what this mode's output has accumulated so far,
1932  // and retokenize after either disabling the 'raw content' mode (by setting the corresponding members to false)
1933  // or after setting a few flags disabling some lax parsing 'features' (brokenComments/brokenServer).
1934  while ((title || comment || server) && rawContent && rawContentSize) {
1935  // we've found an unmatched comment start
1936  if (comment) {
1937  brokenComments = true;
1938  } else if (server) {
1939  brokenServer = true;
1940  }
1941 
1942  checkRawContentBuffer();
1943  rawContent[ rawContentSize ] = 0;
1944  rawContent[ rawContentSize + 1 ] = 0;
1945  int pos;
1946  QString food;
1947  if (title || style || script || textarea) {
1948  rawContentSinceLastEntity = 0;
1949  food.setUnicode(rawContent, rawContentSize);
1950  } else if (server) {
1951  food = "<";
1952  food += QString(rawContent, rawContentSize);
1953  } else {
1954  pos = QString::fromRawData(rawContent, rawContentSize).indexOf('>');
1955  food.setUnicode(rawContent + pos + 1, rawContentSize - pos - 1); // deep copy
1956  }
1957  KHTML_DELETE_QCHAR_VEC(rawContent);
1958  rawContent = nullptr;
1959  rawContentSize = rawContentMaxSize = rawContentResync = 0;
1960 
1961  comment = server = title = false;
1962  if (!food.isEmpty()) {
1963  write(food, true);
1964  }
1965  }
1966  // this indicates we will not receive any more data... but if we are waiting on
1967  // an external script to load, we can't finish parsing until that is done
1968  noMoreData = true;
1969  if (cachedScript.isEmpty() && !m_executingScript && !onHold && m_yieldTimer <= 0) {
1970  end(); // this actually causes us to be deleted
1971  }
1972 }
1973 
1974 void HTMLTokenizer::processToken()
1975 {
1976  KJSProxy *jsProxy = view ? view->part()->jScript() : nullptr;
1977  if (jsProxy) {
1978  jsProxy->setEventHandlerLineno(tagStartLineno);
1979  }
1980  if (dest > buffer) {
1981 #if 0
1982  if (currToken.tid) {
1983  qDebug("unexpected token id: %d, str: *%s*", currToken.tid, QString::fromRawData(buffer, dest - buffer).toLatin1().constData());
1984  assert(0);
1985  }
1986 
1987 #endif
1988  currToken.text = new DOMStringImpl(buffer, dest - buffer);
1989  currToken.text->ref();
1990  if (currToken.tid != ID_COMMENT) {
1991  currToken.tid = ID_TEXT;
1992  }
1993  } else if (!currToken.tid) {
1994  currToken.reset();
1995  if (jsProxy) {
1996  jsProxy->setEventHandlerLineno(lineno + src.lineCount());
1997  }
1998  return;
1999  }
2000 
2001  dest = buffer;
2002 
2003 #ifdef TOKEN_DEBUG
2004  QString text;
2005  bool closing = (currToken.tid > ID_CLOSE_TAG);
2006  int rid = currToken.tid - (closing ? ID_CLOSE_TAG : 0);
2007  if (currToken.text) {
2008  text = QString::fromRawData(currToken.text->s, currToken.text->l);
2009  }
2010  qCDebug(KHTML_LOG) << "Token -->" << LocalName::fromId(localNamePart(rid)).toString()
2011  << "id =" << currToken.tid << "closing =" << closing;
2012  if (currToken.flat) {
2013  qCDebug(KHTML_LOG) << "Token is FLAT!";
2014  }
2015  if (!text.isNull()) {
2016  qCDebug(KHTML_LOG) << "text: \"" << text << "\"";
2017  }
2018  unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
2019 
2020  if (l) {
2021  qCDebug(KHTML_LOG) << "Attributes: " << l;
2022  for (unsigned long i = 0; i < l; ++i) {
2023  NodeImpl::Id tid = currToken.attrs->idAt(i);
2024  DOMString value = currToken.attrs->valueAt(i);
2025  qCDebug(KHTML_LOG) << " " << tid << " " << LocalName::fromId(localNamePart(tid)).toString()
2026  << "=\"" << value.string() << "\"";
2027  }
2028  }
2029 #endif
2030 
2031  // In some cases, parseToken() can cause javascript code to be executed
2032  // (for example, when setting an attribute that causes an event handler
2033  // to be created). So we need to protect against re-entrancy into the parser
2034  m_executingScript++;
2035 
2036  // pass the token over to the parser, the parser DOES NOT delete the token
2037  parser->parseToken(&currToken);
2038 
2039  m_executingScript--;
2040 
2041  if (currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces()) {
2042  discard = NoneDiscard;
2043  }
2044 
2045  currToken.reset();
2046  if (jsProxy) {
2047  jsProxy->setEventHandlerLineno(0);
2048  }
2049 }
2050 
2051 void HTMLTokenizer::processDoctypeToken()
2052 {
2053  // qCDebug(KHTML_LOG) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
2054  doctypeToken.publicID = doctypeToken.publicID.simplified();
2055  doctypeToken.systemID = doctypeToken.systemID.simplified();
2056  parser->parseDoctypeToken(&doctypeToken);
2057 }
2058 
2059 HTMLTokenizer::~HTMLTokenizer()
2060 {
2061  reset();
2062  delete m_prospectiveTokenizer;
2063  delete parser;
2064 }
2065 
2066 void HTMLTokenizer::enlargeBuffer(int len)
2067 {
2068  int newsize = qMax(size * 2, size + len);
2069  int oldoffs = (dest - buffer);
2070 
2071  buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize);
2072  dest = buffer + oldoffs;
2073  size = newsize;
2074 }
2075 
2076 void HTMLTokenizer::enlargeRawContentBuffer(int len)
2077 {
2078  int newsize = qMax(rawContentMaxSize * 2, rawContentMaxSize + len);
2079  rawContent = KHTML_REALLOC_QCHAR_VEC(rawContent, newsize);
2080  rawContentMaxSize = newsize;
2081 }
2082 
// Runs the external scripts queued in cachedScript, in queue order, for as
// long as the script at the head of the queue is loaded.
//
// Nothing is executed while the document still has stylesheets loading; in
// that case m_hasScriptsWaitingForStylesheets is set and the function
// returns. Execution also pauses when continueProcessingScripts() returns
// false. After the last script (or when a script made the tokenizer wait for
// stylesheets again) the queued input in pendingQueue is written back,
// unless `script` is set, in which case scriptHandler() does that.
//
// finishedObj is unused. Requires cachedScript to be non-empty (asserted).
2083 void HTMLTokenizer::notifyFinished(CachedObject *finishedObj)
2084 {
2085  Q_UNUSED(finishedObj);
2086  assert(!cachedScript.isEmpty());
2087  // Make external scripts wait for external stylesheets.
2088  // FIXME: This needs to be done for inline scripts too.
2089  m_hasScriptsWaitingForStylesheets = !parser->doc()->haveStylesheetsLoaded();
2090  if (m_hasScriptsWaitingForStylesheets) {
2091  // qCDebug(KHTML_LOG) << "Delaying script execution until stylesheets have loaded.";
2092  return;
2093  }
2094  // qCDebug(KHTML_LOG) << (finishedObj ? "Processing an external script" : "Continuing processing of delayed external scripts");
2095
2096  bool done = false;
2097  m_scriptTime.start();
// `done` is tested first, so once it is set no member is touched by the
// loop condition (see the "we might be deleted" note at the end of the body).
2098  while (!done && cachedScript.head()->isLoaded()) {
2099  if (!continueProcessingScripts()) {
2100  break;
2101  }
2102
2103  CachedScript *cs = cachedScript.dequeue();
2104  DOMString scriptSource = cs->script();
2105 #ifdef TOKEN_DEBUG
2106  qCDebug(KHTML_LOG) << "External script is:" << endl << scriptSource.string();
2107 #endif
2108  setSrc(TokenizerString());
2109
2110  // make sure we forget about the script before we execute the new one
2111  // infinite recursion might happen otherwise
2112  QString cachedScriptUrl(cs->url().string());
2113  cs->deref(this);
2114
2115  scriptExecution(scriptSource.string(), cachedScriptUrl);
2116
2117  done = cachedScript.isEmpty();
2118  if (done) {
2119  assert(!m_hasScriptsWaitingForStylesheets);
2120  } else if (m_hasScriptsWaitingForStylesheets) {
2121  // flag has changed during the script execution,
2122  // so we need to wait for stylesheets again.
2123  done = true;
2124  }
2125  // 'script' is true when we are called synchronously from
2126  // scriptHandler(). In that case scriptHandler() will take care
2127  // of the pending queue.
2128  if (!script) {
// Fold all queued strings into a single entry: each popped string is
// prepended to the entry that is then on top.
2129  while (pendingQueue.count() > 1) {
2130  TokenizerString t = pendingQueue.pop();
2131  pendingQueue.top().prepend(t);
2132  }
2133  if (done) {
2134  write(pendingQueue.pop(), false);
2135  }
2136  // we might be deleted at this point, do not
2137  // access any members.
2138  }
2139  }
2140 }
2141 
2142 bool HTMLTokenizer::continueProcessingScripts()
2143 {
2144  if (m_externalScriptsTimerId) {
2145  return false;
2146  }
2147  if (m_scriptTime.elapsed() > m_tokenizerYieldDelay && m_documentTokenizer) {
2148  if ((m_externalScriptsTimerId = startTimer(0))) {
2149  return false;
2150  }
2151  }
2152  return true;
2153 }
2154 
2155 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
2156 {
2157  assert(parser->doc()->haveStylesheetsLoaded());
2158  if (m_hasScriptsWaitingForStylesheets) {
2159  notifyFinished(nullptr);
2160  }
2161 }
2162 
2163 bool HTMLTokenizer::isWaitingForScripts() const
2164 {
2165  return cachedScript.count();
2166 }
2167 
2168 bool HTMLTokenizer::isExecutingScript() const
2169 {
2170  return (m_executingScript > 0);
2171 }
2172 
2173 void HTMLTokenizer::setSrc(const TokenizerString &source)
2174 {
2175  lineno += src.lineCount();
2176  src = source;
2177  src.resetLineCount();
2178 }
2179 
2180 void HTMLTokenizer::setOnHold(bool _onHold)
2181 {
2182  if (onHold == _onHold) {
2183  return;
2184  }
2185  onHold = _onHold;
2186 }
2187 
int indexOf(QChar ch, int from, Qt::CaseSensitivity cs) const const
The Node interface is the primary datatype for the entire Document Object Model.
Definition: dom_node.h:278
This file is part of the HTML rendering engine for KDE.
Proxy class serving as interface when being dlopen'ed.
Definition: kjs_proxy.h:61
Renders and displays HTML in a QScrollArea.
Definition: khtmlview.h:97
QTextStream & endl(QTextStream &stream)
uchar cell() const const
This interface represents an entity, either parsed or unparsed, in an XML document.
Definition: dom_xml.h:125
bool isNull() const const
void clear()
QString fromRawData(const QChar *unicode, int size)
CaseInsensitive
bool isEmpty() const const
QString trimmed() const const
const char * constData() const const
KCharsets * charsets()
This class implements the basic string we use in the DOM.
Definition: dom_string.h:44
ushort unicode() const const
KGuiItem discard()
static KCharsets * charsets()
QChar toLower() const const
const QList< QKeySequence > & end()
uchar row() const const
QByteArray toLatin1() const const
int length() const const
int timerId() const const
QString fromLatin1(const char *str, int size)
KGuiItem reset()
int compare(const QString &other, Qt::CaseSensitivity cs) const const
The parser for html.
Definition: htmlparser.h:70
a cached script
Definition: loader.h:322
QString & setUnicode(const QChar *unicode, int size)
DOMString trimSpaces() const
Returns a string with Space Characters removed from the start and the end.
Definition: dom_string.cpp:345
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Sat Oct 16 2021 22:47:55 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.