KHtml

htmltokenizer.h
1 /*
2  This file is part of the KDE libraries
3 
4  Copyright (C) 1997 Martin Jones ([email protected])
5  (C) 1997 Torben Weis ([email protected])
6  (C) 1998 Waldo Bastian ([email protected])
7  (C) 2001 Dirk Mueller ([email protected])
8 
9  This library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU Library General Public
11  License as published by the Free Software Foundation; either
12  version 2 of the License, or (at your option) any later version.
13 
14  This library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  Library General Public License for more details.
18 
19  You should have received a copy of the GNU Library General Public License
20  along with this library; see the file COPYING.LIB. If not, write to
21  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22  Boston, MA 02110-1301, USA.
23 */
24 //----------------------------------------------------------------------------
25 //
26 // KDE HTML Widget -- Tokenizers
27 
28 #ifndef HTMLTOKENIZER_H
29 #define HTMLTOKENIZER_H
30 
31 #include <QString>
32 #include <QObject>
33 #include <QQueue>
34 #include <QTime>
35 
36 #include "misc/loader_client.h"
37 #include "misc/stringit.h"
38 #include "xml/dom_stringimpl.h"
39 #include "xml/xml_tokenizer.h"
40 #include "xml/dom_elementimpl.h"
41 #include "xml/dom_docimpl.h"
42 
43 class KCharsets;
44 class KHTMLView;
45 
46 namespace DOM
47 {
48 class DocumentImpl;
49 class DocumentFragmentImpl;
50 }
51 
52 namespace khtml
53 {
54 class CachedScript;
55 class KHTMLParser;
56 class ProspectiveTokenizer;
57 
58 /**
59  * @internal
60  * represents one HTML tag. Consists of a numerical id, and the list
61  * of attributes. Can also represent text. In this case the id = 0 and
62  * text contains the text.
63  */
64 class Token
65 {
66 public:
67  Token()
68  {
69  tid = 0;
70  attrs = nullptr;
71  text = nullptr;
72  flat = false;
73  //qDebug("new token, creating %08lx", attrs);
74  }
75  ~Token()
76  {
77  if (attrs) {
78  attrs->deref();
79  }
80  if (text) {
81  text->deref();
82  }
83  }
84  void addAttribute(DocumentImpl * /*doc*/, QChar *buffer, const DOMString &_attrName, const DOMString &v)
85  {
86  DOMStringImpl *value = v.implementation();
87  LocalName localname = LocalName::fromId(0);
88  PrefixName prefixname = PrefixName::fromId(emptyPrefix);
89  if (buffer->unicode()) {
90  localname = LocalName::fromId(buffer->unicode());
91  } else if (!_attrName.isEmpty() && _attrName != "/") {
92  splitPrefixLocalName(_attrName, prefixname, localname, true /* htmlCompat*/);
93  }
94 
95  if (value && localname.id()) {
96  if (!attrs) {
97  attrs = new DOM::NamedAttrMapImpl(nullptr);
98  attrs->ref();
99  }
100  if (!attrs->getValue(makeId(emptyNamespace, localname.id()), prefixname))
101  // place attributes in the empty namespace
102  {
103  attrs->setValue(makeId(emptyNamespace, localname.id()), value, prefixname);
104  }
105  }
106  }
107  void reset()
108  {
109  if (attrs) {
110  attrs->deref();
111  attrs = nullptr;
112  }
113  tid = 0;
114  if (text) {
115  text->deref();
116  text = nullptr;
117  }
118  flat = false;
119  }
120  DOM::NamedAttrMapImpl *attrs;
121  DOMStringImpl *text;
122  ushort tid;
123  bool flat;
124 };
125 
// Values taken by DoctypeToken::state while a <!DOCTYPE ...> block is handled.
// The enumerator names follow the parts of the declaration: name, public ID,
// system ID and internal subset.
126 enum DoctypeState {
127  DoctypeBegin, // the state DoctypeToken::reset() assigns
128  DoctypeBeforeName,
129  DoctypeName,
130  DoctypeAfterName,
131  DoctypeBeforePublicID,
132  DoctypePublicID,
133  DoctypeAfterPublicID,
134  DoctypeBeforeSystemID,
135  DoctypeSystemID,
136  DoctypeAfterSystemID,
137  DoctypeInternalSubset,
138  DoctypeAfterInternalSubset,
139  DoctypeBogus // NOTE(review): presumably used for a malformed doctype - confirm in htmltokenizer.cpp
140 };
141 
142 class DoctypeToken
143 {
144 public:
145  DoctypeToken() {}
146 
147  void reset()
148  {
149  name.clear();
150  publicID.clear();
151  systemID.clear();
152  internalSubset.clear();
153  state = DoctypeBegin;
154  }
155 
156  DoctypeState state;
157  QString name;
158  QString publicID;
159  QString systemID;
160  QString internalSubset;
161 };
162 
163 // The count of spaces used for each tab.
164 #define TAB_SIZE 8
165 
166 //-----------------------------------------------------------------------------
167 
/**
 * The HTML tokenizer. Implements the Tokenizer interface (begin(), write(),
 * end(), finish(), ...) and is also a CachedObjectClient, receiving
 * notifyFinished() for cached objects. KHTMLParser is a friend and may
 * therefore access the protected state declared below.
 */
168 class HTMLTokenizer : public Tokenizer, public CachedObjectClient
169 {
170  friend class KHTMLParser;
171 public:
172  HTMLTokenizer(DOM::DocumentImpl *, KHTMLView * = nullptr);
173  HTMLTokenizer(DOM::DocumentImpl *, DOM::DocumentFragmentImpl *frag); // NOTE(review): presumably the fragment variant, see m_documentTokenizer - confirm
174  virtual ~HTMLTokenizer();
175 
176  void begin() override;
177  void write(const khtml::TokenizerString &str, bool appendData) override;
178  void end() override;
179  void finish() override;
180  void timerEvent(QTimerEvent *e) override;
181  bool continueProcessing(int &);
182  void setNormalYieldDelay() override;
183  void setOnHold(bool _onHold) override;
    // Only sets the m_abort flag; see the comment on m_abort below.
184  void abort() override
185  {
186  m_abort = true;
187  }
188  bool isWaitingForScripts() const override;
189  bool isExecutingScript() const override;
190 
191  void executeScriptsWaitingForStylesheets() override;
192 
193 protected:
194  void reset();
195  void addPending();
196  void processToken();
197  void processDoctypeToken();
198  void processListing(khtml::TokenizerString list);
199 
200  void parseComment(khtml::TokenizerString &str);
201  void parseDoctype(khtml::TokenizerString &str);
202  void parseDoctypeComment(khtml::TokenizerString &str);
203  void parseServer(khtml::TokenizerString &str);
204  void parseText(khtml::TokenizerString &str);
205  void parseListing(khtml::TokenizerString &str);
206  void parseRawContent(khtml::TokenizerString &str);
207  void parseTag(khtml::TokenizerString &str);
208  void parseEntity(khtml::TokenizerString &str, QChar *&dest, bool start = false);
209  void parseProcessingInstruction(khtml::TokenizerString &str);
210  void scriptHandler();
211  void scriptExecution(const QString &script, const QString &scriptURL = QString(), int baseLine = 0);
212  void setSrc(const TokenizerString &source);
213 
214  // check if we have enough space in the buffer.
215  // if not enlarge it
    // "Enough" means: the distance from buffer to dest does not exceed size - len.
216  inline void checkBuffer(int len = 10)
217  {
218  if ((dest - buffer) > size - len) {
219  enlargeBuffer(len);
220  }
221  }
    // Same check for rawContent: enlarge once rawContentSize + len reaches rawContentMaxSize.
222  inline void checkRawContentBuffer(int len = 10)
223  {
224  if (rawContentSize + len >= rawContentMaxSize) {
225  enlargeRawContentBuffer(len);
226  }
227  }
228 
229  void enlargeBuffer(int len);
230  void enlargeRawContentBuffer(int len);
231 
232  // from CachedObjectClient
233  void notifyFinished(khtml::CachedObject *finishedObj) override;
234 
235  bool continueProcessingScripts();
236 protected:
237  // Internal buffers
238  ///////////////////
239  QChar *buffer;
240  QChar *dest; // position inside buffer; checkBuffer() measures dest - buffer
241 
242  khtml::Token currToken;
243  LocalName safeLocalName;
244 
245  // the size of buffer
246  int size;
247 
248  // Tokenizer flags
249  //////////////////
250  // are we in quotes within a html tag
251  enum {
252  NoQuote = 0,
253  SingleQuote,
254  DoubleQuote
255  } tquote;
256 
257  enum {
258  NonePending = 0,
259  SpacePending,
260  LFPending,
261  TabPending
262  } pending;
263 
264  enum {
265  NoneDiscard = 0,
266  SpaceDiscard, // Discard spaces after '=' within tags
267  LFDiscard, // Discard line breaks immediately after start-tags
268  AllDiscard // discard all spaces, LF's etc until next non white char
269  } discard;
270 
271  // Discard the LF part of CRLF sequence
272  bool skipLF;
273 
274  // Flag to say that we have the '<' but not the character following it.
275  bool startTag;
276 
277  // Flag to say, we are just parsing a tag, meaning, we are in the middle
278  // of <tag...
279  enum {
280  NoTag = 0,
281  TagName,
282  SearchAttribute,
283  AttributeName,
284  SearchEqual,
285  SearchValue,
286  QuotedValue,
287  Value,
288  SearchEnd
289  } tag;
290 
291  // Are we in a &... character entity description?
292  enum {
293  NoEntity = 0,
294  SearchEntity,
295  NumericSearch,
296  Hexadecimal,
297  Decimal,
298  EntityName,
299  SearchSemicolon
300  } Entity;
301 
302  // are we in a <script> ... </script> block
303  bool script;
304 
305  QChar EntityChar;
306 
307  // Are we in a <pre> ... </pre> block
308  bool pre;
309 
310  // if 'pre == true' we track in which column we are
311  int prePos;
312 
313  // Are we in a <style> ... </style> block
314  bool style;
315 
316  // Are we in a <select> ... </select> block
317  bool select;
318 
319  // Are we in a <xmp> ... </xmp> block
320  bool xmp;
321 
322  // Are we in a <title> ... </title> block
323  bool title;
324 
325  // Are we in plain textmode ?
326  bool plaintext;
327 
328  // XML processing instructions. Ignored at the moment
329  bool processingInstruction;
330 
331  // Are we in a <!-- comment --> block
332  bool comment;
333 
334  // Are we in a <textarea> ... </textarea> block
335  bool textarea;
336 
337  // was the previous character escaped ?
338  bool escaped;
339 
340  // are we in a server includes statement?
341  bool server;
342 
343  bool brokenServer;
344 
345  // doctype parsing from WebCore + internal subset checker and comments in doctype
346  // are we in <!DOCTYPE ...> block?
347  bool doctype;
348  DoctypeToken doctypeToken;
349  int doctypeSearchCount;
350  int doctypeSecondarySearchCount;
351  bool doctypeAllowComment; // is comment allowed in current doctype state?
352 
353  // are we in <!DOCTYPE -- ... --> block?
354  enum {
355  NoDoctypeComment = 0,
356  DoctypeCommentHalfBegin,
357  DoctypeComment,
358  DoctypeCommentHalfEnd,
359  DoctypeCommentEnd,
360  DoctypeCommentBogus
361  } doctypeComment;
362 
363  // name of an unknown attribute
364  DOMString attrName;
365 
366  // Used to store raw content, e.g. the script sequence whose size is rawContentSize
367  QChar *rawContent;
368  // Size of the script sequence stored in rawContent
369  int rawContentSize;
370  // Maximal size that can be stored in rawContent
371  int rawContentMaxSize;
372  // resync point of script code size
373  int rawContentResync;
374  // this tracks the number of advances done in 'raw' tokenizing
375  // mode since we last decoded an entity.
376  int rawContentSinceLastEntity;
377  // Stores characters if we are scanning for a string like "</script>"
378  QChar searchBuffer[ 10 ];
379  // Counts where we are in the string we are scanning for
380  int searchCount;
381  // The string we are searching for
382  const QChar *searchFor;
383  // the stopper string
384  const char *searchStopper;
385  // the stopper len
386  int searchStopperLen;
387  // if no more data is coming, just parse what we have (including ext scripts that
388  // may be still downloading) and finish
389  bool noMoreData;
390  // URL to get source code of script from
391  QString scriptSrc;
392  QString scriptSrcCharset;
393  bool javascript;
394  // the HTML code we will parse after the external script we are waiting for has loaded
395  TokenizerQueue pendingQueue;
396  // true if we are executing a script while parsing a document. This causes the parsing of
397  // the output of the script to be postponed until after the script has finished executing
398  int m_executingScript; // NOTE(review): an int although described as "true if"; presumably a nesting count - confirm
399 
400  int m_externalScriptsTimerId;
401  bool m_hasScriptsWaitingForStylesheets;
402 
403  QQueue<khtml::CachedScript *> cachedScript;
404  // you can pause the tokenizer if you need to display a dialog or something
405  bool onHold;
406  // you can ask the tokenizer to abort the current write() call, e.g. to redirect somewhere else
407  bool m_abort;
408  // if we found one broken comment, there are most likely others as well
409  // store a flag to get rid of the O(n^2) behavior in such a case.
410  bool brokenComments;
411  // current line number
412  int lineno;
413  // line number at which the current <script> started
414  int scriptStartLineno;
415  int tagStartLineno;
416  int m_tokenizerYieldDelay;
417  int m_yieldTimer;
418  QTime m_time;
419  QTime m_scriptTime;
420 
421  // Set true if this tokenizer is used for documents and not fragments
422  bool m_documentTokenizer;
423 
    // Capacity of cBuffer (which is declared with two extra chars). Being a
    // macro, CBUFLEN is not scoped to this class: it stays defined for
    // everything that follows this header in the translation unit.
424 #define CBUFLEN 1024
425  char cBuffer[CBUFLEN + 2];
426  unsigned int cBufferPos;
427  unsigned int entityLen;
428 
429  khtml::TokenizerString src;
430 
431  KCharsets *charsets;
432  KHTMLParser *parser;
433 
434  KHTMLView *view;
435 
436  khtml::ProspectiveTokenizer *m_prospectiveTokenizer;
437 };
438 
439 } // namespace
440 
441 #endif // HTMLTOKENIZER
442 
This file is part of the HTML rendering engine for KDE.
Renders and displays HTML in a QScrollArea.
Definition: khtmlview.h:97
This interface represents an entity, either parsed or unparsed, in an XML document.
Definition: dom_xml.h:125
void clear()
This class implements the basic string we use in the DOM.
Definition: dom_string.h:44
ushort unicode() const const
This library provides a full-featured HTML parser and widget.
DOMStringImpl * implementation() const
Definition: dom_string.h:145
The parser for html.
Definition: htmlparser.h:70
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Sat Oct 16 2021 22:47:55 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.