KHtml

xml_tokenizer.cpp
1 /**
2  * This file is part of the DOM implementation for KDE.
3  *
4  * Copyright (C) 2000 Peter Kelly ([email protected])
5  * Copyright (C) 2003 Apple Computer, Inc.
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Library General Public
9  * License as published by the Free Software Foundation; either
10  * version 2 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Library General Public License for more details.
16  *
17  * You should have received a copy of the GNU Library General Public License
18  * along with this library; see the file COPYING.LIB. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #include "xml_tokenizer.h"
24 #include "xml/dom_docimpl.h"
25 #include "xml/dom_textimpl.h"
26 #include "xml/dom_xmlimpl.h"
27 #include "html/html_tableimpl.h"
28 #include "html/html_headimpl.h"
29 #include "rendering/render_object.h"
30 #include "misc/loader.h"
31 
32 #include "khtmlview.h"
33 #include "khtml_part.h"
34 #include <QVariant>
35 #include <klocalizedstring.h>
36 #include <kencodingdetector.h>
37 
38 // SVG includes
39 #include "svg/SVGScriptElement.h"
40 #include "svg/XLinkNames.h"
41 
42 using namespace DOM;
43 using namespace khtml;
44 
45 XMLIncrementalSource::XMLIncrementalSource()
46  : QXmlInputSource(), m_pos(0), m_unicode(nullptr),
47  m_finished(false), m_paused(false)
48 {
49 }
50 
51 void XMLIncrementalSource::fetchData()
52 {
53  //just a dummy to overwrite default behavior
54 }
55 
56 QChar XMLIncrementalSource::next()
57 {
58  if (m_finished) {
59  return QXmlInputSource::EndOfDocument;
60  } else if (m_paused || m_data.length() <= m_pos) {
61  return QXmlInputSource::EndOfData;
62  } else {
63  return m_unicode[m_pos++];
64  }
65 }
66 
67 void XMLIncrementalSource::setData(const QString &str)
68 {
69  m_data = str;
70  m_unicode = m_data.unicode();
71  m_pos = 0;
72  if (!str.isEmpty()) {
73  m_finished = false;
74  }
75 }
76 void XMLIncrementalSource::setData(const QByteArray &data)
77 {
78  setData(fromRawData(data, true));
79 }
80 
81 void XMLIncrementalSource::appendXML(const QString &str)
82 {
83  m_data += str;
84  m_unicode = m_data.unicode();
85 }
86 
87 QString XMLIncrementalSource::data() const
88 {
89  return m_data;
90 }
91 
92 void XMLIncrementalSource::setFinished(bool finished)
93 {
94  m_finished = finished;
95 }
96 
97 XMLHandler::XMLHandler(DocumentImpl *_doc, KHTMLView *_view)
98  : errorLine(-1)
99 {
100  m_doc = _doc;
101  m_view = _view;
102  pushNode(_doc);
103 }
104 
105 XMLHandler::~XMLHandler()
106 {
107 }
108 
109 void XMLHandler::pushNode(NodeImpl *node)
110 {
111  m_nodes.push(node);
112 }
113 
114 NodeImpl *XMLHandler::popNode()
115 {
116  return m_nodes.pop();
117 }
118 
119 NodeImpl *XMLHandler::currentNode() const
120 {
121  if (m_nodes.isEmpty()) {
122  return nullptr;
123  } else {
124  return m_nodes.top();
125  }
126 }
127 
128 QString XMLHandler::errorProtocol()
129 {
130  return errorProt;
131 }
132 
133 bool XMLHandler::startDocument()
134 {
135  // at the beginning of parsing: do some initialization
136  errorProt = "";
137  state = StateInit;
138 
139  return true;
140 }
141 
142 bool XMLHandler::startPrefixMapping(const QString &prefix, const QString &uri)
143 {
144  namespaceInfo[prefix].push(uri);
145  return true;
146 }
147 
148 bool XMLHandler::endPrefixMapping(const QString &prefix)
149 {
150  if (namespaceInfo.contains(prefix)) {
151  QStack<QString> &stack = namespaceInfo[prefix];
152  stack.pop();
153  if (stack.isEmpty()) {
154  namespaceInfo.remove(prefix);
155  }
156  return true;
157  } else {
158  return false;
159  }
160 }
161 
162 void XMLHandler::fixUpNSURI(QString &uri, const QString &qname)
163 {
164  /* QXml does not resolve the namespaces of attributes in the same
165  tag that preceed the xmlns declaration. This fixes up that case */
166  if (uri.isEmpty() && qname.indexOf(':') != -1) {
168  QString localName, prefix;
169  ns.splitName(qname, prefix, localName);
170  if (namespaceInfo.contains(prefix)) {
171  uri = namespaceInfo[prefix].top();
172  }
173  }
174 }
175 
176 bool XMLHandler::startElement(const QString &namespaceURI, const QString & /*localName*/,
177  const QString &qName, const QXmlAttributes &atts)
178 {
179  if (currentNode()->nodeType() == Node::TEXT_NODE) {
180  exitText();
181  }
182 
183  DOMString nsURI;
184  if (!namespaceURI.isNull()) {
185  nsURI = DOMString(namespaceURI);
186  } else
187  // No namespace declared, default to the no namespace
188  {
189  nsURI = DOMString("");
190  }
191  ElementImpl *newElement = m_doc->createElementNS(nsURI, qName);
192  if (!newElement) {
193  return false;
194  }
195  int i;
196  for (i = 0; i < atts.length(); i++) {
197  int exceptioncode = 0;
198  QString uriString = atts.uri(i);
199  QString qnString = atts.qName(i);
200  fixUpNSURI(uriString, qnString);
201  DOMString uri(uriString);
202  DOMString qn(qnString);
203  DOMString val(atts.value(i));
204  newElement->setAttributeNS(uri, qn, val, exceptioncode);
205  if (exceptioncode) { // exception setting attributes
206  return false;
207  }
208  }
209 
210  if (newElement->id() == ID_SCRIPT || newElement->id() == makeId(xhtmlNamespace, ID_SCRIPT)) {
211  static_cast<HTMLScriptElementImpl *>(newElement)->setCreatedByParser(true);
212  }
213 
214  //this is tricky. in general the node doesn't have to attach to the one it's in. as far
215  //as standards go this is wrong, but there's literally thousands of documents where
216  //we see <p><ul>...</ul></p>. the following code is there for those cases.
217  //when we can't attach to the currently holding us node we try to attach to its parent
218  bool attached = false;
219  for (NodeImpl *current = currentNode(); current; current = current->parent()) {
220  attached = current->addChild(newElement);
221  if (attached) {
222  break;
223  }
224  }
225  if (attached) {
226  if (m_view && !newElement->attached() && !m_doc->hasPendingSheets()) {
227  newElement->attach();
228  }
229  pushNode(newElement);
230  return true;
231  } else {
232  delete newElement;
233  return false;
234  }
235 
236  // ### DOM spec states: "if there is no markup inside an element's content, the text is contained in a
237  // single object implementing the Text interface that is the only child of the element."... do we
238  // need to ensure that empty elements always have an empty text child?
239 }
240 
241 bool XMLHandler::endElement(const QString & /*namespaceURI*/, const QString & /*localName*/, const QString & /*qName*/)
242 {
243  if (currentNode()->nodeType() == Node::TEXT_NODE) {
244  exitText();
245  }
246 
247  NodeImpl *node = popNode();
248  if (node) {
249  node->close();
250  while (currentNode() && currentNode()->implicitNode()) { //for the implicit HTMLTableSectionElementImpl
251  popNode()->close();
252  }
253  } else {
254  return false;
255  }
256 
257  // if the node is a script element try to execute it immediately
258  if ((node->id() == ID_SCRIPT) || (node->id() == makeId(xhtmlNamespace, ID_SCRIPT)) || node->id() == WebCore::SVGNames::scriptTag.id()) {
259  static_cast<XMLTokenizer *>(m_doc->tokenizer())->executeScript(node);
260  }
261 
262  return true;
263 }
264 
265 bool XMLHandler::startCDATA()
266 {
267  if (currentNode()->nodeType() == Node::TEXT_NODE) {
268  exitText();
269  }
270 
271  int exceptioncode = 0;
272  NodeImpl *newNode = m_doc->createCDATASection(new DOMStringImpl(""), exceptioncode);
273  if (!exceptioncode && currentNode()->addChild(newNode)) {
274  if (m_view && !newNode->attached() && !m_doc->hasPendingSheets()) {
275  newNode->attach();
276  }
277  pushNode(newNode);
278  return true;
279  } else {
280  delete newNode;
281  return false;
282  }
283 
284 }
285 
286 bool XMLHandler::endCDATA()
287 {
288  popNode();
289  Q_ASSERT(currentNode());
290  return currentNode();
291 }
292 
293 bool XMLHandler::characters(const QString &ch)
294 {
295  if (currentNode()->nodeType() == Node::TEXT_NODE ||
296  currentNode()->nodeType() == Node::CDATA_SECTION_NODE ||
297  enterText()) {
298  int exceptioncode = 0;
299  static_cast<TextImpl *>(currentNode())->appendData(ch, exceptioncode);
300  if (exceptioncode) {
301  return false;
302  }
303  return true;
304  } else {
305  // Don't worry about white-space violating DTD
306  if (ch.trimmed().isEmpty()) {
307  return true;
308  }
309 
310  return false;
311  }
312 
313 }
314 
315 bool XMLHandler::comment(const QString &ch)
316 {
317  if (currentNode()->nodeType() == Node::TEXT_NODE) {
318  exitText();
319  }
320  // ### handle exceptions
321  currentNode()->addChild(m_doc->createComment(new DOMStringImpl(ch.unicode(), ch.length())));
322  return true;
323 }
324 
325 bool XMLHandler::processingInstruction(const QString &target, const QString &data)
326 {
327  if (currentNode()->nodeType() == Node::TEXT_NODE) {
328  exitText();
329  }
330 
331  // Ignore XML target -- shouldn't be part of the DOM
332  if (target == "xml") {
333  return true;
334  }
335 
336  // ### handle exceptions
337  ProcessingInstructionImpl *pi =
338  m_doc->createProcessingInstruction(target, new DOMStringImpl(data.unicode(), data.length()));
339  currentNode()->addChild(pi);
340  pi->checkStyleSheet();
341  return true;
342 }
343 
344 QString XMLHandler::errorString() const
345 {
346  // ### Make better error-messages
347  return i18n("the document is not in the correct file format");
348 }
349 
350 bool XMLHandler::fatalError(const QXmlParseException &exception)
351 {
352  errorProt += i18n("fatal parsing error: %1 in line %2, column %3",
353  exception.message(),
354  exception.lineNumber(),
355  exception.columnNumber());
356 
357  errorLine = exception.lineNumber();
358  errorCol = exception.columnNumber();
359 
360  return false;
361 }
362 
363 bool XMLHandler::enterText()
364 {
365  NodeImpl *newNode = m_doc->createTextNode("");
366  if (currentNode()->addChild(newNode)) {
367  pushNode(newNode);
368  return true;
369  } else {
370  delete newNode;
371  return false;
372  }
373 }
374 
375 void XMLHandler::exitText()
376 {
377  if (m_view && !currentNode()->attached() && !m_doc->hasPendingSheets()) {
378  currentNode()->attach();
379  }
380  popNode();
381 }
382 
383 bool XMLHandler::attributeDecl(const QString &/*eName*/, const QString &/*aName*/, const QString &/*type*/,
384  const QString &/*valueDefault*/, const QString &/*value*/)
385 {
386  // qt's xml parser (as of 2.2.3) does not currently give us values for type, valueDefault and
387  // value. When it does, we can store these somewhere and have default attributes on elements
388  return true;
389 }
390 
391 bool XMLHandler::externalEntityDecl(const QString &/*name*/, const QString &/*publicId*/, const QString &/*systemId*/)
392 {
393  // ### insert these too - is there anything special we have to do here?
394  return true;
395 }
396 
397 bool XMLHandler::internalEntityDecl(const QString &name, const QString &value)
398 {
399  EntityImpl *e = new EntityImpl(m_doc, name);
400  // ### further parse entities inside the value and add them as separate nodes (or entityreferences)?
401  e->addChild(m_doc->createTextNode(new DOMStringImpl(value.unicode(), value.length())));
402  if (m_doc->doctype()) {
403  static_cast<GenericRONamedNodeMapImpl *>(m_doc->doctype()->entities())->addNode(e);
404  }
405  return true;
406 }
407 
408 bool XMLHandler::notationDecl(const QString &/*name*/, const QString &/*publicId*/, const QString &/*systemId*/)
409 {
410 // ### FIXME
411 // if (m_doc->document()->doctype()) {
412 // NotationImpl *n = new NotationImpl(m_doc,name,publicId,systemId);
413 // static_cast<GenericRONamedNodeMapImpl*>(m_doc->document()->doctype()->notations())->addNode(n);
414 // }
415  return true;
416 }
417 
418 bool XMLHandler::unparsedEntityDecl(const QString &/*name*/, const QString &/*publicId*/,
419  const QString &/*systemId*/, const QString &/*notationName*/)
420 {
421  // ###
422  return true;
423 }
424 
425 bool XMLHandler::startDTD(const QString &name, const QString &publicId, const QString &systemId)
426 {
427  int exceptionCode = 0;
428  SharedPtr<DocumentTypeImpl> docType = m_doc->implementation()->createDocumentType(name, publicId, systemId, exceptionCode);
429 
430  if (exceptionCode == 0) {
431  docType->setDocument(m_doc);
432  m_doc->appendChild(docType.get(), exceptionCode);
433  }
434 
435  return (exceptionCode == 0);
436 }
437 
438 bool XMLHandler::endDTD()
439 {
440  return true;
441 }
442 
443 //------------------------------------------------------------------------------
444 
445 XMLTokenizer::XMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view)
446  : m_handler(_doc, _view)
447 {
448  m_doc = _doc;
449  m_view = _view;
450  m_cachedScript = nullptr;
451  m_noErrors = true;
452  m_executingScript = false;
453  m_explicitFinishParsingNeeded = false;
454  m_insideWrite = false;
455  m_reader.setContentHandler(&m_handler);
456  m_reader.setLexicalHandler(&m_handler);
457  m_reader.setErrorHandler(&m_handler);
458  m_reader.setDeclHandler(&m_handler);
459  m_reader.setDTDHandler(&m_handler);
460  m_reader.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
461 }
462 
463 XMLTokenizer::~XMLTokenizer()
464 {
465  if (m_cachedScript) {
466  m_cachedScript->deref(this);
467  }
468 }
469 
470 void XMLTokenizer::begin()
471 {
472  // parse xml file
473  m_reader.parse(&m_source, true);
474 }
475 
476 void XMLTokenizer::write(const TokenizerString &str, bool appendData)
477 {
478  if (!m_noErrors && appendData) {
479  return;
480  }
481 
482  // check if we try to re-enter inside write()
483  // if so buffer the data
484  if (m_insideWrite) {
485  m_bufferedData.append(str.toString());
486  return;
487  }
488  m_insideWrite = true;
489 
490  if (appendData) {
491  m_source.appendXML(str.toString());
492 
493  } else {
494  m_source.setData(str.toString());
495  }
496  m_noErrors = m_reader.parseContinue();
497 
498  if (m_doc->decoder() && m_doc->decoder()->decodedInvalidCharacters()) {
499  // any invalid character spotted by the decoder is fatal, per XML 1.0 spec. Tested by Acid 3 - 70
500  m_handler.fatalError(QXmlParseException(m_handler.errorString())); // ### FIXME: make that more informative after string freeze : i18n("input stream contains invalid characters")
501  m_noErrors = false;
502  finish();
503  return;
504  }
505 
506  // check if while parsing we tried to re-enter write() method so now we have some buffered data we need to write to document
507  while (m_noErrors && !m_bufferedData.isEmpty()) {
508  m_source.appendXML(m_bufferedData);
509  m_bufferedData.clear();
510  m_noErrors = m_reader.parseContinue();
511  }
512  // check if we need to call finish explicitly (see XMLTokenizer::finish() comment for details)
513  if (m_explicitFinishParsingNeeded) {
514  finish();
515  }
516  m_insideWrite = false;
517 }
518 
519 void XMLTokenizer::end()
520 {
521  m_source.setFinished(true);
522  //if ( m_noErrors )
523  //m_noErrors = m_reader.parseContinue();
524  emit finishedParsing();
525 }
526 
527 void XMLTokenizer::finish()
528 {
529  if (m_executingScript) {
530  // still executing script, it can happen because of reentrancy, e.g. when we have alert() inside script and we got the rest of the data
531  m_explicitFinishParsingNeeded = true;
532  return;
533  }
534  m_source.setFinished(true);
535  if (!m_noErrors) {
536  // An error occurred during parsing of the code. Display an error page to the user (the DOM
537  // tree is created manually and includes an excerpt from the code where the error is located)
538 
539  // ### for multiple error messages, display the code for each (can this happen?)
540 
541  // Clear the document
542  int exceptioncode = 0;
543  while (m_doc->hasChildNodes()) {
544  static_cast<NodeImpl *>(m_doc)->removeChild(m_doc->firstChild(), exceptioncode);
545  }
546 
547  QString line, errorLocPtr;
548  if (m_handler.errorLine != -1) {
549  QString xmlCode = m_source.data();
550  QTextStream stream(&xmlCode, QIODevice::ReadOnly);
551  for (int lineno = 0; lineno < m_handler.errorLine - 1; lineno++) {
552  stream.readLine();
553  }
554  line = stream.readLine();
555 
556  for (long colno = 0; colno < m_handler.errorCol - 1; colno++) {
557  errorLocPtr += ' ';
558  }
559  errorLocPtr += '^';
560  }
561 
562  // Create elements for display
563  DocumentImpl *doc = m_doc;
564  NodeImpl *html = doc->createElementNS(XHTML_NAMESPACE, "html");
565  NodeImpl *body = doc->createElementNS(XHTML_NAMESPACE, "body");
566  NodeImpl *h1 = doc->createElementNS(XHTML_NAMESPACE, "h1");
567  NodeImpl *headingText = doc->createTextNode(i18n("XML parsing error"));
568  NodeImpl *errorText = doc->createTextNode(m_handler.errorProtocol());
569  NodeImpl *hr = nullptr;
570  NodeImpl *pre = nullptr;
571  NodeImpl *lineText = nullptr;
572  NodeImpl *errorLocText = nullptr;
573  if (!line.isNull()) {
574  hr = doc->createElementNS(XHTML_NAMESPACE, "hr");
575  pre = doc->createElementNS(XHTML_NAMESPACE, "pre");
576  lineText = doc->createTextNode(line + '\n');
577  errorLocText = doc->createTextNode(errorLocPtr);
578  }
579 
580  // Construct DOM tree. We ignore exceptions as we assume they will not be thrown here (due to the
581  // fact we are using a known tag set)
582  doc->appendChild(html, exceptioncode);
583  html->appendChild(body, exceptioncode);
584  body->appendChild(h1, exceptioncode);
585  h1->appendChild(headingText, exceptioncode);
586  body->appendChild(errorText, exceptioncode);
587  body->appendChild(hr, exceptioncode);
588  body->appendChild(pre, exceptioncode);
589  if (pre) {
590  pre->appendChild(lineText, exceptioncode);
591  pre->appendChild(errorLocText, exceptioncode);
592  }
593 
594  // Close the renderers so that they update their display correctly
595  // ### this should not be necessary, but requires changes in the rendering code...
596  h1->close();
597  if (pre) {
598  pre->close();
599  }
600  body->close();
601 
602  m_doc->recalcStyle(NodeImpl::Inherit);
603  m_doc->updateRendering();
604  } else {
605  // Parsing was successful, all scripts have finished downloading and executing,
606  // calculating the style for the document and close the last element
607  m_doc->updateStyleSelector();
608  }
609 
610  // finished parsing, call end()
611  end();
612 }
613 
614 void XMLTokenizer::notifyFinished(CachedObject *finishedObj)
615 {
616  // This is called when a script has finished loading that was requested from executeScript(). We execute
617  // the script, and then continue parsing of the document
618  if (finishedObj == m_cachedScript) {
619  DOMString scriptSource = m_cachedScript->script();
620  m_cachedScript->deref(this);
621  m_cachedScript = nullptr;
622  if (m_view) {
623  m_executingScript = true;
624  m_view->part()->executeScript(DOM::Node(), scriptSource.string());
625  m_executingScript = false;
626  }
627  // should continue parsing here after we fetched and executed the script
628  m_source.setPaused(false);
629  m_reader.parseContinue();
630  }
631 }
632 
633 bool XMLTokenizer::isWaitingForScripts() const
634 {
635  return m_cachedScript != nullptr;
636 }
637 
638 void XMLTokenizer::executeScript(NodeImpl *node)
639 {
640  ElementImpl *script = static_cast<ElementImpl *>(node);
641  DOMString scriptSrc;
642  if (node->id() == WebCore::SVGNames::scriptTag.id()) {
643  scriptSrc = script->getAttribute(WebCore::XLinkNames::hrefAttr.id());
644  } else {
645  scriptSrc = script->getAttribute(ATTR_SRC);
646  }
647 
648  QString charset = script->getAttribute(ATTR_CHARSET).string();
649 
650  if (!scriptSrc.isEmpty()) {
651  // we have a src attribute
652  m_cachedScript = m_doc->docLoader()->requestScript(scriptSrc, charset);
653  if (m_cachedScript) {
654  // pause parsing until we got script
655  m_source.setPaused();
656  m_cachedScript->ref(this); // the parsing will be continued once the script is fetched and executed in notifyFinished()
657  return;
658  }
659  } else {
660  // no src attribute - execute from contents of tag
661  QString scriptCode = "";
662  NodeImpl *child;
663  for (child = script->firstChild(); child; child = child->nextSibling()) {
664  if ((child->nodeType() == Node::TEXT_NODE || child->nodeType() == Node::CDATA_SECTION_NODE) &&
665  static_cast<TextImpl *>(child)->string())
666  scriptCode += QString::fromRawData(static_cast<TextImpl *>(child)->string()->s,
667  static_cast<TextImpl *>(child)->string()->l);
668  }
669  // the script cannot do document.write until we support incremental parsing
670  // ### handle the case where the script deletes the node or redirects to
671  // another page, etc. (also in notifyFinished())
672  // ### the script may add another script node after this one which should be executed
673  if (m_view) {
674  m_executingScript = true;
675  m_view->part()->executeScript(DOM::Node(), scriptCode);
676  m_executingScript = false;
677  }
678  }
679 }
680 
int indexOf(QChar ch, int from, Qt::CaseSensitivity cs) const const
The Node interface is the primary datatype for the entire Document Object Model.
Definition: dom_node.h:278
This file is part of the HTML rendering engine for KDE.
int columnNumber() const const
Renders and displays HTML in a QScrollArea.
Definition: khtmlview.h:97
void splitName(const QString &qname, QString &prefix, QString &localname) const const
QString & remove(int position, int n)
int length() const const
int lineNumber() const const
bool isNull() const const
QString fromRawData(const QChar *unicode, int size)
QString message() const const
bool isEmpty() const const
QString trimmed() const const
This class implements the basic string we use in the DOM.
Definition: dom_string.h:44
QString qName(int index) const const
QString i18n(const char *text, const TYPE &arg...)
const QChar * unicode() const const
This library provides a full-featured HTML parser and widget.
const QList< QKeySequence > & end()
bool isEmpty() const const
QString uri(int index) const const
int length() const const
QString value(int index) const const
QChar * data()
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Tue Oct 26 2021 22:48:10 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.