KHtml

dom_stringimpl.cpp
1 /**
2  * This file is part of the DOM implementation for KDE.
3  *
4  * Copyright (C) 1999-2003 Lars Knoll ([email protected])
5  * (C) 1999 Antti Koivisto ([email protected])
6  * (C) 2001-2003 Dirk Mueller ( [email protected] )
7  * (C) 2002, 2004 Apple Computer, Inc.
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Library General Public
11  * License as published by the Free Software Foundation; either
12  * version 2 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Library General Public License for more details.
18  *
19  * You should have received a copy of the GNU Library General Public License
20  * along with this library; see the file COPYING.LIB. If not, write to
21  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22  * Boston, MA 02110-1301, USA.
23  *
24  */
25 
26 #include "dom_stringimpl.h"
27 
28 #include <string.h>
29 #include <QMutableStringListIterator>
30 #include "misc/AtomicString.h"
31 
32 using namespace DOM;
33 using namespace khtml;
34 
35 DOMStringImpl::DOMStringImpl(const char *str) : m_hash(0), m_inTable(0), m_shallowCopy(0)
36 {
37  if (str && *str) {
38  l = strlen(str);
39  s = QT_ALLOC_QCHAR_VEC(l);
40  int i = l;
41  QChar *ptr = s;
42  while (i--) {
43  *ptr++ = *str++;
44  }
45  } else {
46  s = QT_ALLOC_QCHAR_VEC(1); // crash protection
47  s[0] = 0x0; // == QChar::null;
48  l = 0;
49  }
50 }
51 
52 DOMStringImpl::DOMStringImpl(const char *str, uint len) : m_hash(0), m_inTable(0), m_shallowCopy(0)
53 {
54  if (str && *str) {
55  l = len;
56  s = QT_ALLOC_QCHAR_VEC(l);
57  int i = l;
58  QChar *ptr = s;
59  while (i--) {
60  *ptr++ = *str++;
61  }
62  } else {
63  s = QT_ALLOC_QCHAR_VEC(1); // crash protection
64  s[0] = 0x0; // == QChar::null;
65  l = 0;
66  }
67 }
68 
69 DOMStringImpl::DOMStringImpl(const char *str, unsigned len/*gth*/, unsigned hash) : m_hash(hash), m_inTable(true), m_shallowCopy(0)
70 {
71  if (str && *str) {
72  l = len;
73  s = QT_ALLOC_QCHAR_VEC(l);
74  int i = l;
75  QChar *ptr = s;
76  while (i--) {
77  *ptr++ = *str++;
78  }
79  } else {
80  s = QT_ALLOC_QCHAR_VEC(1); // crash protection
81  s[0] = 0x0; // == QChar::null;
82  l = 0;
83  }
84 }
85 
86 DOMStringImpl::~DOMStringImpl()
87 {
88  if (m_shallowCopy) {
89  return;
90  }
91  if (m_inTable) {
92  khtml::AtomicString::remove(this);
93  }
94  if (s) {
95  QT_DELETE_QCHAR_VEC(s);
96  }
97 }
98 
99 // FIXME: should be a cached flag maybe.
100 bool DOMStringImpl::containsOnlyWhitespace() const
101 {
102  if (!s) {
103  return true;
104  }
105 
106  for (uint i = 0; i < l; i++) {
107  QChar c = s[i];
108  if (c.unicode() <= 0x7F) {
109  if (c.unicode() > ' ') {
110  return false;
111  }
112  } else {
113  if (c.direction() != QChar::DirWS) {
114  return false;
115  }
116  }
117  }
118  return true;
119 }
120 
121 void DOMStringImpl::append(DOMStringImpl *str)
122 {
123  if (str && str->l != 0) {
124  int newlen = l + str->l;
125  QChar *c = QT_ALLOC_QCHAR_VEC(newlen);
126  memcpy(c, s, l * sizeof(QChar));
127  memcpy(c + l, str->s, str->l * sizeof(QChar));
128  if (s) {
129  QT_DELETE_QCHAR_VEC(s);
130  }
131  s = c;
132  l = newlen;
133  }
134 }
135 
136 void DOMStringImpl::insert(DOMStringImpl *str, unsigned int pos)
137 {
138  if (pos > l) {
139  append(str);
140  return;
141  }
142  if (str && str->l != 0) {
143  int newlen = l + str->l;
144  QChar *c = QT_ALLOC_QCHAR_VEC(newlen);
145  memcpy(c, s, pos * sizeof(QChar));
146  memcpy(c + pos, str->s, str->l * sizeof(QChar));
147  memcpy(c + pos + str->l, s + pos, (l - pos)*sizeof(QChar));
148  if (s) {
149  QT_DELETE_QCHAR_VEC(s);
150  }
151  s = c;
152  l = newlen;
153  }
154 }
155 
156 void DOMStringImpl::truncate(int len)
157 {
158  if (len > (int)l) {
159  return;
160  }
161 
162  int nl = len < 1 ? 1 : len;
163  QChar *c = QT_ALLOC_QCHAR_VEC(nl);
164  memcpy(c, s, nl * sizeof(QChar));
165  if (s) {
166  QT_DELETE_QCHAR_VEC(s);
167  }
168  s = c;
169  l = len;
170 }
171 
172 void DOMStringImpl::remove(unsigned int pos, int len)
173 {
174  if (pos >= l) {
175  return;
176  }
177  if (pos + len > l) {
178  len = l - pos;
179  }
180 
181  uint newLen = l - len;
182  QChar *c = QT_ALLOC_QCHAR_VEC(newLen);
183  memcpy(c, s, pos * sizeof(QChar));
184  memcpy(c + pos, s + pos + len, (l - len - pos)*sizeof(QChar));
185  if (s) {
186  QT_DELETE_QCHAR_VEC(s);
187  }
188  s = c;
189  l = newLen;
190 }
191 
192 DOMStringImpl *DOMStringImpl::split(unsigned int pos)
193 {
194  if (pos >= l) {
195  return new DOMStringImpl();
196  }
197 
198  uint newLen = l - pos;
199  DOMStringImpl *str = new DOMStringImpl(s + pos, newLen);
200  truncate(pos);
201  return str;
202 }
203 
204 DOMStringImpl *DOMStringImpl::substring(unsigned int pos, unsigned int len)
205 {
206  if (pos >= l) {
207  return new DOMStringImpl();
208  }
209  if (len == UINT_MAX || pos + len > l) {
210  len = l - pos;
211  }
212 
213  return new DOMStringImpl(s + pos, len);
214 }
215 
216 // Collapses white-space according to CSS 2.1 rules
217 DOMStringImpl *DOMStringImpl::collapseWhiteSpace(bool preserveLF, bool preserveWS)
218 {
219  if (preserveLF && preserveWS) {
220  return this;
221  }
222 
223  // Notice we are likely allocating more space than needed (worst case)
224  QChar *n = QT_ALLOC_QCHAR_VEC(l);
225 
226  unsigned int pos = 0;
227  bool collapsing = false; // collapsing white-space
228  bool collapsingLF = false; // collapsing around linefeed
229  bool changedLF = false;
230  for (unsigned int i = 0; i < l; i++) {
231  ushort ch = s[i].unicode();
232 
233  // We act on \r as we would on \n because CSS uses it to indicate new-line
234  if (ch == '\r') {
235  ch = '\n';
236  } else
237  // ### The XML parser lets \t through, for now treat them as spaces
238  if (ch == '\t') {
239  ch = ' ';
240  }
241 
242  if (!preserveLF && ch == '\n') {
243  // ### Not strictly correct according to CSS3 text-module.
244  // - In ideographic languages linefeed should be ignored
245  // - and in Thai and Khmer it should be treated as a zero-width space
246  ch = ' '; // Treat as space
247  changedLF = true;
248  }
249 
250  if (collapsing) {
251  if (ch == ' ') {
252  continue;
253  }
254  if (ch == '\n') {
255  collapsingLF = true;
256  continue;
257  }
258 
259  n[pos++] = (collapsingLF) ? QLatin1Char('\n') : QLatin1Char(' ');
260  collapsing = false;
261  collapsingLF = false;
262  } else if (!preserveWS && ch == ' ') {
263  collapsing = true;
264  continue;
265  } else if (!preserveWS && ch == '\n') {
266  collapsing = true;
267  collapsingLF = true;
268  continue;
269  }
270 
271  n[pos++] = ch;
272  }
273  if (collapsing) {
274  n[pos++] = ((collapsingLF) ? QLatin1Char('\n') : QLatin1Char(' '));
275  }
276 
277  if (pos == l && !changedLF) {
278  QT_DELETE_QCHAR_VEC(n);
279  return this;
280  } else {
281  DOMStringImpl *out = new DOMStringImpl();
282  out->s = n;
283  out->l = pos;
284 
285  return out;
286  }
287 }
288 
289 static Length parseLength(const QChar *s, unsigned int l)
290 {
291  if (l == 0) {
292  return Length(1, Relative);
293  }
294 
295  unsigned i = 0;
296  while (i < l && s[i].isSpace()) {
297  ++i;
298  }
299  if (i < l && (s[i] == '+' || s[i] == '-')) {
300  ++i;
301  }
302  while (i < l && s[i].isDigit()) {
303  ++i;
304  }
305 
306  bool ok;
307  int r = QString::fromRawData(s, i).toInt(&ok);
308 
309  /* Skip over any remaining digits, we are not that accurate (5.5% => 5%) */
310  while (i < l && (s[i].isDigit() || s[i] == '.')) {
311  ++i;
312  }
313 
314  /* IE Quirk: Skip any whitespace (20 % => 20%) */
315  while (i < l && s[i].isSpace()) {
316  ++i;
317  }
318 
319  if (ok) {
320  if (i == l) {
321  return Length(r, Fixed);
322  } else {
323  const QChar *next = s + i;
324 
325  if (*next == '%') {
326  return Length(static_cast<double>(r), Percent);
327  }
328 
329  if (*next == '*') {
330  return Length(r, Relative);
331  }
332  }
333  return Length(r, Fixed);
334  } else {
335  if (i < l) {
336  const QChar *next = s + i;
337 
338  if (*next == '*') {
339  return Length(1, Relative);
340  }
341 
342  if (*next == '%') {
343  return Length(1, Relative);
344  }
345  }
346  }
347  return Length(0, Relative);
348 }
349 
350 khtml::Length *DOMStringImpl::toCoordsArray(int &len) const
351 {
352  QString str(s, l);
353  for (unsigned int i = 0; i < l; i++) {
354  QChar cc = s[i];
355  if (cc > '9' || (cc < '0' && cc != '-' && cc != '*' && cc != '.')) {
356  str[i] = ' ';
357  }
358  }
359  str = str.simplified();
360 
361  len = str.count(' ') + 1;
362  khtml::Length *r = new khtml::Length[len];
363 
364  int j = 0;
365  int pos = 0;
366  int pos2;
367 
368  while ((pos2 = str.indexOf(QLatin1Char(' '), pos)) != -1) {
369  r[j++] = parseLength((QChar *) str.unicode() + pos, pos2 - pos);
370  pos = pos2 + 1;
371  }
372  r[j] = parseLength((QChar *) str.unicode() + pos, str.length() - pos);
373 
374  return r;
375 }
376 
377 khtml::Length *DOMStringImpl::toLengthArray(int &len) const
378 {
379  QString str(s, l);
380  str = str.simplified();
381 
382  len = str.count(QLatin1Char(',')) + 1;
383 
384  // If we have no commas, we have no array.
385  if (len == 1) {
386  return nullptr;
387  }
388 
389  khtml::Length *r = new khtml::Length[len];
390 
391  int i = 0;
392  int pos = 0;
393  int pos2;
394 
395  while ((pos2 = str.indexOf(QLatin1Char(','), pos)) != -1) {
396  r[i++] = parseLength((QChar *) str.unicode() + pos, pos2 - pos);
397  pos = pos2 + 1;
398  }
399 
400  /* IE Quirk: If the last comma is the last char skip it and reduce len by one */
401  if (str.length() - pos > 0) {
402  r[i] = parseLength((QChar *) str.unicode() + pos, str.length() - pos);
403  } else {
404  len--;
405  }
406 
407  return r;
408 }
409 
410 bool DOMStringImpl::isLower() const
411 {
412  unsigned int i;
413  for (i = 0; i < l; i++)
414  if (s[i].toLower() != s[i]) {
415  return false;
416  }
417  return true;
418 }
419 
420 DOMStringImpl *DOMStringImpl::lower() const
421 {
422  DOMStringImpl *c = new DOMStringImpl;
423  if (!l) {
424  return c;
425  }
426 
427  c->s = QT_ALLOC_QCHAR_VEC(l);
428  c->l = l;
429 
430  for (unsigned int i = 0; i < l; i++) {
431  c->s[i] = s[i].toLower();
432  }
433 
434  return c;
435 }
436 
437 DOMStringImpl *DOMStringImpl::upper() const
438 {
439  DOMStringImpl *c = new DOMStringImpl;
440  if (!l) {
441  return c;
442  }
443 
444  c->s = QT_ALLOC_QCHAR_VEC(l);
445  c->l = l;
446 
447  for (unsigned int i = 0; i < l; i++) {
448  c->s[i] = s[i].toUpper();
449  }
450 
451  return c;
452 }
453 
454 DOMStringImpl *DOMStringImpl::capitalize(bool noFirstCap) const
455 {
456  bool canCapitalize = !noFirstCap;
457  DOMStringImpl *c = new DOMStringImpl;
458  if (!l) {
459  return c;
460  }
461 
462  c->s = QT_ALLOC_QCHAR_VEC(l);
463  c->l = l;
464 
465  for (unsigned int i = 0; i < l; i++) {
466  if (s[i].isLetterOrNumber() && canCapitalize) {
467  c->s[i] = s[i].toUpper();
468  canCapitalize = false;
469  } else {
470  c->s[i] = s[i];
471  if (s[i].isSpace()) {
472  canCapitalize = true;
473  }
474  }
475  }
476 
477  return c;
478 }
479 
480 QString DOMStringImpl::string() const
481 {
482  return QString(s, l);
483 }
484 
485 int DOMStringImpl::toInt(bool *ok) const
486 {
487  // match \s*[+-]?\d*
488  unsigned i = 0;
489  while (i < l && s[i].isSpace()) {
490  ++i;
491  }
492  if (i < l && (s[i] == '+' || s[i] == '-')) {
493  ++i;
494  }
495  while (i < l && s[i].isDigit()) {
496  ++i;
497  }
498 
499  return QString::fromRawData(s, i).toInt(ok);
500 }
501 
502 float DOMStringImpl::toFloat(bool *ok) const
503 {
504  return QString::fromRawData(s, l).toFloat(ok);
505 }
506 
507 bool DOMStringImpl::endsWith(DOMStringImpl *str, CaseSensitivity cs) const
508 {
509  if (l < str->l) {
510  return false;
511  }
512  const QChar *a = s + l - 1;
513  const QChar *b = str->s + str->l - 1;
514  int i = str->l;
515  if (cs == CaseSensitive) {
516  while (i--) {
517  if (*a != *b) {
518  return false;
519  }
520  a--, b--;
521  }
522  } else {
523  while (i--) {
524  if (a->toLower() != b->toLower()) {
525  return false;
526  }
527  a--, b--;
528  }
529  }
530  return true;
531 }
532 
533 bool DOMStringImpl::startsWith(DOMStringImpl *str, CaseSensitivity cs) const
534 {
535  if (l < str->l) {
536  return false;
537  }
538  const QChar *a = s;
539  const QChar *b = str->s;
540  int i = str->l;
541  if (cs == CaseSensitive) {
542  while (i--) {
543  if (*a != *b) {
544  return false;
545  }
546  a++, b++;
547  }
548  } else {
549  while (i--) {
550  if (a->toLower() != b->toLower()) {
551  return false;
552  }
553  a++, b++;
554  }
555  }
556  return true;
557 }
558 
559 DOMStringImpl *DOMStringImpl::substring(unsigned pos, unsigned len) const
560 {
561  if (pos >= l) {
562  return nullptr;
563  }
564  if (len > l - pos) {
565  len = l - pos;
566  }
567  return new DOMStringImpl(s + pos, len);
568 }
569 
570 static const unsigned short amp[] = {'&', 'a', 'm', 'p', ';'};
571 static const unsigned short lt[] = {'&', 'l', 't', ';'};
572 static const unsigned short gt[] = {'&', 'g', 't', ';'};
573 static const unsigned short nbsp[] = {'&', 'n', 'b', 's', 'p', ';'};
574 
575 DOMStringImpl *DOMStringImpl::escapeHTML()
576 {
577  unsigned outL = 0;
578  for (unsigned int i = 0; i < l; ++i) {
579  if (s[i] == '&') {
580  outL += 5; //&amp;
581  } else if (s[i] == '<' || s[i] == '>') {
582  outL += 4; //&gt;/&lt;
583  } else if (s[i] == QChar::Nbsp) {
584  outL += 6; //&nbsp;
585  } else {
586  ++outL;
587  }
588  }
589  if (outL == l) {
590  return this;
591  }
592 
593  DOMStringImpl *toRet = new DOMStringImpl();
594  toRet->s = QT_ALLOC_QCHAR_VEC(outL);
595  toRet->l = outL;
596 
597  unsigned outP = 0;
598  for (unsigned int i = 0; i < l; ++i) {
599  if (s[i] == '&') {
600  memcpy(&toRet->s[outP], amp, sizeof(amp));
601  outP += 5;
602  } else if (s[i] == '<') {
603  memcpy(&toRet->s[outP], lt, sizeof(lt));
604  outP += 4;
605  } else if (s[i] == '>') {
606  memcpy(&toRet->s[outP], gt, sizeof(gt));
607  outP += 4;
608  } else if (s[i] == QChar::Nbsp) {
609  memcpy(&toRet->s[outP], nbsp, sizeof(nbsp));
610  outP += 6;
611  } else {
612  toRet->s[outP] = s[i];
613  ++outP;
614  }
615  }
616  return toRet;
617 }
618 
619 enum NoFoldTag { DoNotFold };
620 enum FoldLowerTag { FoldLower };
621 enum FoldUpperTag { FoldUpper };
622 
623 static inline
624 unsigned short foldChar(unsigned short c, NoFoldTag)
625 {
626  return c;
627 }
628 
629 static inline
630 unsigned short foldChar(unsigned short c, FoldLowerTag)
631 {
632  // ### fast path for first ones?
633  return QChar::toLower(c);
634 }
635 
636 static inline
637 unsigned short foldChar(unsigned short c, FoldUpperTag)
638 {
639  // ### fast path for first ones?
640  return QChar::toUpper(c);
641 }
642 
643 // Paul Hsieh's SuperFastHash
644 // http://www.azillionmonkeys.com/qed/hash.html
645 
646 // Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
647 // or anything like that.
648 const unsigned PHI = 0x9e3779b9U;
649 
650 template<typename FoldTag>
651 static unsigned calcHash(const QChar *s, unsigned l, FoldTag foldMode)
652 {
653  // Note: this is originally from KJS
654  unsigned hash = PHI;
655  unsigned tmp;
656 
657  int rem = l & 1;
658  l >>= 1;
659 
660  // Main loop
661  for (; l > 0; l--) {
662  hash += foldChar(s[0].unicode(), foldMode);
663  tmp = (foldChar(s[1].unicode(), foldMode) << 11) ^ hash;
664  hash = (hash << 16) ^ tmp;
665  s += 2;
666  hash += hash >> 11;
667  }
668 
669  // Handle end case
670  if (rem) {
671  hash += foldChar(s[0].unicode(), foldMode);
672  hash ^= hash << 11;
673  hash += hash >> 17;
674  }
675 
676  // Force "avalanching" of final 127 bits
677  hash ^= hash << 3;
678  hash += hash >> 5;
679  hash ^= hash << 2;
680  hash += hash >> 15;
681  hash ^= hash << 10;
682 
683  // this avoids ever returning a hash code of 0, since that is used to
684  // signal "hash not computed yet", using a value that is likely to be
685  // effectively the same as 0 when the low bits are masked
686  if (hash == 0) {
687  hash = 0x80000000;
688  }
689 
690  return hash;
691 }
692 
693 unsigned DOMStringImpl::hash() const
694 {
695  if (m_hash != 0) {
696  return m_hash;
697  }
698 
699  return m_hash = calcHash(s, l, DoNotFold);
700 }
701 
702 unsigned DOMStringImpl::lowerHash() const
703 {
704  return calcHash(s, l, FoldLower);
705 }
706 
707 unsigned DOMStringImpl::upperHash() const
708 {
709  return calcHash(s, l, FoldUpper);
710 }
711 
712 unsigned DOMStringImpl::computeHash(const QChar *str, unsigned int length)
713 {
714  return calcHash(str, length, DoNotFold);
715 }
716 
717 DOMStringImpl *DOMStringImpl::empty()
718 {
719  static DOMString e("");
720  return e.implementation();
721 }
722 
723 bool DOM::strcasecmp(const DOMStringImpl *a, const DOMStringImpl *b)
724 {
725  if (!(a && b)) {
726  return (a && a->l) || (b && b->l);
727  }
728  if (a->l != b->l) {
729  return true;
730  }
731  QChar *ai = a->s;
732  QChar *bi = b->s;
733  int l = a->l;
734  while (l--) {
735  if (*ai != *bi && ai->toLower() != bi->toLower()) {
736  return true;
737  }
738  ++ai, ++bi;
739  }
740  return false;
741 }
742 
This file is part of the HTML rendering engine for KDE.
MESSAGECORE_EXPORT KMime::Content * next(KMime::Content *node, bool allowChildren=true)
QString fromRawData(const QChar *unicode, int size)
int toInt(bool *ok, int base) const const
This class implements the basic string we use in the DOM.
Definition: dom_string.h:44
QChar::Direction direction() const const
ushort unicode() const const
QChar toLower() const const
This library provides a full-featured HTML parser and widget.
QChar toUpper() const const
float toFloat(bool *ok) const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Tue Oct 26 2021 22:48:01 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.