Libksieve

lexer.cpp
1 /* -*- c++ -*-
2  parser/lexer.cpp
3 
4  This file is part of KSieve,
5  the KDE internet mail/usenet news message filtering library.
6  SPDX-FileCopyrightText: 2002-2003 Marc Mutz <[email protected]>
7 
8  SPDX-License-Identifier: GPL-2.0-only
9 */
10 
11 #include <impl/lexer.h>
12 #include <ksieve/lexer.h>
13 
14 #include <impl/utf8validator.h>
15 #include <ksieve/error.h>
16 
17 #include <QString>
18 #include <QStringList>
19 #include <QTextCodec>
20 
21 #include <memory> // std::unique_ptr
22 
23 #include <assert.h>
24 #include <ctype.h> // isdigit
25 
26 #ifdef STR_DIM
27 #undef STR_DIM
28 #endif
29 #define STR_DIM(x) (sizeof(x) - 1)
30 
31 namespace KSieve
32 {
33 //
34 //
35 // Lexer Bridge implementation
36 //
37 //
38 
39 Lexer::Lexer(const char *scursor, const char *send, int options)
40  : i(new Impl(scursor, send, options))
41 {
42 }
43 
44 Lexer::~Lexer()
45 {
46  delete i;
47  i = nullptr;
48 }
49 
50 bool Lexer::ignoreComments() const
51 {
52  assert(i);
53  return i->ignoreComments();
54 }
55 
56 const Error &Lexer::error() const
57 {
58  assert(i);
59  return i->error();
60 }
61 
62 bool Lexer::atEnd() const
63 {
64  assert(i);
65  return i->atEnd();
66 }
67 
68 int Lexer::column() const
69 {
70  assert(i);
71  return i->column();
72 }
73 
74 int Lexer::line() const
75 {
76  assert(i);
77  return i->line();
78 }
79 
80 void Lexer::save()
81 {
82  assert(i);
83  i->save();
84 }
85 
86 void Lexer::restore()
87 {
88  assert(i);
89  i->restore();
90 }
91 
92 Lexer::Token Lexer::nextToken(QString &result)
93 {
94  assert(i);
95  return i->nextToken(result);
96 }
97 } // namespace KSieve
98 
// Bitmap of the characters allowed in identifier text: a-zA-Z0-9_ only.
// One bit per 7-bit character; the most significant bit of byte k stands
// for character 8*k.
static const unsigned char iTextMap[16] = {
    0x00, 0x00, 0x00, 0x00, // CTLs: none
    0x00, 0x00, 0xFF, 0xC0, // SP ... '?': 0-9
    0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_
    0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL: a-z
};
118 
// Bitmap of the delimiter characters: SP, HT, CR, LF, {}[]();,#/
// One bit per 7-bit character; the most significant bit of byte k stands
// for character 8*k.
// ### exclude '['? Why would one want to write identifier["foo"]?
//
// Bytes 11 and 15 are 0x14, selecting exactly '[' ']' and '{' '}'.
// (They used to be 0x16, which additionally flagged '^' and '~' as
// delimiters although both are listed as illegal characters.)
static const unsigned char delimMap[16] = {
    0x00,
    0x64,
    0x00,
    0x00, // CTLs: CR, HT, LF
    0x90,
    0xC9,
    0x00,
    0x10, // SP ... '?': SP, #(),;/
    0x00,
    0x00,
    0x00,
    0x14, // '@' ... '_': []
    0x00,
    0x00,
    0x00,
    0x14 // '`' ... DEL: {}
};
139 
// Bitmap of the illegal characters: all except iText, delim, "*:
// One bit per 7-bit character; the most significant bit of byte k stands
// for character 8*k.
static const unsigned char illegalMap[16] = {
    0xFF, 0x9B, 0xFF, 0xFF, // CTLs: all but HT, LF, CR
    0x4F, 0x16, 0x00, 0x0F, // SP ... '?': !$%&'+-.<=>?
    0x80, 0x00, 0x00, 0x0A, // '@' ... '_': @, backslash, ^
    0x80, 0x00, 0x00, 0x0A  // '`' ... DEL: `, |, ~
};
142 
// Tests whether the 7-bit character `ch` is a member of the 128-bit
// character set `map`. The most significant bit of map[k] stands for
// character 8*k, the least significant one for character 8*k+7.
static inline bool isOfSet(const unsigned char map[16], unsigned char ch)
{
    assert(ch < 128);
    const unsigned int byteIndex = ch / 8;
    const unsigned int bitMask = 0x80u >> (ch % 8);
    return (map[byteIndex] & bitMask) != 0;
}
148 
149 static inline bool isIText(unsigned char ch)
150 {
151  return ch <= 'z' && isOfSet(iTextMap, ch);
152 }
153 
154 static inline bool isDelim(unsigned char ch)
155 {
156  return ch <= '}' && isOfSet(delimMap, ch);
157 }
158 
159 static inline bool isIllegal(unsigned char ch)
160 {
161  return ch >= '~' || isOfSet(illegalMap, ch);
162 }
163 
// True if the high bit of `ch` is set, i.e. the byte lies outside 7-bit
// US-ASCII.
static inline bool is8Bit(signed char ch)
{
    return (static_cast<unsigned char>(ch) & 0x80u) != 0;
}
168 
169 static QString removeCRLF(const QString &s)
170 {
171  const bool CRLF = s.endsWith(QLatin1String("\r\n"));
172  const bool LF = !CRLF && s.endsWith(QLatin1Char('\n'));
173 
174  const int e = CRLF ? 2 : LF ? 1 : 0; // what to chop off at the end
175 
176  return s.left(s.length() - e);
177 }
178 
179 static QString removeDotStuff(const QString &s)
180 {
181  return s.startsWith(QLatin1String("..")) ? s.mid(1) : s;
182 }
183 
184 namespace KSieve
185 {
186 //
187 //
188 // Lexer Implementation
189 //
190 //
191 
192 Lexer::Impl::Impl(const char *scursor, const char *send, int options)
193  : mState(scursor ? scursor : send)
194  , mEnd(send ? send : scursor)
195  , mIgnoreComments(options & IgnoreComments)
196  , mIgnoreLF(options & IgnoreLineFeeds)
197 {
198  if (!scursor || !send) {
199  assert(atEnd());
200  }
201 }
202 
203 Lexer::Token Lexer::Impl::nextToken(QString &result)
204 {
205  assert(!atEnd());
206  result.clear();
207  // clearErrors();
208 
209  const int oldLine = line();
210 
211  const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS();
212 
213  if (!ignoreLineFeeds() && oldLine != line()) {
214  result.setNum(line() - oldLine); // return number of linefeeds encountered
215  return LineFeeds;
216  }
217 
218  if (!eatingWSSucceeded) {
219  return None;
220  }
221 
222  if (atEnd()) {
223  return None;
224  }
225 
226  switch (*mState.cursor) {
227  case '#': // HashComment
228  assert(!ignoreComments());
229  ++mState.cursor;
230  if (!atEnd()) {
231  parseHashComment(result, true);
232  }
233  return HashComment;
234  case '/': // BracketComment
235  assert(!ignoreComments());
236  ++mState.cursor; // eat slash
237  if (atEnd() || *mState.cursor != '*') {
238  makeError(Error::SlashWithoutAsterisk);
239  return BracketComment;
240  }
241  ++mState.cursor; // eat asterisk
242  if (atEnd()) {
243  makeError(Error::UnfinishedBracketComment);
244  return BracketComment;
245  }
246  parseBracketComment(result, true);
247  return BracketComment;
248  case ':': // Tag
249  ++mState.cursor;
250  if (atEnd()) {
251  makeError(Error::UnexpectedCharacter, line(), column() - 1);
252  return Tag;
253  }
254  if (!isIText(*mState.cursor)) {
255  makeIllegalCharError(*mState.cursor);
256  return Tag;
257  }
258  parseTag(result);
259  return Tag;
260  case '"': // QuotedString
261  ++mState.cursor;
262  parseQuotedString(result);
263  return QuotedString;
264  case '{':
265  case '}':
266  case '[':
267  case ']':
268  case '(':
269  case ')':
270  case ';':
271  case ',': // Special
272  result = QLatin1Char(*mState.cursor++);
273  return Special;
274  case '0':
275  case '1':
276  case '2':
277  case '3':
278  case '4':
279  case '5':
280  case '6':
281  case '7':
282  case '8':
283  case '9': // Number
284  parseNumber(result);
285  return Number;
286  case 't': // maybe MultiLineString, else Identifier
287  if (_strnicmp(mState.cursor, "text:", STR_DIM("text:")) == 0) {
288  // MultiLineString
289  mState.cursor += STR_DIM("text:");
290  parseMultiLine(result);
291  // ### FIXME: There can be a hash-comment between "text:"
292  // and CRLF! That should be preserved somehow...
293  return MultiLineString;
294  }
295  Q_FALLTHROUGH();
296  default: // Identifier (first must not be 0-9, and can't (caught by Number above))
297  if (!isIText(*mState.cursor)) {
298  makeError(Error::IllegalCharacter);
299  return None;
300  }
301  parseIdentifier(result);
302  return Identifier;
303  }
304 }
305 
306 bool Lexer::Impl::eatWS()
307 {
308  while (!atEnd()) {
309  switch (*mState.cursor) {
310  case '\r':
311  case '\n':
312  if (!eatCRLF()) {
313  return false;
314  }
315  break;
316  case ' ':
317  case '\t':
318  ++mState.cursor;
319  break;
320  default:
321  return true;
322  }
323  }
324 
325  // at end:
326  return true;
327 }
328 
329 bool Lexer::Impl::eatCRLF()
330 {
331  assert(!atEnd());
332  assert(*mState.cursor == '\n' || *mState.cursor == '\r');
333 
334  if (*mState.cursor == '\r') {
335  ++mState.cursor;
336  if (atEnd() || *mState.cursor != '\n') {
337  // CR w/o LF -> error
338  makeError(Error::CRWithoutLF);
339  return false;
340  } else {
341  // good CRLF
342  newLine();
343  return true;
344  }
345  } else { /* *mState.cursor == '\n' */
346  // good, LF only
347  newLine();
348  return true;
349  }
350 }
351 
352 bool Lexer::Impl::parseHashComment(QString &result, bool reallySave)
353 {
354  // hash-comment := "#" *CHAR-NOT-CRLF CRLF
355 
356  // check that the caller plays by the rules:
357  assert(*(mState.cursor - 1) == '#');
358 
359  const char *const commentStart = mState.cursor;
360 
361  // find next CRLF:
362  while (!atEnd()) {
363  if (*mState.cursor == '\n' || *mState.cursor == '\r') {
364  break;
365  }
366  ++mState.cursor;
367  }
368  const char *const commentEnd = mState.cursor - 1;
369 
370  // Laurent it creates a problem when we have just "#F" => it doesn't see it as a comment
371  // if (commentEnd == commentStart) {
372  // return true; // # was last char in script...
373  // }
374 
375  if (atEnd() || eatCRLF()) {
376  const int commentLength = commentEnd - commentStart + 1;
377  if (commentLength > 0) {
378  if (!isValidUtf8(commentStart, commentLength)) {
379  makeError(Error::InvalidUTF8);
380  return false;
381  }
382  if (reallySave) {
383  result += QString::fromUtf8(commentStart, commentLength);
384  // In comment < or > breaks parsing => convert them to double quote
385  // See src/ksieveui/scriptsparsing/tests/failed/script1.siv
386  result.replace(QLatin1Char('<'), QLatin1Char('"'));
387  result.replace(QLatin1Char('>'), QLatin1Char('"'));
388  }
389  }
390  return true;
391  }
392 
393  return false;
394 }
395 
396 bool Lexer::Impl::parseBracketComment(QString &result, bool reallySave)
397 {
398  // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
399 
400  // check that caller plays by the rules:
401  assert(*(mState.cursor - 2) == '/');
402  assert(*(mState.cursor - 1) == '*');
403 
404  const char *const commentStart = mState.cursor;
405  const int commentCol = column() - 2;
406  const int commentLine = line();
407 
408  // find next asterisk:
409  do {
410  if (!skipTo('*')) {
411  if (!error()) {
412  makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
413  }
414  return false;
415  }
416  } while (!atEnd() && *++mState.cursor != '/');
417 
418  if (atEnd()) {
419  makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
420  return false;
421  }
422 
423  assert(*mState.cursor == '/');
424 
425  const int commentLength = mState.cursor - commentStart - 1;
426  if (commentLength > 0) {
427  if (!isValidUtf8(commentStart, commentLength)) {
428  makeError(Error::InvalidUTF8);
429  return false;
430  }
431  if (reallySave) {
432  QString tmp = QString::fromUtf8(commentStart, commentLength);
433  result += tmp.remove(QLatin1Char('\r')); // get rid of CR in CRLF pairs
434  }
435  }
436 
437  ++mState.cursor; // eat '/'
438  return true;
439 }
440 
441 bool Lexer::Impl::parseComment(QString &result, bool reallySave)
442 {
443  // comment := hash-comment / bracket-comment
444 
445  switch (*mState.cursor) {
446  case '#':
447  ++mState.cursor;
448  return parseHashComment(result, reallySave);
449  case '/':
450  if (charsLeft() < 2 || mState.cursor[1] != '*') {
451  makeError(Error::IllegalCharacter);
452  return false;
453  } else {
454  mState.cursor += 2; // eat "/*"
455  return parseBracketComment(result, reallySave);
456  }
457  default:
458  return false; // don't set an error here - there was no comment
459  }
460 }
461 
462 bool Lexer::Impl::eatCWS()
463 {
464  // white-space := 1*(SP / CRLF / HTAB / comment )
465 
466  while (!atEnd()) {
467  switch (*mState.cursor) {
468  case ' ':
469  case '\t': // SP / HTAB
470  ++mState.cursor;
471  break;
472  case '\n':
473  case '\r': // CRLF
474  if (!eatCRLF()) {
475  return false;
476  }
477  break;
478  case '#':
479  case '/': { // comments
480  QString dummy;
481  if (!parseComment(dummy)) {
482  return false;
483  }
484  break;
485  }
486  default:
487  return true;
488  }
489  }
490  return true;
491 }
492 
493 bool Lexer::Impl::parseIdentifier(QString &result)
494 {
495  // identifier := (ALPHA / "_") *(ALPHA DIGIT "_")
496 
497  assert(isIText(*mState.cursor));
498 
499  const char *const identifierStart = mState.cursor;
500 
501  // first char:
502  if (isdigit(*mState.cursor)) { // no digits for the first
503  makeError(Error::NoLeadingDigits);
504  return false;
505  }
506 
507  // rest of identifier chars ( now digits are allowed ):
508  for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor) { }
509 
510  const int identifierLength = mState.cursor - identifierStart;
511 
512  // Can use the fast fromLatin1 here, since identifiers are always
513  // in the us-ascii subset:
514  result += QString::fromLatin1(identifierStart, identifierLength);
515 
516  if (atEnd() || isDelim(*mState.cursor)) {
517  return true;
518  }
519 
520  makeIllegalCharError(*mState.cursor);
521  return false;
522 }
523 
524 bool Lexer::Impl::parseTag(QString &result)
525 {
526  // tag := ":" identifier
527 
528  // check that the caller plays by the rules:
529  assert(*(mState.cursor - 1) == ':');
530  assert(!atEnd());
531  assert(isIText(*mState.cursor));
532 
533  return parseIdentifier(result);
534 }
535 
536 bool Lexer::Impl::parseNumber(QString &result)
537 {
538  // number := 1*DIGIT [QUANTIFIER]
539  // QUANTIFIER := "K" / "M" / "G"
540 
541  assert(isdigit(*mState.cursor));
542 
543  while (!atEnd() && isdigit(*mState.cursor)) {
544  result += QLatin1Char(*mState.cursor++);
545  }
546 
547  if (atEnd() || isDelim(*mState.cursor)) {
548  return true;
549  }
550 
551  switch (*mState.cursor) {
552  case 'G':
553  case 'g':
554  case 'M':
555  case 'm':
556  case 'K':
557  case 'k':
558  result += QLatin1Char(*mState.cursor++);
559  break;
560  default:
561  makeIllegalCharError();
562  return false;
563  }
564 
565  // quantifier found. Check for delimiter:
566  if (atEnd() || isDelim(*mState.cursor)) {
567  return true;
568  }
569  makeIllegalCharError();
570  return false;
571 }
572 
573 bool Lexer::Impl::parseMultiLine(QString &result)
574 {
575  // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF)
576  // *(multi-line-literal / multi-line-dotstuff)
577  // "." CRLF
578  // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
579  // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
580  // ;; A line containing only "." ends the multi-line.
581  // ;; Remove a leading '.' if followed by another '.'.
582 
583  assert(_strnicmp(mState.cursor - 5, "text:", STR_DIM("text:")) == 0);
584 
585  const int mlBeginLine = line();
586  const int mlBeginCol = column() - 5;
587 
588  while (!atEnd()) {
589  switch (*mState.cursor) {
590  case ' ':
591  case '\t':
592  ++mState.cursor;
593  break;
594  case '#': {
595  ++mState.cursor;
596  QString dummy;
597  if (!parseHashComment(dummy)) {
598  return false;
599  }
600  goto MultiLineStart; // break from switch _and_ while
601  }
602  case '\n':
603  case '\r':
604  if (!eatCRLF()) {
605  return false;
606  }
607  goto MultiLineStart; // break from switch _and_ while
608  default:
609  makeError(Error::NonCWSAfterTextColon);
610  return false;
611  }
612  }
613 
614 MultiLineStart:
615  if (atEnd()) {
616  makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
617  return false;
618  }
619 
620  // Now, collect the single lines until one with only a single dot is found:
621  QStringList lines;
622  while (!atEnd()) {
623  const char *const oldBeginOfLine = beginOfLine();
624  if (!skipToCRLF()) {
625  return false;
626  }
627  const int lineLength = mState.cursor - oldBeginOfLine;
628  if (lineLength > 0) {
629  if (!isValidUtf8(oldBeginOfLine, lineLength)) {
630  makeError(Error::InvalidUTF8);
631  return false;
632  }
633  const QString line = removeCRLF(QString::fromUtf8(oldBeginOfLine, lineLength));
634  lines.push_back(removeDotStuff(line));
635  if (line == QLatin1Char('.')) {
636  break;
637  }
638  } else {
639  lines.push_back(QString());
640  }
641  }
642 
643  if (lines.back() != QLatin1String(".")) {
644  makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
645  return false;
646  }
647 
648  assert(!lines.empty());
649  lines.erase(--lines.end()); // don't include the lone dot.
650  result = lines.join(QLatin1Char('\n'));
651  return true;
652 }
653 
654 bool Lexer::Impl::parseQuotedString(QString &result)
655 {
656  // quoted-string := DQUOTE *CHAR DQUOTE
657 
658  // check that caller plays by the rules:
659  assert(*(mState.cursor - 1) == '"');
660 
661  const int qsBeginCol = column() - 1;
662  const int qsBeginLine = line();
663 
664  const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
665  assert(codec);
666  const std::unique_ptr<QTextDecoder> dec(codec->makeDecoder());
667  assert(dec.get());
668 
669  while (!atEnd()) {
670  switch (*mState.cursor) {
671  case '"':
672  ++mState.cursor;
673  return true;
674  case '\r':
675  case '\n':
676  if (!eatCRLF()) {
677  return false;
678  }
679  result += QLatin1Char('\n');
680  break;
681  case '\\':
682  ++mState.cursor;
683  if (atEnd()) {
684  break;
685  }
686  Q_FALLTHROUGH();
687  default:
688  if (!is8Bit(*mState.cursor)) {
689  result += QLatin1Char(*mState.cursor++);
690  } else { // probably UTF-8
691  const char *const eightBitBegin = mState.cursor;
692  skipTo8BitEnd();
693  const int eightBitLen = mState.cursor - eightBitBegin;
694  assert(eightBitLen > 0);
695  if (isValidUtf8(eightBitBegin, eightBitLen)) {
696  result += dec->toUnicode(eightBitBegin, eightBitLen);
697  } else {
698  assert(column() >= eightBitLen);
699  makeError(Error::InvalidUTF8, line(), column() - eightBitLen);
700  return false;
701  }
702  }
703  }
704  }
705 
706  makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol);
707  return false;
708 }
709 
710 void Lexer::Impl::makeIllegalCharError(char ch)
711 {
712  makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter);
713 }
714 } // namespace KSieve
void push_back(const T &value)
QList::iterator erase(QList::iterator pos)
QString join(const QString &separator) const const
QString & remove(int position, int n)
void clear()
QString fromUtf8(const char *str, int size)
bool empty() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
void error(QWidget *parent, const QString &text, const QString &caption=QString(), Options options=Notify)
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QList::iterator end()
QString & replace(int position, int n, QChar after)
QString mid(int position, int n) const const
QTextDecoder * makeDecoder(QTextCodec::ConversionFlags flags) const const
QString & setNum(short n, int base)
QTextCodec * codecForMib(int mib)
int length() const const
QString left(int n) const const
QString fromLatin1(const char *str, int size)
T & back()
QTextStream & dec(QTextStream &stream)
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Fri Apr 16 2021 23:09:33 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.