Libksieve

lexer.cpp
1 /* -*- c++ -*-
2  parser/lexer.cpp
3 
4  This file is part of KSieve,
5  the KDE internet mail/usenet news message filtering library.
6  Copyright (c) 2002-2003 Marc Mutz <[email protected]>
7 
8  KSieve is free software; you can redistribute it and/or modify it
9  under the terms of the GNU General Public License, version 2, as
10  published by the Free Software Foundation.
11 
12  KSieve is distributed in the hope that it will be useful, but
13  WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program; if not, write to the Free Software
19  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 
21  In addition, as a special exception, the copyright holders give
22  permission to link the code of this program with any edition of
23  the Qt library by Trolltech AS, Norway (or with modified versions
24  of Qt that use the same license as Qt), and distribute linked
25  combinations including the two. You must obey the GNU General
26  Public License in all respects for all of the code used other than
27  Qt. If you modify this file, you may extend this exception to
28  your version of the file, but you are not obligated to do so. If
29  you do not wish to do so, delete this exception statement from
30  your version.
31 */
32 
33 #include <ksieve/lexer.h>
34 #include <impl/lexer.h>
35 
36 #include <impl/utf8validator.h>
37 #include <ksieve/error.h>
38 
39 #include <QString>
40 #include <QStringList>
41 #include <QTextCodec>
42 
43 #include <memory> // std::unique_ptr
44 
45 #include <assert.h>
46 #include <ctype.h> // isdigit
47 
// STR_DIM(x): size of the char array / string literal x minus one, i.e. the
// number of characters without the terminating NUL.
// #undef on a name that is not currently defined is a no-op, so no guard
// is needed to discard an earlier definition.
#undef STR_DIM
#define STR_DIM(x) (sizeof(x) - 1)
52 
53 namespace KSieve {
54 //
55 //
56 // Lexer Bridge implementation
57 //
58 //
59 
60 Lexer::Lexer(const char *scursor, const char *send, int options)
61  : i(nullptr)
62 {
63  i = new Impl(scursor, send, options);
64 }
65 
66 Lexer::~Lexer()
67 {
68  delete i;
69  i = nullptr;
70 }
71 
72 bool Lexer::ignoreComments() const
73 {
74  assert(i);
75  return i->ignoreComments();
76 }
77 
78 const Error &Lexer::error() const
79 {
80  assert(i);
81  return i->error();
82 }
83 
84 bool Lexer::atEnd() const
85 {
86  assert(i);
87  return i->atEnd();
88 }
89 
90 int Lexer::column() const
91 {
92  assert(i);
93  return i->column();
94 }
95 
96 int Lexer::line() const
97 {
98  assert(i);
99  return i->line();
100 }
101 
102 void Lexer::save()
103 {
104  assert(i);
105  i->save();
106 }
107 
108 void Lexer::restore()
109 {
110  assert(i);
111  i->restore();
112 }
113 
114 Lexer::Token Lexer::nextToken(QString &result)
115 {
116  assert(i);
117  return i->nextToken(result);
118 }
119 } // namespace KSieve
120 
// Bitmap of the characters allowed in identifiers: a-z, A-Z, 0-9 and '_'.
// One bit per US-ASCII code point, most significant bit first: byte k
// covers the code points 8*k ... 8*k+7.
static const unsigned char iTextMap[16] = {
    // 0x00 - 0x1F (control characters): none
    0x00, 0x00, 0x00, 0x00,
    // 0x20 - 0x3F (SP ... '?'): 0-9
    0x00, 0x00, 0xFF, 0xC0,
    // 0x40 - 0x5F ('@' ... '_'): A-Z and '_'
    0x7F, 0xFF, 0xFF, 0xE1,
    // 0x60 - 0x7F ('`' ... DEL): a-z
    0x7F, 0xFF, 0xFF, 0xE0
};
128 
// Bitmap of the delimiter characters: SP, HT, CR, LF, {}[]();,#/
// One bit per US-ASCII code point, most significant bit first: byte k
// covers the code points 8*k ... 8*k+7.
// ### exclude '['? Why would one want to write identifier["foo"]?
//
// The last byte of each of the two bracket rows is 0x14 (bits for offsets
// 3 and 5 only). The previous value 0x16 additionally set the bit for
// offset 6, which marked '^' (and '~') as delimiters although neither is
// listed above.
static const unsigned char delimMap[16] = {
    0x00, 0x64, 0x00, 0x00, // CTLs: CR, HT, LF
    0x90, 0xC9, 0x00, 0x10, // SP ... '?': SP, #(),/;
    0x00, 0x00, 0x00, 0x14, // '@' ... '_': []
    0x00, 0x00, 0x00, 0x14  // '`' ... DEL: {}
};
137 
// Bitmap of the characters that are illegal everywhere: everything that is
// neither identifier text nor a delimiter nor one of '"', '*', ':'.
// One bit per US-ASCII code point, most significant bit first: byte k
// covers the code points 8*k ... 8*k+7.
static const unsigned char illegalMap[16] = {
    // 0x00 - 0x1F: all control characters except HT, LF and CR
    0xFF, 0x9B, 0xFF, 0xFF,
    // 0x20 - 0x3F: ! $ % & ' + - . < = > ?
    0x4F, 0x16, 0x00, 0x0F,
    // 0x40 - 0x5F: @ \ ^
    0x80, 0x00, 0x00, 0x0A,
    // 0x60 - 0x7F: ` | ~
    0x80, 0x00, 0x00, 0x0A
};
145 
// Tests whether the bit for `ch` is set in the 128-bit table `map`.
// Byte ch/8 holds the bit; within a byte the most significant bit belongs
// to the lowest code point. `ch` must be below 128.
static inline bool isOfSet(const unsigned char map[16], unsigned char ch)
{
    assert(ch < 128);
    const unsigned char cell = map[ch >> 3];
    const int mask = 0x80 >> (ch & 7);
    return (cell & mask) != 0;
}
151 
152 static inline bool isIText(unsigned char ch)
153 {
154  return ch <= 'z' && isOfSet(iTextMap, ch);
155 }
156 
157 static inline bool isDelim(unsigned char ch)
158 {
159  return ch <= '}' && isOfSet(delimMap, ch);
160 }
161 
162 static inline bool isIllegal(unsigned char ch)
163 {
164  return ch >= '~' || isOfSet(illegalMap, ch);
165 }
166 
// True if the byte has its top bit set, i.e. it is negative when viewed
// as a signed char.
static inline bool is8Bit(signed char ch)
{
    const bool topBitSet = ch < 0;
    return topBitSet;
}
171 
172 static QString removeCRLF(const QString &s)
173 {
174  const bool CRLF = s.endsWith(QLatin1String("\r\n"));
175  const bool LF = !CRLF && s.endsWith(QLatin1Char('\n'));
176 
177  const int e = CRLF ? 2 : LF ? 1 : 0; // what to chop off at the end
178 
179  return s.left(s.length() - e);
180 }
181 
182 static QString removeDotStuff(const QString &s)
183 {
184  return s.startsWith(QLatin1String("..")) ? s.mid(1) : s;
185 }
186 
187 namespace KSieve {
188 //
189 //
190 // Lexer Implementation
191 //
192 //
193 
194 Lexer::Impl::Impl(const char *scursor, const char *send, int options)
195  : mState(scursor ? scursor : send)
196  , mEnd(send ? send : scursor)
197  , mIgnoreComments(options & IgnoreComments)
198  , mIgnoreLF(options & IgnoreLineFeeds)
199 {
200  if (!scursor || !send) {
201  assert(atEnd());
202  }
203 }
204 
205 Lexer::Token Lexer::Impl::nextToken(QString &result)
206 {
207  assert(!atEnd());
208  result.clear();
209  //clearErrors();
210 
211  const int oldLine = line();
212 
213  const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS();
214 
215  if (!ignoreLineFeeds() && oldLine != line()) {
216  result.setNum(line() - oldLine); // return number of linefeeds encountered
217  return LineFeeds;
218  }
219 
220  if (!eatingWSSucceeded) {
221  return None;
222  }
223 
224  if (atEnd()) {
225  return None;
226  }
227 
228  switch (*mState.cursor) {
229  case '#': // HashComment
230  assert(!ignoreComments());
231  ++mState.cursor;
232  if (!atEnd()) {
233  parseHashComment(result, true);
234  }
235  return HashComment;
236  case '/': // BracketComment
237  assert(!ignoreComments());
238  ++mState.cursor; // eat slash
239  if (atEnd() || *mState.cursor != '*') {
240  makeError(Error::SlashWithoutAsterisk);
241  return BracketComment;
242  }
243  ++mState.cursor; // eat asterisk
244  if (atEnd()) {
245  makeError(Error::UnfinishedBracketComment);
246  return BracketComment;
247  }
248  parseBracketComment(result, true);
249  return BracketComment;
250  case ':': // Tag
251  ++mState.cursor;
252  if (atEnd()) {
253  makeError(Error::UnexpectedCharacter, line(), column() - 1);
254  return Tag;
255  }
256  if (!isIText(*mState.cursor)) {
257  makeIllegalCharError(*mState.cursor);
258  return Tag;
259  }
260  parseTag(result);
261  return Tag;
262  case '"': // QuotedString
263  ++mState.cursor;
264  parseQuotedString(result);
265  return QuotedString;
266  case '{':
267  case '}':
268  case '[':
269  case ']':
270  case '(':
271  case ')':
272  case ';':
273  case ',': // Special
274  result = QLatin1Char(*mState.cursor++);
275  return Special;
276  case '0':
277  case '1':
278  case '2':
279  case '3':
280  case '4':
281  case '5':
282  case '6':
283  case '7':
284  case '8':
285  case '9': // Number
286  parseNumber(result);
287  return Number;
288  case 't': // maybe MultiLineString, else Identifier
289  if (_strnicmp(mState.cursor, "text:", STR_DIM("text:")) == 0) {
290  // MultiLineString
291  mState.cursor += STR_DIM("text:");
292  parseMultiLine(result);
293  // ### FIXME: There can be a hash-comment between "text:"
294  // and CRLF! That should be preserved somehow...
295  return MultiLineString;
296  }
297  Q_FALLTHROUGH();
298  default: // Identifier (first must not be 0-9, and can't (caught by Number above))
299  if (!isIText(*mState.cursor)) {
300  makeError(Error::IllegalCharacter);
301  return None;
302  }
303  parseIdentifier(result);
304  return Identifier;
305  }
306 }
307 
308 bool Lexer::Impl::eatWS()
309 {
310  while (!atEnd()) {
311  switch (*mState.cursor) {
312  case '\r':
313  case '\n':
314  if (!eatCRLF()) {
315  return false;
316  }
317  break;
318  case ' ':
319  case '\t':
320  ++mState.cursor;
321  break;
322  default:
323  return true;
324  }
325  }
326 
327  // at end:
328  return true;
329 }
330 
331 bool Lexer::Impl::eatCRLF()
332 {
333  assert(!atEnd());
334  assert(*mState.cursor == '\n' || *mState.cursor == '\r');
335 
336  if (*mState.cursor == '\r') {
337  ++mState.cursor;
338  if (atEnd() || *mState.cursor != '\n') {
339  // CR w/o LF -> error
340  makeError(Error::CRWithoutLF);
341  return false;
342  } else {
343  // good CRLF
344  newLine();
345  return true;
346  }
347  } else { /* *mState.cursor == '\n' */
348  // good, LF only
349  newLine();
350  return true;
351  }
352 }
353 
354 bool Lexer::Impl::parseHashComment(QString &result, bool reallySave)
355 {
356  // hash-comment := "#" *CHAR-NOT-CRLF CRLF
357 
358  // check that the caller plays by the rules:
359  assert(*(mState.cursor - 1) == '#');
360 
361  const char *const commentStart = mState.cursor;
362 
363  // find next CRLF:
364  while (!atEnd()) {
365  if (*mState.cursor == '\n' || *mState.cursor == '\r') {
366  break;
367  }
368  ++mState.cursor;
369  }
370  const char *const commentEnd = mState.cursor - 1;
371 
372  //Laurent it creates a problem when we have just "#F" => it doesn't see it as a comment
373 // if (commentEnd == commentStart) {
374 // return true; // # was last char in script...
375 // }
376 
377  if (atEnd() || eatCRLF()) {
378  const int commentLength = commentEnd - commentStart + 1;
379  if (commentLength > 0) {
380  if (!isValidUtf8(commentStart, commentLength)) {
381  makeError(Error::InvalidUTF8);
382  return false;
383  }
384  if (reallySave) {
385  result += QString::fromUtf8(commentStart, commentLength);
386  // In comment < or > breaks parsing => convert them to double quote
387  // See src/ksieveui/scriptsparsing/tests/failed/script1.siv
388  result.replace(QLatin1Char('<'), QLatin1Char('"'));
389  result.replace(QLatin1Char('>'), QLatin1Char('"'));
390  }
391  }
392  return true;
393  }
394 
395  return false;
396 }
397 
398 bool Lexer::Impl::parseBracketComment(QString &result, bool reallySave)
399 {
400  // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
401 
402  // check that caller plays by the rules:
403  assert(*(mState.cursor - 2) == '/');
404  assert(*(mState.cursor - 1) == '*');
405 
406  const char *const commentStart = mState.cursor;
407  const int commentCol = column() - 2;
408  const int commentLine = line();
409 
410  // find next asterisk:
411  do {
412  if (!skipTo('*')) {
413  if (!error()) {
414  makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
415  }
416  return false;
417  }
418  } while (!atEnd() && *++mState.cursor != '/');
419 
420  if (atEnd()) {
421  makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
422  return false;
423  }
424 
425  assert(*mState.cursor == '/');
426 
427  const int commentLength = mState.cursor - commentStart - 1;
428  if (commentLength > 0) {
429  if (!isValidUtf8(commentStart, commentLength)) {
430  makeError(Error::InvalidUTF8);
431  return false;
432  }
433  if (reallySave) {
434  QString tmp = QString::fromUtf8(commentStart, commentLength);
435  result += tmp.remove(QLatin1Char('\r')); // get rid of CR in CRLF pairs
436  }
437  }
438 
439  ++mState.cursor; // eat '/'
440  return true;
441 }
442 
443 bool Lexer::Impl::parseComment(QString &result, bool reallySave)
444 {
445  // comment := hash-comment / bracket-comment
446 
447  switch (*mState.cursor) {
448  case '#':
449  ++mState.cursor;
450  return parseHashComment(result, reallySave);
451  case '/':
452  if (charsLeft() < 2 || mState.cursor[1] != '*') {
453  makeError(Error::IllegalCharacter);
454  return false;
455  } else {
456  mState.cursor += 2; // eat "/*"
457  return parseBracketComment(result, reallySave);
458  }
459  default:
460  return false; // don't set an error here - there was no comment
461  }
462 }
463 
464 bool Lexer::Impl::eatCWS()
465 {
466  // white-space := 1*(SP / CRLF / HTAB / comment )
467 
468  while (!atEnd()) {
469  switch (*mState.cursor) {
470  case ' ':
471  case '\t': // SP / HTAB
472  ++mState.cursor;
473  break;
474  case '\n':
475  case '\r': // CRLF
476  if (!eatCRLF()) {
477  return false;
478  }
479  break;
480  case '#':
481  case '/':
482  { // comments
483  QString dummy;
484  if (!parseComment(dummy)) {
485  return false;
486  }
487  break;
488  }
489  default:
490  return true;
491  }
492  }
493  return true;
494 }
495 
496 bool Lexer::Impl::parseIdentifier(QString &result)
497 {
498  // identifier := (ALPHA / "_") *(ALPHA DIGIT "_")
499 
500  assert(isIText(*mState.cursor));
501 
502  const char *const identifierStart = mState.cursor;
503 
504  // first char:
505  if (isdigit(*mState.cursor)) { // no digits for the first
506  makeError(Error::NoLeadingDigits);
507  return false;
508  }
509 
510  // rest of identifier chars ( now digits are allowed ):
511  for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor) {
512  }
513 
514  const int identifierLength = mState.cursor - identifierStart;
515 
516  // Can use the fast fromLatin1 here, since identifiers are always
517  // in the us-ascii subset:
518  result += QString::fromLatin1(identifierStart, identifierLength);
519 
520  if (atEnd() || isDelim(*mState.cursor)) {
521  return true;
522  }
523 
524  makeIllegalCharError(*mState.cursor);
525  return false;
526 }
527 
528 bool Lexer::Impl::parseTag(QString &result)
529 {
530  // tag := ":" identifier
531 
532  // check that the caller plays by the rules:
533  assert(*(mState.cursor - 1) == ':');
534  assert(!atEnd());
535  assert(isIText(*mState.cursor));
536 
537  return parseIdentifier(result);
538 }
539 
540 bool Lexer::Impl::parseNumber(QString &result)
541 {
542  // number := 1*DIGIT [QUANTIFIER]
543  // QUANTIFIER := "K" / "M" / "G"
544 
545  assert(isdigit(*mState.cursor));
546 
547  while (!atEnd() && isdigit(*mState.cursor)) {
548  result += QLatin1Char(*mState.cursor++);
549  }
550 
551  if (atEnd() || isDelim(*mState.cursor)) {
552  return true;
553  }
554 
555  switch (*mState.cursor) {
556  case 'G':
557  case 'g':
558  case 'M':
559  case 'm':
560  case 'K':
561  case 'k':
562  result += QLatin1Char(*mState.cursor++);
563  break;
564  default:
565  makeIllegalCharError();
566  return false;
567  }
568 
569  // quantifier found. Check for delimiter:
570  if (atEnd() || isDelim(*mState.cursor)) {
571  return true;
572  }
573  makeIllegalCharError();
574  return false;
575 }
576 
577 bool Lexer::Impl::parseMultiLine(QString &result)
578 {
579  // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF)
580  // *(multi-line-literal / multi-line-dotstuff)
581  // "." CRLF
582  // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
583  // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
584  // ;; A line containing only "." ends the multi-line.
585  // ;; Remove a leading '.' if followed by another '.'.
586 
587  assert(_strnicmp(mState.cursor - 5, "text:", STR_DIM("text:")) == 0);
588 
589  const int mlBeginLine = line();
590  const int mlBeginCol = column() - 5;
591 
592  while (!atEnd()) {
593  switch (*mState.cursor) {
594  case ' ':
595  case '\t':
596  ++mState.cursor;
597  break;
598  case '#':
599  {
600  ++mState.cursor;
601  QString dummy;
602  if (!parseHashComment(dummy)) {
603  return false;
604  }
605  goto MultiLineStart; // break from switch _and_ while
606  }
607  case '\n':
608  case '\r':
609  if (!eatCRLF()) {
610  return false;
611  }
612  goto MultiLineStart; // break from switch _and_ while
613  default:
614  makeError(Error::NonCWSAfterTextColon);
615  return false;
616  }
617  }
618 
619 MultiLineStart:
620  if (atEnd()) {
621  makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
622  return false;
623  }
624 
625  // Now, collect the single lines until one with only a single dot is found:
626  QStringList lines;
627  while (!atEnd()) {
628  const char *const oldBeginOfLine = beginOfLine();
629  if (!skipToCRLF()) {
630  return false;
631  }
632  const int lineLength = mState.cursor - oldBeginOfLine;
633  if (lineLength > 0) {
634  if (!isValidUtf8(oldBeginOfLine, lineLength)) {
635  makeError(Error::InvalidUTF8);
636  return false;
637  }
638  const QString line = removeCRLF(QString::fromUtf8(oldBeginOfLine, lineLength));
639  lines.push_back(removeDotStuff(line));
640  if (line == QLatin1Char('.')) {
641  break;
642  }
643  } else {
644  lines.push_back(QString());
645  }
646  }
647 
648  if (lines.back() != QLatin1String(".")) {
649  makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
650  return false;
651  }
652 
653  assert(!lines.empty());
654  lines.erase(--lines.end()); // don't include the lone dot.
655  result = lines.join(QLatin1Char('\n'));
656  return true;
657 }
658 
659 bool Lexer::Impl::parseQuotedString(QString &result)
660 {
661  // quoted-string := DQUOTE *CHAR DQUOTE
662 
663  // check that caller plays by the rules:
664  assert(*(mState.cursor - 1) == '"');
665 
666  const int qsBeginCol = column() - 1;
667  const int qsBeginLine = line();
668 
669  const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
670  assert(codec);
671  const std::unique_ptr<QTextDecoder> dec(codec->makeDecoder());
672  assert(dec.get());
673 
674  while (!atEnd()) {
675  switch (*mState.cursor) {
676  case '"':
677  ++mState.cursor;
678  return true;
679  case '\r':
680  case '\n':
681  if (!eatCRLF()) {
682  return false;
683  }
684  result += QLatin1Char('\n');
685  break;
686  case '\\':
687  ++mState.cursor;
688  if (atEnd()) {
689  break;
690  }
691  Q_FALLTHROUGH();
692  default:
693  if (!is8Bit(*mState.cursor)) {
694  result += QLatin1Char(*mState.cursor++);
695  } else { // probably UTF-8
696  const char *const eightBitBegin = mState.cursor;
697  skipTo8BitEnd();
698  const int eightBitLen = mState.cursor - eightBitBegin;
699  assert(eightBitLen > 0);
700  if (isValidUtf8(eightBitBegin, eightBitLen)) {
701  result += dec->toUnicode(eightBitBegin, eightBitLen);
702  } else {
703  assert(column() >= eightBitLen);
704  makeError(Error::InvalidUTF8, line(), column() - eightBitLen);
705  return false;
706  }
707  }
708  }
709  }
710 
711  makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol);
712  return false;
713 }
714 
715 void Lexer::Impl::makeIllegalCharError(char ch)
716 {
717  makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter);
718 }
719 } // namespace KSieve
void push_back(const T &value)
QList::iterator erase(QList::iterator pos)
QString join(const QString &separator) const const
QString & remove(int position, int n)
void clear()
QString fromUtf8(const char *str, int size)
bool empty() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
void error(QWidget *parent, const QString &text, const QString &caption=QString(), Options options=Notify)
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QList::iterator end()
QString & replace(int position, int n, QChar after)
QString mid(int position, int n) const const
QTextDecoder * makeDecoder(QTextCodec::ConversionFlags flags) const const
QString & setNum(short n, int base)
QTextCodec * codecForMib(int mib)
int length() const const
QString left(int n) const const
QString fromLatin1(const char *str, int size)
T & back()
QTextStream & dec(QTextStream &stream)
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Thu Jul 9 2020 23:07:44 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.