Libksieve

lexer.cpp
1/* -*- c++ -*-
2 parser/lexer.cpp
3
4 This file is part of KSieve,
5 the KDE internet mail/usenet news message filtering library.
6 SPDX-FileCopyrightText: 2002-2003 Marc Mutz <mutz@kde.org>
7
8 SPDX-License-Identifier: GPL-2.0-only
9*/
10
#include "lexer_p.h"

#include "error.h"
#include "utf8validator.h"

#include <QByteArray> // qstrnicmp
#include <QString>
#include <QStringDecoder>
#include <QStringList>

#include <cassert>
#include <cctype> // isdigit
#include <memory> // std::unique_ptr
24
// STR_DIM(x): size of the array x minus one, i.e. the length of a string
// literal without its terminating NUL. Any earlier definition is replaced.
#if defined(STR_DIM)
#undef STR_DIM
#endif
#define STR_DIM(x) (sizeof(x) - 1)
29
30namespace KSieve
31{
32//
33//
34// Lexer Bridge implementation
35//
36//
37
38Lexer::Lexer(const char *scursor, const char *send, int options)
39 : i(new Impl(scursor, send, options))
40{
41}
42
43Lexer::~Lexer()
44{
45 delete i;
46 i = nullptr;
47}
48
49bool Lexer::ignoreComments() const
50{
51 assert(i);
52 return i->ignoreComments();
53}
54
55const Error &Lexer::error() const
56{
57 assert(i);
58 return i->error();
59}
60
61bool Lexer::atEnd() const
62{
63 assert(i);
64 return i->atEnd();
65}
66
67int Lexer::column() const
68{
69 assert(i);
70 return i->column();
71}
72
73int Lexer::line() const
74{
75 assert(i);
76 return i->line();
77}
78
79void Lexer::save()
80{
81 assert(i);
82 i->save();
83}
84
85void Lexer::restore()
86{
87 assert(i);
88 i->restore();
89}
90
91Lexer::Token Lexer::nextToken(QString &result)
92{
93 assert(i);
94 return i->nextToken(result);
95}
96} // namespace KSieve
97
// Bitmap of the characters allowed in identifier text: a-z, A-Z, 0-9 and '_'.
// One bit per 7-bit character code; inside each byte the most significant
// bit stands for the lowest code (this is how isOfSet() indexes the map).
static const unsigned char iTextMap[16] = {
    0x00, 0x00, 0x00, 0x00, // CTLs: none
    0x00, 0x00, 0xFF, 0xC0, // SP ... '?': 0-9
    0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_
    0x7F, 0xFF, 0xFF, 0xE0, // '`' ... DEL: a-z
};
117
// Bitmap of the delimiter characters: SP, HT, CR, LF, {}[]();,#/
// One bit per 7-bit character code; inside each byte the most significant
// bit stands for the lowest code (this is how isOfSet() indexes the map).
//
// The bracket and brace rows are 0x14: only the '[' / ']' and '{' / '}'
// bits are set. They used to be 0x16, which additionally marked '^' and '~'
// as delimiters although illegalMap classifies both as illegal.
//
// ### exclude '['? Why would one want to write identifier["foo"]?
static const unsigned char delimMap[16] = {
    0x00, 0x64, 0x00, 0x00, // CTLs: CR, HT, LF
    0x90, 0xC9, 0x00, 0x10, // SP ... '?': SP, #(),/;
    0x00, 0x00, 0x00, 0x14, // '@' ... '_': []
    0x00, 0x00, 0x00, 0x14, // '`' ... DEL: {}
};
138
// Bitmap of the characters that are illegal outside strings and comments:
// everything except identifier text, the delimiters, and the three
// characters double quote, asterisk and colon. DEL has no bit here;
// isIllegal() handles the top of the range with a separate comparison.
static const unsigned char illegalMap[16] = {
    0xFF, 0x9B, 0xFF, 0xFF, // CTLs: all but HT, LF, CR
    0x4F, 0x16, 0x00, 0x0F, // SP ... '?': ! $ % & ' + - . < = > ?
    0x80, 0x00, 0x00, 0x0A, // '@' ... '_': at-sign, backslash, caret
    0x80, 0x00, 0x00, 0x0A, // '`' ... DEL: backtick, vertical bar, tilde
};
141
// Tests whether the bit for character `ch` is set in the 128-bit table `map`.
// Byte ch / 8 holds the bit; within that byte, bit 0x80 belongs to the lowest
// character code. `ch` must be a 7-bit value (asserted).
static inline bool isOfSet(const unsigned char map[16], unsigned char ch)
{
    assert(ch < 128);
    const unsigned int byteIndex = ch >> 3;
    const unsigned int bitMask = 0x80u >> (ch & 7);
    return (map[byteIndex] & bitMask) != 0;
}
147
148static inline bool isIText(unsigned char ch)
149{
150 return ch <= 'z' && isOfSet(iTextMap, ch);
151}
152
153static inline bool isDelim(unsigned char ch)
154{
155 return ch <= '}' && isOfSet(delimMap, ch);
156}
157
158static inline bool isIllegal(unsigned char ch)
159{
160 return ch >= '~' || isOfSet(illegalMap, ch);
161}
162
// True if the byte has its high bit set, i.e. it is not 7-bit ASCII.
static inline bool is8Bit(signed char ch)
{
    return (static_cast<unsigned char>(ch) & 0x80) != 0;
}
167
168static QString removeCRLF(const QString &s)
169{
170 const bool CRLF = s.endsWith(QLatin1StringView("\r\n"));
171 const bool LF = !CRLF && s.endsWith(QLatin1Char('\n'));
172
173 const int e = CRLF ? 2 : LF ? 1 : 0; // what to chop off at the end
174
175 return s.left(s.length() - e);
176}
177
178static QString removeDotStuff(const QString &s)
179{
180 return s.startsWith(QLatin1StringView("..")) ? s.mid(1) : s;
181}
182
183namespace KSieve
184{
185//
186//
187// Lexer Implementation
188//
189//
190
191Lexer::Impl::Impl(const char *scursor, const char *send, int options)
192 : mState(scursor ? scursor : send)
193 , mEnd(send ? send : scursor)
194 , mIgnoreComments(options & IgnoreComments)
195 , mIgnoreLF(options & IgnoreLineFeeds)
196{
197 if (!scursor || !send) {
198 assert(atEnd());
199 }
200}
201
202Lexer::Token Lexer::Impl::nextToken(QString &result)
203{
204 assert(!atEnd());
205 result.clear();
206 // clearErrors();
207
208 const int oldLine = line();
209
210 const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS();
211
212 if (!ignoreLineFeeds() && oldLine != line()) {
213 result.setNum(line() - oldLine); // return number of linefeeds encountered
214 return LineFeeds;
215 }
216
217 if (!eatingWSSucceeded) {
218 return None;
219 }
220
221 if (atEnd()) {
222 return None;
223 }
224
225 switch (*mState.cursor) {
226 case '#': // HashComment
227 assert(!ignoreComments());
228 ++mState.cursor;
229 if (!atEnd()) {
230 parseHashComment(result, true);
231 }
232 return HashComment;
233 case '/': // BracketComment
234 assert(!ignoreComments());
235 ++mState.cursor; // eat slash
236 if (atEnd() || *mState.cursor != '*') {
237 makeError(Error::SlashWithoutAsterisk);
238 return BracketComment;
239 }
240 ++mState.cursor; // eat asterisk
241 if (atEnd()) {
242 makeError(Error::UnfinishedBracketComment);
243 return BracketComment;
244 }
245 parseBracketComment(result, true);
246 return BracketComment;
247 case ':': // Tag
248 ++mState.cursor;
249 if (atEnd()) {
250 makeError(Error::UnexpectedCharacter, line(), column() - 1);
251 return Tag;
252 }
253 if (!isIText(*mState.cursor)) {
254 makeIllegalCharError(*mState.cursor);
255 return Tag;
256 }
257 parseTag(result);
258 return Tag;
259 case '"': // QuotedString
260 ++mState.cursor;
261 parseQuotedString(result);
262 return QuotedString;
263 case '{':
264 case '}':
265 case '[':
266 case ']':
267 case '(':
268 case ')':
269 case ';':
270 case ',': // Special
271 result = QLatin1Char(*mState.cursor++);
272 return Special;
273 case '0':
274 case '1':
275 case '2':
276 case '3':
277 case '4':
278 case '5':
279 case '6':
280 case '7':
281 case '8':
282 case '9': // Number
283 parseNumber(result);
284 return Number;
285 case 't': // maybe MultiLineString, else Identifier
286 if (_strnicmp(mState.cursor, "text:", STR_DIM("text:")) == 0) {
287 // MultiLineString
288 mState.cursor += STR_DIM("text:");
289 parseMultiLine(result);
290 // ### FIXME: There can be a hash-comment between "text:"
291 // and CRLF! That should be preserved somehow...
292 return MultiLineString;
293 }
294 [[fallthrough]];
295 default: // Identifier (first must not be 0-9, and can't (caught by Number above))
296 if (!isIText(*mState.cursor)) {
297 makeError(Error::IllegalCharacter);
298 return None;
299 }
300 parseIdentifier(result);
301 return Identifier;
302 }
303}
304
305bool Lexer::Impl::eatWS()
306{
307 while (!atEnd()) {
308 switch (*mState.cursor) {
309 case '\r':
310 case '\n':
311 if (!eatCRLF()) {
312 return false;
313 }
314 break;
315 case ' ':
316 case '\t':
317 ++mState.cursor;
318 break;
319 default:
320 return true;
321 }
322 }
323
324 // at end:
325 return true;
326}
327
328bool Lexer::Impl::eatCRLF()
329{
330 assert(!atEnd());
331 assert(*mState.cursor == '\n' || *mState.cursor == '\r');
332
333 if (*mState.cursor == '\r') {
334 ++mState.cursor;
335 if (atEnd() || *mState.cursor != '\n') {
336 // CR w/o LF -> error
337 makeError(Error::CRWithoutLF);
338 return false;
339 } else {
340 // good CRLF
341 newLine();
342 return true;
343 }
344 } else { /* *mState.cursor == '\n' */
345 // good, LF only
346 newLine();
347 return true;
348 }
349}
350
351bool Lexer::Impl::parseHashComment(QString &result, bool reallySave)
352{
353 // hash-comment := "#" *CHAR-NOT-CRLF CRLF
354
355 // check that the caller plays by the rules:
356 assert(*(mState.cursor - 1) == '#');
357
358 const char *const commentStart = mState.cursor;
359
360 // find next CRLF:
361 while (!atEnd()) {
362 if (*mState.cursor == '\n' || *mState.cursor == '\r') {
363 break;
364 }
365 ++mState.cursor;
366 }
367 const char *const commentEnd = mState.cursor - 1;
368
369 // Laurent it creates a problem when we have just "#F" => it doesn't see it as a comment
370 // if (commentEnd == commentStart) {
371 // return true; // # was last char in script...
372 // }
373
374 if (atEnd() || eatCRLF()) {
375 const int commentLength = commentEnd - commentStart + 1;
376 if (commentLength > 0) {
377 if (!isValidUtf8(commentStart, commentLength)) {
378 makeError(Error::InvalidUTF8);
379 return false;
380 }
381 if (reallySave) {
382 result += QString::fromUtf8(commentStart, commentLength);
383 // In comment < or > breaks parsing => convert them to double quote
384 // See src/ksieveui/scriptsparsing/tests/failed/script1.siv
385 result.replace(QLatin1Char('<'), QLatin1Char('"'));
386 result.replace(QLatin1Char('>'), QLatin1Char('"'));
387 }
388 }
389 return true;
390 }
391
392 return false;
393}
394
395bool Lexer::Impl::parseBracketComment(QString &result, bool reallySave)
396{
397 // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
398
399 // check that caller plays by the rules:
400 assert(*(mState.cursor - 2) == '/');
401 assert(*(mState.cursor - 1) == '*');
402
403 const char *const commentStart = mState.cursor;
404 const int commentCol = column() - 2;
405 const int commentLine = line();
406
407 // find next asterisk:
408 do {
409 if (!skipTo('*')) {
410 if (!error()) {
411 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
412 }
413 return false;
414 }
415 } while (!atEnd() && *++mState.cursor != '/');
416
417 if (atEnd()) {
418 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
419 return false;
420 }
421
422 assert(*mState.cursor == '/');
423
424 const int commentLength = mState.cursor - commentStart - 1;
425 if (commentLength > 0) {
426 if (!isValidUtf8(commentStart, commentLength)) {
427 makeError(Error::InvalidUTF8);
428 return false;
429 }
430 if (reallySave) {
431 QString tmp = QString::fromUtf8(commentStart, commentLength);
432 result += tmp.remove(QLatin1Char('\r')); // get rid of CR in CRLF pairs
433 }
434 }
435
436 ++mState.cursor; // eat '/'
437 return true;
438}
439
440bool Lexer::Impl::parseComment(QString &result, bool reallySave)
441{
442 // comment := hash-comment / bracket-comment
443
444 switch (*mState.cursor) {
445 case '#':
446 ++mState.cursor;
447 return parseHashComment(result, reallySave);
448 case '/':
449 if (charsLeft() < 2 || mState.cursor[1] != '*') {
450 makeError(Error::IllegalCharacter);
451 return false;
452 } else {
453 mState.cursor += 2; // eat "/*"
454 return parseBracketComment(result, reallySave);
455 }
456 default:
457 return false; // don't set an error here - there was no comment
458 }
459}
460
461bool Lexer::Impl::eatCWS()
462{
463 // white-space := 1*(SP / CRLF / HTAB / comment )
464
465 while (!atEnd()) {
466 switch (*mState.cursor) {
467 case ' ':
468 case '\t': // SP / HTAB
469 ++mState.cursor;
470 break;
471 case '\n':
472 case '\r': // CRLF
473 if (!eatCRLF()) {
474 return false;
475 }
476 break;
477 case '#':
478 case '/': { // comments
479 QString dummy;
480 if (!parseComment(dummy)) {
481 return false;
482 }
483 break;
484 }
485 default:
486 return true;
487 }
488 }
489 return true;
490}
491
492bool Lexer::Impl::parseIdentifier(QString &result)
493{
494 // identifier := (ALPHA / "_") *(ALPHA DIGIT "_")
495
496 assert(isIText(*mState.cursor));
497
498 const char *const identifierStart = mState.cursor;
499
500 // first char:
501 if (isdigit(*mState.cursor)) { // no digits for the first
502 makeError(Error::NoLeadingDigits);
503 return false;
504 }
505
506 // rest of identifier chars ( now digits are allowed ):
507 for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor) { }
508
509 const int identifierLength = mState.cursor - identifierStart;
510
511 // Can use the fast fromLatin1 here, since identifiers are always
512 // in the us-ascii subset:
513 result += QString::fromLatin1(identifierStart, identifierLength);
514
515 if (atEnd() || isDelim(*mState.cursor)) {
516 return true;
517 }
518
519 makeIllegalCharError(*mState.cursor);
520 return false;
521}
522
523bool Lexer::Impl::parseTag(QString &result)
524{
525 // tag := ":" identifier
526
527 // check that the caller plays by the rules:
528 assert(*(mState.cursor - 1) == ':');
529 assert(!atEnd());
530 assert(isIText(*mState.cursor));
531
532 return parseIdentifier(result);
533}
534
535bool Lexer::Impl::parseNumber(QString &result)
536{
537 // number := 1*DIGIT [QUANTIFIER]
538 // QUANTIFIER := "K" / "M" / "G"
539
540 assert(isdigit(*mState.cursor));
541
542 while (!atEnd() && isdigit(*mState.cursor)) {
543 result += QLatin1Char(*mState.cursor++);
544 }
545
546 if (atEnd() || isDelim(*mState.cursor)) {
547 return true;
548 }
549
550 switch (*mState.cursor) {
551 case 'G':
552 case 'g':
553 case 'M':
554 case 'm':
555 case 'K':
556 case 'k':
557 result += QLatin1Char(*mState.cursor++);
558 break;
559 default:
560 makeIllegalCharError();
561 return false;
562 }
563
564 // quantifier found. Check for delimiter:
565 if (atEnd() || isDelim(*mState.cursor)) {
566 return true;
567 }
568 makeIllegalCharError();
569 return false;
570}
571
572bool Lexer::Impl::parseMultiLine(QString &result)
573{
574 // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF)
575 // *(multi-line-literal / multi-line-dotstuff)
576 // "." CRLF
577 // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
578 // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
579 // ;; A line containing only "." ends the multi-line.
580 // ;; Remove a leading '.' if followed by another '.'.
581
582 assert(_strnicmp(mState.cursor - 5, "text:", STR_DIM("text:")) == 0);
583
584 const int mlBeginLine = line();
585 const int mlBeginCol = column() - 5;
586
587 while (!atEnd()) {
588 switch (*mState.cursor) {
589 case ' ':
590 case '\t':
591 ++mState.cursor;
592 break;
593 case '#': {
594 ++mState.cursor;
595 QString dummy;
596 if (!parseHashComment(dummy)) {
597 return false;
598 }
599 goto MultiLineStart; // break from switch _and_ while
600 }
601 case '\n':
602 case '\r':
603 if (!eatCRLF()) {
604 return false;
605 }
606 goto MultiLineStart; // break from switch _and_ while
607 default:
608 makeError(Error::NonCWSAfterTextColon);
609 return false;
610 }
611 }
612
613MultiLineStart:
614 if (atEnd()) {
615 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
616 return false;
617 }
618
619 // Now, collect the single lines until one with only a single dot is found:
620 QStringList lines;
621 while (!atEnd()) {
622 const char *const oldBeginOfLine = beginOfLine();
623 if (!skipToCRLF()) {
624 return false;
625 }
626 const int lineLength = mState.cursor - oldBeginOfLine;
627 if (lineLength > 0) {
628 if (!isValidUtf8(oldBeginOfLine, lineLength)) {
629 makeError(Error::InvalidUTF8);
630 return false;
631 }
632 const QString line = removeCRLF(QString::fromUtf8(oldBeginOfLine, lineLength));
633 lines.push_back(removeDotStuff(line));
634 if (line == QLatin1Char('.')) {
635 break;
636 }
637 } else {
638 lines.push_back(QString());
639 }
640 }
641
642 if (lines.back() != QLatin1StringView(".")) {
643 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
644 return false;
645 }
646
647 assert(!lines.empty());
648 lines.erase(--lines.end()); // don't include the lone dot.
649 result = lines.join(QLatin1Char('\n'));
650 return true;
651}
652
653bool Lexer::Impl::parseQuotedString(QString &result)
654{
655 // quoted-string := DQUOTE *CHAR DQUOTE
656
657 // check that caller plays by the rules:
658 assert(*(mState.cursor - 1) == '"');
659
660 const int qsBeginCol = column() - 1;
661 const int qsBeginLine = line();
662
664 while (!atEnd()) {
665 switch (*mState.cursor) {
666 case '"':
667 ++mState.cursor;
668 return true;
669 case '\r':
670 case '\n':
671 if (!eatCRLF()) {
672 return false;
673 }
674 result += QLatin1Char('\n');
675 break;
676 case '\\':
677 ++mState.cursor;
678 if (atEnd()) {
679 break;
680 }
681 [[fallthrough]];
682 default:
683 if (!is8Bit(*mState.cursor)) {
684 result += QLatin1Char(*mState.cursor++);
685 } else { // probably UTF-8
686 const char *const eightBitBegin = mState.cursor;
687 skipTo8BitEnd();
688 const int eightBitLen = mState.cursor - eightBitBegin;
689 assert(eightBitLen > 0);
690 if (isValidUtf8(eightBitBegin, eightBitLen)) {
691 result += dec.decode(QByteArrayView(eightBitBegin, eightBitLen));
692 } else {
693 assert(column() >= eightBitLen);
694 makeError(Error::InvalidUTF8, line(), column() - eightBitLen);
695 return false;
696 }
697 }
698 }
699 }
700
701 makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol);
702 return false;
703}
704
705void Lexer::Impl::makeIllegalCharError(char ch)
706{
707 makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter);
708}
709} // namespace KSieve
void error(QWidget *parent, const QString &text, const QString &title, const KGuiItem &buttonOk, Options options=Notify)
reference back()
bool empty() const const
iterator end()
iterator erase(const_iterator begin, const_iterator end)
void push_back(parameter_type value)
void clear()
bool endsWith(QChar c, Qt::CaseSensitivity cs) const const
QString fromLatin1(QByteArrayView str)
QString fromUtf8(QByteArrayView str)
QString left(qsizetype n) const const
qsizetype length() const const
QString mid(qsizetype position, qsizetype n) const const
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QString & replace(QChar before, QChar after, Qt::CaseSensitivity cs)
QString & setNum(double n, char format, int precision)
bool startsWith(QChar c, Qt::CaseSensitivity cs) const const
QString join(QChar separator) const const
QTextStream & dec(QTextStream &stream)
QFuture< void > map(Iterator begin, Iterator end, MapFunctor &&function)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:17:19 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.