KMime

kmime_header_parsing.cpp
1 /* -*- c++ -*-
2  kmime_header_parsing.cpp
3 
4  KMime, the KDE Internet mail/usenet news message library.
5  SPDX-FileCopyrightText: 2001-2002 Marc Mutz <[email protected]>
6 
7  SPDX-License-Identifier: LGPL-2.0-or-later
8 */
9 
10 #include "kmime_header_parsing.h"
11 
12 #include "kmime_headerfactory_p.h"
13 #include "kmime_headers.h"
14 #include "kmime_util.h"
15 #include "kmime_util_p.h"
16 #include "kmime_codecs.h"
17 #include "kmime_dateformatter.h"
18 #include "kmime_debug.h"
19 #include "kmime_warning.h"
20 
21 #include <KCharsets>
22 
23 #include <KCodecs/KCodecs>
24 
25 #include <QTextCodec>
26 #include <QMap>
27 #include <QStringList>
28 
29 #include <ctype.h> // for isdigit
30 #include <cassert>
31 
32 using namespace KMime;
33 using namespace KMime::Types;
34 
35 namespace KMime
36 {
37 
38  namespace Types
39  {
40  // Optimization to avoid allocating QStrings when the value isn't encoded
41  struct KMIME_EXPORT QStringOrQPair {
42  QStringOrQPair() : qstring(), qpair(nullptr, 0) {}
43  QString qstring;
45  };
46  } // namespace Types
47 
48 namespace HeaderParsing
49 {
50 
51 // parse the encoded-word (scursor points to after the initial '=')
52 bool parseEncodedWord(const char *&scursor, const char *const send,
53  QString &result, QByteArray &language,
54  QByteArray &usedCS, const QByteArray &defaultCS,
55  bool forceCS)
56 {
57  // make sure the caller already did a bit of the work.
58  assert(*(scursor - 1) == '=');
59 
60  //
61  // STEP 1:
62  // scan for the charset/language portion of the encoded-word
63  //
64 
65  char ch = *scursor++;
66 
67  if (ch != '?') {
68  // qCDebug(KMIME_LOG) << "first";
69  //KMIME_WARN_PREMATURE_END_OF( EncodedWord );
70  return false;
71  }
72 
73  // remember start of charset (ie. just after the initial "=?") and
74  // language (just after the first '*') fields:
75  const char *charsetStart = scursor;
76  const char *languageStart = nullptr;
77 
78  // find delimiting '?' (and the '*' separating charset and language
79  // tags, if any):
80  for (; scursor != send ; scursor++) {
81  if (*scursor == '?') {
82  break;
83  } else if (*scursor == '*' && languageStart == nullptr) {
84  languageStart = scursor + 1;
85  }
86  }
87 
88  // not found? can't be an encoded-word!
89  if (scursor == send || *scursor != '?') {
90  // qCDebug(KMIME_LOG) << "second";
91  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
92  return false;
93  }
94 
95  // extract the language information, if any (if languageStart is 0,
96  // language will be null, too):
97  QByteArray maybeLanguage(languageStart, scursor - languageStart);
98  // extract charset information (keep in mind: the size given to the
99  // ctor is one off due to the \0 terminator):
100  QByteArray maybeCharset(charsetStart,
101  (languageStart ? languageStart - 1 : scursor) - charsetStart);
102 
103  //
104  // STEP 2:
105  // scan for the encoding portion of the encoded-word
106  //
107 
108  // remember start of encoding (just _after_ the second '?'):
109  scursor++;
110  const char *encodingStart = scursor;
111 
112  // find next '?' (ending the encoding tag):
113  for (; scursor != send ; scursor++) {
114  if (*scursor == '?') {
115  break;
116  }
117  }
118 
119  // not found? Can't be an encoded-word!
120  if (scursor == send || *scursor != '?') {
121  // qCDebug(KMIME_LOG) << "third";
122  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
123  return false;
124  }
125 
126  // extract the encoding information:
127  QByteArray maybeEncoding(encodingStart, scursor - encodingStart);
128 
129  // qCDebug(KMIME_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset
130  // << "\"; language == \"" << maybeLanguage
131  // << "\"; encoding == \"" << maybeEncoding << "\"";
132 
133  //
134  // STEP 3:
135  // scan for encoded-text portion of encoded-word
136  //
137 
138  // remember start of encoded-text (just after the third '?'):
139  scursor++;
140  const char *encodedTextStart = scursor;
141 
142  // find the '?=' sequence (ending the encoded-text):
143  for (; scursor != send ; scursor++) {
144  if (*scursor == '?') {
145  if (scursor + 1 != send) {
146  if (*(scursor + 1) != '=') { // We expect a '=' after the '?', but we got something else; ignore
147  KMIME_WARN << "Stray '?' in q-encoded word, ignoring this.";
148  continue;
149  } else { // yep, found a '?=' sequence
150  scursor += 2;
151  break;
152  }
153  } else { // The '?' is the last char, but we need a '=' after it!
154  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
155  return false;
156  }
157  }
158  }
159 
160  if (*(scursor - 2) != '?' || *(scursor - 1) != '=' ||
161  scursor < encodedTextStart + 2) {
162  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
163  return false;
164  }
165 
166  // set end sentinel for encoded-text:
167  const char *const encodedTextEnd = scursor - 2;
168 
169  //
170  // STEP 4:
171  // setup decoders for the transfer encoding and the charset
172  //
173 
174  // try if there's a codec for the encoding found:
175  KCodecs::Codec *codec = KCodecs::Codec::codecForName(maybeEncoding);
176  if (!codec) {
177  KMIME_WARN_UNKNOWN(Encoding, maybeEncoding);
178  return false;
179  }
180 
181  // get an instance of a corresponding decoder:
182  KCodecs::Decoder *dec = codec->makeDecoder();
183  assert(dec);
184 
185  // try if there's a (text)codec for the charset found:
186  bool matchOK = false;
187  QTextCodec *textCodec = nullptr;
188  if (forceCS || maybeCharset.isEmpty()) {
189  textCodec = KCharsets::charsets()->codecForName(QLatin1String(defaultCS), matchOK);
190  usedCS = cachedCharset(defaultCS);
191  } else {
192  textCodec = KCharsets::charsets()->codecForName(QLatin1String(maybeCharset), matchOK);
193  if (!matchOK) { //no suitable codec found => use default charset
194  textCodec = KCharsets::charsets()->codecForName(QLatin1String(defaultCS), matchOK);
195  usedCS = cachedCharset(defaultCS);
196  } else {
197  usedCS = cachedCharset(maybeCharset);
198  }
199  }
200 
201  if (!matchOK || !textCodec) {
202  KMIME_WARN_UNKNOWN(Charset, maybeCharset);
203  delete dec;
204  return false;
205  };
206 
207  // qCDebug(KMIME_LOG) << "mimeName(): \"" << textCodec->name() << "\"";
208 
209  // allocate a temporary buffer to store the 8bit text:
210  int encodedTextLength = encodedTextEnd - encodedTextStart;
211  QByteArray buffer;
212  buffer.resize(codec->maxDecodedSizeFor(encodedTextLength));
213  char *bbegin = buffer.data();
214  char *bend = bbegin + buffer.length();
215 
216  //
217  // STEP 5:
218  // do the actual decoding
219  //
220 
221  if (!dec->decode(encodedTextStart, encodedTextEnd, bbegin, bend)) {
222  KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
223  << encodedTextLength << ")\nresult may be truncated";
224  }
225 
226  result = textCodec->toUnicode(buffer.data(), bbegin - buffer.data());
227 
228  // qCDebug(KMIME_LOG) << "result now: \"" << result << "\"";
229  // cleanup:
230  delete dec;
231  language = maybeLanguage;
232 
233  return true;
234 }
235 
// Advances scursor past any run of spaces, tabs, carriage returns and line
// feeds, stopping at the first other character or at send.
static inline void eatWhiteSpace(const char *&scursor, const char *const send)
{
    for (; scursor != send; ++scursor) {
        switch (*scursor) {
        case ' ':
        case '\n':
        case '\t':
        case '\r':
            continue; // still whitespace: keep going
        default:
            return;   // first non-whitespace character
        }
    }
}
244 
245 bool parseAtom(const char*&scursor, const char *const send,
246  QByteArray &result, bool allow8Bit)
247 {
248  QPair<const char *, int> maybeResult;
249 
250  if (parseAtom(scursor, send, maybeResult, allow8Bit)) {
251  result = QByteArray(maybeResult.first, maybeResult.second);
252  return true;
253  }
254 
255  return false;
256 }
257 
258 bool parseAtom(const char*&scursor, const char *const send,
259  QPair<const char *, int> &result, bool allow8Bit)
260 {
261  bool success = false;
262  const char *start = scursor;
263 
264  while (scursor != send) {
265  signed char ch = *scursor++;
266  if (ch > 0 && isAText(ch)) {
267  // AText: OK
268  success = true;
269  } else if (allow8Bit && ch < 0) {
270  // 8bit char: not OK, but be tolerant.
271  KMIME_WARN_8BIT(ch);
272  success = true;
273  } else {
274  // CTL or special - marking the end of the atom:
275  // re-set sursor to point to the offending
276  // char and return:
277  scursor--;
278  break;
279  }
280  }
281  result.first = start;
282  result.second = scursor - start;
283  return success;
284 }
285 
286 bool parseToken(const char*&scursor, const char *const send,
287  QByteArray &result, ParseTokenFlags flags)
288 {
289  QPair<const char *, int> maybeResult;
290 
291  if (parseToken(scursor, send, maybeResult, flags)) {
292  result = QByteArray(maybeResult.first, maybeResult.second);
293  return true;
294  }
295 
296  return false;
297 }
298 
299 bool parseToken(const char*&scursor, const char *const send,
300  QPair<const char *, int> &result, ParseTokenFlags flags)
301 {
302  bool success = false;
303  const char *start = scursor;
304 
305  while (scursor != send) {
306  signed char ch = *scursor++;
307  if (ch > 0 && isTText(ch)) {
308  // TText: OK
309  success = true;
310  } else if ((flags & ParseTokenAllow8Bit) && ch < 0) {
311  // 8bit char: not OK, but be tolerant.
312  KMIME_WARN_8BIT(ch);
313  success = true;
314  } else if ((flags & ParseTokenRelaxedTText) && ch == '/') {
315  success = true;
316  } else {
317  // CTL or tspecial - marking the end of the atom:
318  // re-set sursor to point to the offending
319  // char and return:
320  scursor--;
321  break;
322  }
323  }
324  result.first = start;
325  result.second = scursor - start;
326  return success;
327 }
328 
// Reads the next input character into the variable "ch" and advances
// scursor. If the input is already exhausted (scursor == send) it emits a
// premature-end warning for GenericQuotedString and makes the *enclosing
// function* return false.
// Expects "scursor", "send" and an assignable "ch" to be in scope where it
// is used.
#define READ_ch_OR_FAIL if ( scursor == send ) {        \
        KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
        return false;                                   \
    } else {                                            \
        ch = *scursor++;                                \
    }
335 
336 // known issues:
337 //
338 // - doesn't handle quoted CRLF
339 
340 bool parseGenericQuotedString(const char *&scursor, const char *const send,
341  QString &result, bool isCRLF,
342  const char openChar, const char closeChar)
343 {
344  // We are in a quoted-string or domain-literal or comment and the
345  // cursor points to the first char after the openChar.
346  // We will apply unfolding and quoted-pair removal.
347  // We return when we either encounter the end or unescaped openChar
348  // or closeChar.
349  assert(*(scursor - 1) == openChar || *(scursor - 1) == closeChar);
350 
351  while (scursor != send) {
352  char ch = *scursor++;
353 
354  if (ch == closeChar || ch == openChar) {
355  // end of quoted-string or another opening char:
356  // let caller decide what to do.
357  return true;
358  }
359 
360  switch (ch) {
361  case '\\': // quoted-pair
362  // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
363  READ_ch_OR_FAIL;
364  KMIME_WARN_IF_8BIT(ch);
365  result += QLatin1Char(ch);
366  break;
367  case '\r':
368  // ###
369  // The case of lonely '\r' is easy to solve, as they're
370  // not part of Unix Line-ending conventions.
371  // But I see a problem if we are given Unix-native
372  // line-ending-mails, where we cannot determine anymore
373  // whether a given '\n' was part of a CRLF or was occurring
374  // on it's own.
375  READ_ch_OR_FAIL;
376  if (ch != '\n') {
377  // CR on it's own...
378  KMIME_WARN_LONE(CR);
379  result += QLatin1Char('\r');
380  scursor--; // points to after the '\r' again
381  } else {
382  // CRLF encountered.
383  // lookahead: check for folding
384  READ_ch_OR_FAIL;
385  if (ch == ' ' || ch == '\t') {
386  // correct folding;
387  // position cursor behind the CRLF WSP (unfolding)
388  // and add the WSP to the result
389  result += QLatin1Char(ch);
390  } else {
391  // this is the "shouldn't happen"-case. There is a CRLF
392  // inside a quoted-string without it being part of FWS.
393  // We take it verbatim.
394  KMIME_WARN_NON_FOLDING(CRLF);
395  result += QLatin1String("\r\n");
396  // the cursor is decremented again, so's we need not
397  // duplicate the whole switch here. "ch" could've been
398  // everything (incl. openChar or closeChar).
399  scursor--;
400  }
401  }
402  break;
403  case '\n':
404  // Note: CRLF has been handled above already!
405  // ### LF needs special treatment, depending on whether isCRLF
406  // is true (we can be sure a lonely '\n' was meant this way) or
407  // false ('\n' alone could have meant LF or CRLF in the original
408  // message. This parser assumes CRLF iff the LF is followed by
409  // either WSP (folding) or NULL (premature end of quoted-string;
410  // Should be fixed, since NULL is allowed as per rfc822).
411  READ_ch_OR_FAIL;
412  if (!isCRLF && (ch == ' ' || ch == '\t')) {
413  // folding
414  // correct folding
415  result += QLatin1Char(ch);
416  } else {
417  // non-folding
418  KMIME_WARN_LONE(LF);
419  result += QLatin1Char('\n');
420  // pos is decremented, so's we need not duplicate the whole
421  // switch here. ch could've been everything (incl. <">, "\").
422  scursor--;
423  }
424  break;
425  case '=': {
426  // ### Work around broken clients that send encoded words in quoted-strings
427  // For example, older KMail versions.
428  if (scursor == send) {
429  break;
430  }
431 
432  const char *oldscursor = scursor;
433  QString tmp;
434  QByteArray lang, charset;
435  if (*scursor++ == '?') {
436  --scursor;
437  if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
438  result += tmp;
439  //qDebug() << " tmp " << tmp;
440  if (scursor == send) {
441  break;
442  } else if (*scursor++ == ' ') { //Workaround Bug 362650 thunderbird add space for each new line
443  if (scursor == send) {
444  --scursor;
445  break;
446  } else if (*scursor++ == '=') {
447  if (scursor == send) {
448  --scursor;
449  --scursor;
450  break;
451  } else if (*scursor++ == '?') {
452  --scursor;
453  --scursor;
454  break;
455  }
456  } else {
457  --scursor;
458  --scursor;
459  }
460  } else {
461  --scursor;
462  }
463 
464  break;
465  } else {
466  scursor = oldscursor;
467  }
468  } else {
469  scursor = oldscursor;
470  }
471  // fall through
472  Q_FALLTHROUGH();
473  }
474  default:
475  KMIME_WARN_IF_8BIT(ch);
476  result += QLatin1Char(ch);
477  }
478  }
479 
480  return false;
481 }
482 
483 // known issues:
484 //
485 // - doesn't handle encoded-word inside comments.
486 
487 bool parseComment(const char *&scursor, const char *const send,
488  QString &result, bool isCRLF, bool reallySave)
489 {
490  int commentNestingDepth = 1;
491  const char *afterLastClosingParenPos = nullptr;
492  QString maybeCmnt;
493  const char *oldscursor = scursor;
494 
495  assert(*(scursor - 1) == '(');
496 
497  while (commentNestingDepth) {
498  QString cmntPart;
499  if (parseGenericQuotedString(scursor, send, cmntPart, isCRLF, '(', ')')) {
500  assert(*(scursor - 1) == ')' || *(scursor - 1) == '(');
501  // see the kdoc for above function for the possible conditions
502  // we have to check:
503  switch (*(scursor - 1)) {
504  case ')':
505  if (reallySave) {
506  // add the chunk that's now surely inside the comment.
507  result += maybeCmnt;
508  result += cmntPart;
509  if (commentNestingDepth > 1) {
510  // don't add the outermost ')'...
511  result += QLatin1Char(')');
512  }
513  maybeCmnt.clear();
514  }
515  afterLastClosingParenPos = scursor;
516  --commentNestingDepth;
517  break;
518  case '(':
519  if (reallySave) {
520  // don't add to "result" yet, because we might find that we
521  // are already outside the (broken) comment...
522  maybeCmnt += cmntPart;
523  maybeCmnt += QLatin1Char('(');
524  }
525  ++commentNestingDepth;
526  break;
527  default: assert(0);
528  } // switch
529  } else {
530  // !parseGenericQuotedString, ie. premature end
531  if (afterLastClosingParenPos) {
532  scursor = afterLastClosingParenPos;
533  } else {
534  scursor = oldscursor;
535  }
536  return false;
537  }
538  } // while
539 
540  return true;
541 }
542 
543 // known issues: none.
544 
545 bool parsePhrase(const char *&scursor, const char *const send,
546  QString &result, bool isCRLF)
547 {
548  enum {
549  None, Phrase, Atom, EncodedWord, QuotedString
550  } found = None;
551 
552  QString tmp;
553  QByteArray lang, charset;
554  QPair<const char *, int> tmpAtom;
555  const char *successfullyParsed = nullptr;
556  // only used by the encoded-word branch
557  const char *oldscursor;
558  // used to suppress whitespace between adjacent encoded-words
559  // (rfc2047, 6.2):
560  bool lastWasEncodedWord = false;
561 
562  while (scursor != send) {
563  char ch = *scursor++;
564  switch (ch) {
565  case '.': // broken, but allow for intorop's sake
566  if (found == None) {
567  --scursor;
568  return false;
569  } else {
570  if (scursor != send && (*scursor == ' ' || *scursor == '\t')) {
571  result += QLatin1String(". ");
572  } else {
573  result += QLatin1Char('.');
574  }
575  successfullyParsed = scursor;
576  }
577  break;
578  case '"': // quoted-string
579  tmp.clear();
580  if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
581  successfullyParsed = scursor;
582  assert(*(scursor - 1) == '"');
583  switch (found) {
584  case None:
585  found = QuotedString;
586  break;
587  case Phrase:
588  case Atom:
589  case EncodedWord:
590  case QuotedString:
591  found = Phrase;
592  result += QLatin1Char(' '); // rfc822, 3.4.4
593  break;
594  default:
595  assert(0);
596  }
597  lastWasEncodedWord = false;
598  result += tmp;
599  } else {
600  // premature end of quoted string.
601  // What to do? Return leading '"' as special? Return as quoted-string?
602  // We do the latter if we already found something, else signal failure.
603  if (found == None) {
604  return false;
605  } else {
606  result += QLatin1Char(' '); // rfc822, 3.4.4
607  result += tmp;
608  return true;
609  }
610  }
611  break;
612  case '(': // comment
613  // parse it, but ignore content:
614  tmp.clear();
615  if (parseComment(scursor, send, tmp, isCRLF,
616  false /*don't bother with the content*/)) {
617  successfullyParsed = scursor;
618  lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
619  } else {
620  if (found == None) {
621  return false;
622  } else {
623  scursor = successfullyParsed;
624  return true;
625  }
626  }
627  break;
628  case '=': // encoded-word
629  tmp.clear();
630  oldscursor = scursor;
631  lang.clear();
632  charset.clear();
633  if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
634  successfullyParsed = scursor;
635  switch (found) {
636  case None:
637  found = EncodedWord;
638  break;
639  case Phrase:
640  case EncodedWord:
641  case Atom:
642  case QuotedString:
643  if (!lastWasEncodedWord) {
644  result += QLatin1Char(' '); // rfc822, 3.4.4
645  }
646  found = Phrase;
647  break;
648  default: assert(0);
649  }
650  lastWasEncodedWord = true;
651  result += tmp;
652  break;
653  } else {
654  // parse as atom:
655  scursor = oldscursor;
656  }
657  Q_FALLTHROUGH();
658  // fall though...
659 
660  default: //atom
661  scursor--;
662  if (parseAtom(scursor, send, tmpAtom, true /* allow 8bit */)) {
663  successfullyParsed = scursor;
664  switch (found) {
665  case None:
666  found = Atom;
667  break;
668  case Phrase:
669  case Atom:
670  case EncodedWord:
671  case QuotedString:
672  found = Phrase;
673  result += QLatin1Char(' '); // rfc822, 3.4.4
674  break;
675  default:
676  assert(0);
677  }
678  lastWasEncodedWord = false;
679  result += QLatin1String(tmpAtom.first, tmpAtom.second);
680  } else {
681  if (found == None) {
682  return false;
683  } else {
684  scursor = successfullyParsed;
685  return true;
686  }
687  }
688  }
689  eatWhiteSpace(scursor, send);
690  }
691 
692  return found != None;
693 }
694 
695 bool parseDotAtom(const char *&scursor, const char *const send,
696  QByteArray &result, bool isCRLF)
697 {
698  eatCFWS(scursor, send, isCRLF);
699 
700  // always points to just after the last atom parsed:
701  const char *successfullyParsed;
702 
703  QByteArray tmp;
704  if (!parseAtom(scursor, send, tmp, false /* no 8bit */)) {
705  return false;
706  }
707  result += tmp;
708  successfullyParsed = scursor;
709 
710  while (scursor != send) {
711 
712  // end of header or no '.' -> return
713  if (scursor == send || *scursor != '.') {
714  return true;
715  }
716  scursor++; // eat '.'
717 
718  if (scursor == send || !isAText(*scursor)) {
719  // end of header or no AText, but this time following a '.'!:
720  // reset cursor to just after last successfully parsed char and
721  // return:
722  scursor = successfullyParsed;
723  return true;
724  }
725 
726  // try to parse the next atom:
727  QByteArray maybeAtom;
728  if (!parseAtom(scursor, send, maybeAtom, false /*no 8bit*/)) {
729  scursor = successfullyParsed;
730  return true;
731  }
732 
733  result += '.';
734  result += maybeAtom;
735  successfullyParsed = scursor;
736  }
737 
738  scursor = successfullyParsed;
739  return true;
740 }
741 
742 void eatCFWS(const char *&scursor, const char *const send, bool isCRLF)
743 {
744  QString dummy;
745 
746  while (scursor != send) {
747  const char *oldscursor = scursor;
748 
749  char ch = *scursor++;
750 
751  switch (ch) {
752  case ' ':
753  case '\t': // whitespace
754  case '\r':
755  case '\n': // folding
756  continue;
757 
758  case '(': // comment
759  if (parseComment(scursor, send, dummy, isCRLF, false /*don't save*/)) {
760  continue;
761  }
762  scursor = oldscursor;
763  return;
764 
765  default:
766  scursor = oldscursor;
767  return;
768  }
769  }
770 }
771 
772 bool parseDomain(const char *&scursor, const char *const send,
773  QString &result, bool isCRLF)
774 {
775  eatCFWS(scursor, send, isCRLF);
776  if (scursor == send) {
777  return false;
778  }
779 
780  // domain := dot-atom / domain-literal / atom *("." atom)
781  //
782  // equivalent to:
783  // domain = dot-atom / domain-literal,
784  // since parseDotAtom does allow CFWS between atoms and dots
785 
786  if (*scursor == '[') {
787  // domain-literal:
788  QString maybeDomainLiteral;
789  // eat '[':
790  scursor++;
791  while (parseGenericQuotedString(scursor, send, maybeDomainLiteral,
792  isCRLF, '[', ']')) {
793  if (scursor == send) {
794  // end of header: check for closing ']':
795  if (*(scursor - 1) == ']') {
796  // OK, last char was ']':
797  result = maybeDomainLiteral;
798  return true;
799  } else {
800  // not OK, domain-literal wasn't closed:
801  return false;
802  }
803  }
804  // we hit openChar in parseGenericQuotedString.
805  // include it in maybeDomainLiteral and keep on parsing:
806  if (*(scursor - 1) == '[') {
807  maybeDomainLiteral += QLatin1Char('[');
808  continue;
809  }
810  // OK, real end of domain-literal:
811  result = maybeDomainLiteral;
812  return true;
813  }
814  } else {
815  // dot-atom:
816  QByteArray maybeDotAtom;
817  if (parseDotAtom(scursor, send, maybeDotAtom, isCRLF)) {
818  // Domain may end with '.', if so preserve it'
819  if (scursor != send && *scursor == '.') {
820  maybeDotAtom += '.';
821  scursor++;
822  }
823  result = QString::fromLatin1(maybeDotAtom);
824  return true;
825  }
826  }
827  return false;
828 }
829 
830 bool parseObsRoute(const char *&scursor, const char *const send,
831  QStringList &result, bool isCRLF, bool save)
832 {
833  while (scursor != send) {
834  eatCFWS(scursor, send, isCRLF);
835  if (scursor == send) {
836  return false;
837  }
838 
839  // empty entry:
840  if (*scursor == ',') {
841  scursor++;
842  if (save) {
843  result.append(QString());
844  }
845  continue;
846  }
847 
848  // empty entry ending the list:
849  if (*scursor == ':') {
850  scursor++;
851  if (save) {
852  result.append(QString());
853  }
854  return true;
855  }
856 
857  // each non-empty entry must begin with '@':
858  if (*scursor != '@') {
859  return false;
860  } else {
861  scursor++;
862  }
863 
864  QString maybeDomain;
865  if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
866  return false;
867  }
868  if (save) {
869  result.append(maybeDomain);
870  }
871 
872  // eat the following (optional) comma:
873  eatCFWS(scursor, send, isCRLF);
874  if (scursor == send) {
875  return false;
876  }
877  if (*scursor == ':') {
878  scursor++;
879  return true;
880  }
881  if (*scursor == ',') {
882  scursor++;
883  }
884  }
885 
886  return false;
887 }
888 
889 bool parseAddrSpec(const char *&scursor, const char *const send,
890  AddrSpec &result, bool isCRLF)
891 {
892  //
893  // STEP 1:
894  // local-part := dot-atom / quoted-string / word *("." word)
895  //
896  // this is equivalent to:
897  // local-part := word *("." word)
898 
899  QString maybeLocalPart;
900  QString tmp;
901  QPair<const char *, int> tmpAtom;
902 
903  while (scursor != send) {
904  // first, eat any whitespace
905  eatCFWS(scursor, send, isCRLF);
906 
907  char ch = *scursor++;
908  switch (ch) {
909  case '.': // dot
910  maybeLocalPart += QLatin1Char('.');
911  break;
912 
913  case '@':
914  goto SAW_AT_SIGN;
915  break;
916 
917  case '"': // quoted-string
918  tmp.clear();
919  if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
920  maybeLocalPart += tmp;
921  } else {
922  return false;
923  }
924  break;
925 
926  default: // atom
927  scursor--; // re-set scursor to point to ch again
928  if (parseAtom(scursor, send, tmpAtom, false /* no 8bit */)) {
929  maybeLocalPart += QLatin1String(tmpAtom.first, tmpAtom.second);
930  } else {
931  return false; // parseAtom can only fail if the first char is non-atext.
932  }
933  break;
934  }
935  }
936 
937  return false;
938 
939  //
940  // STEP 2:
941  // domain
942  //
943 
944 SAW_AT_SIGN:
945 
946  assert(*(scursor - 1) == '@');
947 
948  QString maybeDomain;
949  if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
950  return false;
951  }
952 
953  result.localPart = maybeLocalPart;
954  result.domain = maybeDomain;
955 
956  return true;
957 }
958 
959 bool parseAngleAddr(const char *&scursor, const char *const send,
960  AddrSpec &result, bool isCRLF)
961 {
962  // first, we need an opening angle bracket:
963  eatCFWS(scursor, send, isCRLF);
964  if (scursor == send || *scursor != '<') {
965  return false;
966  }
967  scursor++; // eat '<'
968 
969  eatCFWS(scursor, send, isCRLF);
970  if (scursor == send) {
971  return false;
972  }
973 
974  if (*scursor == '@' || *scursor == ',') {
975  // obs-route: parse, but ignore:
976  KMIME_WARN << "obsolete source route found! ignoring.";
977  QStringList dummy;
978  if (!parseObsRoute(scursor, send, dummy,
979  isCRLF, false /* don't save */)) {
980  return false;
981  }
982  // angle-addr isn't complete until after the '>':
983  if (scursor == send) {
984  return false;
985  }
986  }
987 
988  // parse addr-spec:
989  AddrSpec maybeAddrSpec;
990  if (!parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
991  return false;
992  }
993 
994  eatCFWS(scursor, send, isCRLF);
995  if (scursor == send || *scursor != '>') {
996  return false;
997  }
998  scursor++;
999 
1000  result = maybeAddrSpec;
1001  return true;
1002 
1003 }
1004 
1005 static QString stripQuotes(const QString &input)
1006 {
1007  const QLatin1Char quotes('"');
1008  if (input.startsWith(quotes) && input.endsWith(quotes)) {
1009  QString stripped(input.mid(1, input.size() - 2));
1010  return stripped;
1011  } else {
1012  return input;
1013  }
1014 }
1015 
1016 bool parseMailbox(const char *&scursor, const char *const send,
1017  Mailbox &result, bool isCRLF)
1018 {
1019  eatCFWS(scursor, send, isCRLF);
1020  if (scursor == send) {
1021  return false;
1022  }
1023 
1024  AddrSpec maybeAddrSpec;
1025  QString maybeDisplayName;
1026 
1027  // first, try if it's a vanilla addr-spec:
1028  const char *oldscursor = scursor;
1029  if (parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
1030  result.setAddress(maybeAddrSpec);
1031  // check for the obsolete form of display-name (as comment):
1032  eatWhiteSpace(scursor, send);
1033  if (scursor != send && *scursor == '(') {
1034  scursor++;
1035  if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1036  return false;
1037  }
1038  }
1039  result.setName(stripQuotes(maybeDisplayName));
1040  return true;
1041  }
1042  scursor = oldscursor;
1043 
1044  // second, see if there's a display-name:
1045  if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1046  // failed: reset cursor, note absent display-name
1047  maybeDisplayName.clear();
1048  scursor = oldscursor;
1049  } else {
1050  // succeeded: eat CFWS
1051  eatCFWS(scursor, send, isCRLF);
1052  if (scursor == send) {
1053  return false;
1054  }
1055  }
1056 
1057  // third, parse the angle-addr:
1058  if (!parseAngleAddr(scursor, send, maybeAddrSpec, isCRLF)) {
1059  return false;
1060  }
1061 
1062  if (maybeDisplayName.isNull()) {
1063  // check for the obsolete form of display-name (as comment):
1064  eatWhiteSpace(scursor, send);
1065  if (scursor != send && *scursor == '(') {
1066  scursor++;
1067  if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1068  return false;
1069  }
1070  }
1071  }
1072 
1073  result.setName(stripQuotes(maybeDisplayName));
1074  result.setAddress(maybeAddrSpec);
1075  return true;
1076 }
1077 
1078 bool parseGroup(const char *&scursor, const char *const send,
1079  Address &result, bool isCRLF)
1080 {
1081  // group := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
1082  //
1083  // equivalent to:
1084  // group := display-name ":" [ obs-mbox-list ] ";"
1085 
1086  eatCFWS(scursor, send, isCRLF);
1087  if (scursor == send) {
1088  return false;
1089  }
1090 
1091  // get display-name:
1092  QString maybeDisplayName;
1093  if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1094  return false;
1095  }
1096 
1097  // get ":":
1098  eatCFWS(scursor, send, isCRLF);
1099  if (scursor == send || *scursor != ':') {
1100  return false;
1101  }
1102 
1103  // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that
1104  // automatically calls removeBidiControlChars
1105  result.displayName = removeBidiControlChars(maybeDisplayName);
1106 
1107  // get obs-mbox-list (may contain empty entries):
1108  scursor++;
1109  while (scursor != send) {
1110  eatCFWS(scursor, send, isCRLF);
1111  if (scursor == send) {
1112  return false;
1113  }
1114 
1115  // empty entry:
1116  if (*scursor == ',') {
1117  scursor++;
1118  continue;
1119  }
1120 
1121  // empty entry ending the list:
1122  if (*scursor == ';') {
1123  scursor++;
1124  return true;
1125  }
1126 
1127  Mailbox maybeMailbox;
1128  if (!parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1129  return false;
1130  }
1131  result.mailboxList.append(maybeMailbox);
1132 
1133  eatCFWS(scursor, send, isCRLF);
1134  // premature end:
1135  if (scursor == send) {
1136  return false;
1137  }
1138  // regular end of the list:
1139  if (*scursor == ';') {
1140  scursor++;
1141  return true;
1142  }
1143  // eat regular list entry separator:
1144  if (*scursor == ',') {
1145  scursor++;
1146  }
1147  }
1148  return false;
1149 }
1150 
1151 bool parseAddress(const char *&scursor, const char *const send,
1152  Address &result, bool isCRLF)
1153 {
1154  // address := mailbox / group
1155 
1156  eatCFWS(scursor, send, isCRLF);
1157  if (scursor == send) {
1158  return false;
1159  }
1160 
1161  // first try if it's a single mailbox:
1162  Mailbox maybeMailbox;
1163  const char *oldscursor = scursor;
1164  if (parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1165  // yes, it is:
1166  result.displayName.clear();
1167  result.mailboxList.append(maybeMailbox);
1168  return true;
1169  }
1170  scursor = oldscursor;
1171 
1172  Address maybeAddress;
1173 
1174  // no, it's not a single mailbox. Try if it's a group:
1175  if (!parseGroup(scursor, send, maybeAddress, isCRLF)) {
1176  return false;
1177  }
1178 
1179  result = maybeAddress;
1180  return true;
1181 }
1182 
1183 bool parseAddressList(const char *&scursor, const char *const send,
1184  AddressList &result, bool isCRLF)
1185 {
1186  while (scursor != send) {
1187  eatCFWS(scursor, send, isCRLF);
1188  // end of header: this is OK.
1189  if (scursor == send) {
1190  return true;
1191  }
1192  // empty entry: ignore:
1193  if (*scursor == ',') {
1194  scursor++;
1195  continue;
1196  }
1197  // broken clients might use ';' as list delimiter, accept that as well
1198  if (*scursor == ';') {
1199  scursor++;
1200  continue;
1201  }
1202 
1203  // parse one entry
1204  Address maybeAddress;
1205  if (!parseAddress(scursor, send, maybeAddress, isCRLF)) {
1206  return false;
1207  }
1208  result.append(maybeAddress);
1209 
1210  eatCFWS(scursor, send, isCRLF);
1211  // end of header: this is OK.
1212  if (scursor == send) {
1213  return true;
1214  }
1215  // comma separating entries: eat it.
1216  if (*scursor == ',') {
1217  scursor++;
1218  }
1219  }
1220  return true;
1221 }
1222 
1223 static bool parseParameter(const char *&scursor, const char *const send,
1224  QPair<QString, QStringOrQPair> &result, bool isCRLF)
1225 {
1226  // parameter = regular-parameter / extended-parameter
1227  // regular-parameter = regular-parameter-name "=" value
1228  // extended-parameter =
1229  // value = token / quoted-string
1230  //
1231  // note that rfc2231 handling is out of the scope of this function.
1232  // Therefore we return the attribute as QByteArray and the value as
1233  // (start,length) tupel if we see that the value is encoded
1234  // (trailing asterisk), for parseParameterList to decode...
1235 
1236  eatCFWS(scursor, send, isCRLF);
1237  if (scursor == send) {
1238  return false;
1239  }
1240 
1241  //
1242  // parse the parameter name:
1243  //
1244  QByteArray tmpAttr;
1245  if (!parseToken(scursor, send, tmpAttr, ParseTokenNoFlag)) {
1246  return false;
1247  }
1248  // FIXME: we could use QMap<QByteArray, ...> in the API for parameters
1249  QString maybeAttribute = QString::fromLatin1(tmpAttr);
1250 
1251  eatCFWS(scursor, send, isCRLF);
1252  // premature end: not OK (haven't seen '=' yet).
1253  if (scursor == send || *scursor != '=') {
1254  return false;
1255  }
1256  scursor++; // eat '='
1257 
1258  eatCFWS(scursor, send, isCRLF);
1259  if (scursor == send) {
1260  // don't choke on attribute=, meaning the value was omitted:
1261  if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1262  KMIME_WARN << "attribute ends with \"*\", but value is empty!"
1263  "Chopping away \"*\".";
1264  maybeAttribute.chop(1);
1265  }
1266  result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1267  return true;
1268  }
1269 
1270  const char *oldscursor = scursor;
1271 
1272  //
1273  // parse the parameter value:
1274  //
1275  QStringOrQPair maybeValue;
1276  if (*scursor == '"') {
1277  // value is a quoted-string:
1278  scursor++;
1279  if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1280  // attributes ending with "*" designate extended-parameters,
1281  // which cannot have quoted-strings as values. So we remove the
1282  // trailing "*" to not confuse upper layers.
1283  KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
1284  "Chopping away \"*\".";
1285  maybeAttribute.chop(1);
1286  }
1287 
1288  if (!parseGenericQuotedString(scursor, send, maybeValue.qstring, isCRLF)) {
1289  scursor = oldscursor;
1290  result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1291  return false; // this case needs further processing by upper layers!!
1292  }
1293  } else {
1294  // value is a token:
1295  if (!parseToken(scursor, send, maybeValue.qpair, ParseTokenRelaxedTText)) {
1296  scursor = oldscursor;
1297  result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1298  return false; // this case needs further processing by upper layers!!
1299  }
1300  }
1301 
1302  result = qMakePair(maybeAttribute.toLower(), maybeValue);
1303  return true;
1304 }
1305 
1306 static bool parseRawParameterList(const char *&scursor, const char *const send,
1308  bool isCRLF)
1309 {
1310  // we use parseParameter() consecutively to obtain a map of raw
1311  // attributes to raw values. "Raw" here means that we don't do
1312  // rfc2231 decoding and concatenation. This is left to
1313  // parseParameterList(), which will call this function.
1314  //
1315  // The main reason for making this chunk of code a separate
1316  // (private) method is that we can deal with broken parameters
1317  // _here_ and leave the rfc2231 handling solely to
1318  // parseParameterList(), which will still be enough work.
1319  while (scursor != send) {
1320  eatCFWS(scursor, send, isCRLF);
1321  // empty entry ending the list: OK.
1322  if (scursor == send) {
1323  return true;
1324  }
1325  // empty list entry: ignore.
1326  if (*scursor == ';') {
1327  scursor++;
1328  continue;
1329  }
1330  QPair<QString, QStringOrQPair> maybeParameter;
1331  if (!parseParameter(scursor, send, maybeParameter, isCRLF)) {
1332  // we need to do a bit of work if the attribute is not
1333  // NULL. These are the cases marked with "needs further
1334  // processing" in parseParameter(). Specifically, parsing of the
1335  // token or the quoted-string, which should represent the value,
1336  // failed. We take the easy way out and simply search for the
1337  // next ';' to start parsing again. (Another option would be to
1338  // take the text between '=' and ';' as value)
1339  if (maybeParameter.first.isNull()) {
1340  return false;
1341  }
1342  while (scursor != send) {
1343  if (*scursor++ == ';') {
1344  goto IS_SEMICOLON;
1345  }
1346  }
1347  // scursor == send case: end of list.
1348  return true;
1349  IS_SEMICOLON:
1350  // *scursor == ';' case: parse next entry.
1351  continue;
1352  }
1353  // successful parsing brings us here:
1354  result.insert(maybeParameter.first, maybeParameter.second);
1355 
1356  eatCFWS(scursor, send, isCRLF);
1357  // end of header: ends list.
1358  if (scursor == send) {
1359  return true;
1360  }
1361  // regular separator: eat it.
1362  if (*scursor == ';') {
1363  scursor++;
1364  }
1365  }
1366  return true;
1367 }
1368 
1369 static void decodeRFC2231Value(KCodecs::Codec *&rfc2231Codec,
1370  QTextCodec *&textcodec,
1371  bool isContinuation, QString &value,
1372  QPair<const char *, int> &source, QByteArray &charset)
1373 {
1374  //
1375  // parse the raw value into (charset,language,text):
1376  //
1377 
1378  const char *decBegin = source.first;
1379  const char *decCursor = decBegin;
1380  const char *decEnd = decCursor + source.second;
1381 
1382  if (!isContinuation) {
1383  // find the first single quote
1384  while (decCursor != decEnd) {
1385  if (*decCursor == '\'') {
1386  break;
1387  } else {
1388  decCursor++;
1389  }
1390  }
1391 
1392  if (decCursor == decEnd) {
1393  // there wasn't a single single quote at all!
1394  // take the whole value to be in latin-1:
1395  KMIME_WARN << "No charset in extended-initial-value."
1396  "Assuming \"iso-8859-1\".";
1397  value += QString::fromLatin1(decBegin, source.second);
1398  return;
1399  }
1400 
1401  charset = QByteArray(decBegin, decCursor - decBegin);
1402 
1403  const char *oldDecCursor = ++decCursor;
1404  // find the second single quote (we ignore the language tag):
1405  while (decCursor != decEnd) {
1406  if (*decCursor == '\'') {
1407  break;
1408  } else {
1409  decCursor++;
1410  }
1411  }
1412  if (decCursor == decEnd) {
1413  KMIME_WARN << "No language in extended-initial-value."
1414  "Trying to recover.";
1415  decCursor = oldDecCursor;
1416  } else {
1417  decCursor++;
1418  }
1419 
1420  // decCursor now points to the start of the
1421  // "extended-other-values":
1422 
1423  //
1424  // get the decoders:
1425  //
1426 
1427  bool matchOK = false;
1428  textcodec = KCharsets::charsets()->codecForName(QLatin1String(charset), matchOK);
1429  if (!matchOK) {
1430  textcodec = nullptr;
1431  KMIME_WARN_UNKNOWN(Charset, charset);
1432  }
1433  }
1434 
1435  if (!rfc2231Codec) {
1436  rfc2231Codec = KCodecs::Codec::codecForName("x-kmime-rfc2231");
1437  assert(rfc2231Codec);
1438  }
1439 
1440  if (!textcodec) {
1441  value += QString::fromLatin1(decCursor, decEnd - decCursor);
1442  return;
1443  }
1444 
1445  KCodecs::Decoder *dec = rfc2231Codec->makeDecoder();
1446  assert(dec);
1447 
1448  //
1449  // do the decoding:
1450  //
1451 
1452  QByteArray buffer;
1453  buffer.resize(rfc2231Codec->maxDecodedSizeFor(decEnd - decCursor));
1454  QByteArray::Iterator bit = buffer.begin();
1455  QByteArray::ConstIterator bend = buffer.end();
1456 
1457  if (!dec->decode(decCursor, decEnd, bit, bend)) {
1458  KMIME_WARN << rfc2231Codec->name()
1459  << "codec lies about its maxDecodedSizeFor()"
1460  << Qt::endl
1461  << "result may be truncated";
1462  }
1463 
1464  value += textcodec->toUnicode(buffer.begin(), bit - buffer.begin());
1465 
1466  // qCDebug(KMIME_LOG) << "value now: \"" << value << "\"";
1467  // cleanup:
1468  delete dec;
1469 }
1470 
1471 // known issues:
1472 // - permutes rfc2231 continuations when the total number of parts
1473 // exceeds 10 (other-sections then becomes *xy, ie. two digits)
1474 
1475 bool parseParameterListWithCharset(const char *&scursor,
1476  const char *const send,
1477  QMap<QString, QString> &result,
1478  QByteArray &charset, bool isCRLF)
1479 {
1480 // parse the list into raw attribute-value pairs:
1481  QMap<QString, QStringOrQPair> rawParameterList;
1482  if (!parseRawParameterList(scursor, send, rawParameterList, isCRLF)) {
1483  return false;
1484  }
1485 
1486  if (rawParameterList.isEmpty()) {
1487  return true;
1488  }
1489 
1490  // decode rfc 2231 continuations and alternate charset encoding:
1491 
1492  // NOTE: this code assumes that what QMapIterator delivers is sorted
1493  // by the key!
1494 
1495  KCodecs::Codec *rfc2231Codec = nullptr;
1496  QTextCodec *textcodec = nullptr;
1497  QString attribute;
1498  QString value;
1499  enum Mode {
1500  NoMode = 0x0, Continued = 0x1, Encoded = 0x2
1501  };
1502 
1503  enum EncodingMode {
1504  NoEncoding,
1505  RFC2047,
1506  RFC2231
1507  };
1508 
1509  QMap<QString, QStringOrQPair>::Iterator it, end = rawParameterList.end();
1510 
1511  for (it = rawParameterList.begin() ; it != end ; ++it) {
1512  if (attribute.isNull() || !it.key().startsWith(attribute)) {
1513  //
1514  // new attribute:
1515  //
1516 
1517  // store the last attribute/value pair in the result map now:
1518  if (!attribute.isNull()) {
1519  result.insert(attribute, value);
1520  }
1521  // and extract the information from the new raw attribute:
1522  value.clear();
1523  attribute = it.key();
1524  int mode = NoMode;
1525  EncodingMode encodingMode = NoEncoding;
1526 
1527  // is the value rfc2331-encoded?
1528  if (attribute.endsWith(QLatin1Char('*'))) {
1529  attribute.chop(1);
1530  mode |= Encoded;
1531  encodingMode = RFC2231;
1532  }
1533  // is the value rfc2047-encoded?
1534  if (!(*it).qstring.isNull() && (*it).qstring.contains(QLatin1String("=?"))) {
1535  mode |= Encoded;
1536  encodingMode = RFC2047;
1537  }
1538  // is the value continued?
1539  if (attribute.endsWith(QLatin1String("*0"))) {
1540  attribute.chop(2);
1541  mode |= Continued;
1542  }
1543  //
1544  // decode if necessary:
1545  //
1546  if (mode & Encoded) {
1547  if (encodingMode == RFC2231) {
1548  decodeRFC2231Value(rfc2231Codec, textcodec,
1549  false, /* isn't continuation */
1550  value, (*it).qpair, charset);
1551  } else if (encodingMode == RFC2047) {
1552  value += KCodecs::decodeRFC2047String((*it).qstring.toLatin1(), &charset);
1553  }
1554  } else {
1555  // not encoded.
1556  if ((*it).qpair.first) {
1557  value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1558  } else {
1559  value += (*it).qstring;
1560  }
1561  }
1562 
1563  //
1564  // shortcut-processing when the value isn't encoded:
1565  //
1566 
1567  if (!(mode & Continued)) {
1568  // save result already:
1569  result.insert(attribute, value);
1570  // force begin of a new attribute:
1571  attribute.clear();
1572  }
1573  } else { // it.key().startsWith( attribute )
1574  //
1575  // continuation
1576  //
1577 
1578  // ignore the section and trust QMap to have sorted the keys:
1579  if (it.key().endsWith(QLatin1Char('*'))) {
1580  // encoded
1581  decodeRFC2231Value(rfc2231Codec, textcodec,
1582  true, /* is continuation */
1583  value, (*it).qpair, charset);
1584  } else {
1585  // not encoded
1586  if ((*it).qpair.first) {
1587  value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1588  } else {
1589  value += (*it).qstring;
1590  }
1591  }
1592  }
1593  }
1594  // write last attr/value pair:
1595  if (!attribute.isNull()) {
1596  result.insert(attribute, value);
1597  }
1598 
1599  return true;
1600 }
1601 
1602 bool parseParameterList(const char *&scursor, const char *const send,
1603  QMap<QString, QString> &result, bool isCRLF)
1604 {
1605  QByteArray charset;
1606  return parseParameterListWithCharset(scursor, send, result, charset, isCRLF);
1607 }
1608 
1609 static const char stdDayNames[][4] = {
1610  "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
1611 };
1612 static const int stdDayNamesLen = sizeof stdDayNames / sizeof *stdDayNames;
1613 
1614 static bool parseDayName(const char *&scursor, const char *const send)
1615 {
1616  // check bounds:
1617  if (send - scursor < 3) {
1618  return false;
1619  }
1620 
1621  for (int i = 0 ; i < stdDayNamesLen ; ++i) {
1622  if (qstrnicmp(scursor, stdDayNames[i], 3) == 0) {
1623  scursor += 3;
1624  // qCDebug(KMIME_LOG) << "found" << stdDayNames[i];
1625  return true;
1626  }
1627  }
1628 
1629  return false;
1630 }
1631 
1632 static const char stdMonthNames[][4] = {
1633  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
1634  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
1635 };
1636 static const int stdMonthNamesLen =
1637  sizeof stdMonthNames / sizeof *stdMonthNames;
1638 
1639 static bool parseMonthName(const char *&scursor, const char *const send,
1640  int &result)
1641 {
1642  // check bounds:
1643  if (send - scursor < 3) {
1644  return false;
1645  }
1646 
1647  for (result = 0 ; result < stdMonthNamesLen ; ++result) {
1648  if (qstrnicmp(scursor, stdMonthNames[result], 3) == 0) {
1649  scursor += 3;
1650  return true;
1651  }
1652  }
1653 
1654  // not found:
1655  return false;
1656 }
1657 
// Alphabetic time zone names and their offset from GMT in seconds
// (positive values are east of GMT). The table is searched front to back
// and the first matching entry wins, so every name must appear only once.
static const struct {
    const char tzName[5];
    long int secsEastOfGMT;
} timeZones[] = {
    // rfc 822 timezones:
    { "GMT", 0 },
    { "UT", 0 },
    { "EDT", -4 * 3600 },
    { "EST", -5 * 3600 },
    { "CDT", -5 * 3600 },
    { "CST", -6 * 3600 },
    { "MDT", -6 * 3600 },
    { "MST", -7 * 3600 },
    { "PDT", -7 * 3600 },
    { "PST", -8 * 3600 },
    // common, non-rfc-822 zones:
    { "CET", 1 * 3600 },
    { "MET", 1 * 3600 },
    { "UTC", 0 },
    { "CEST", 2 * 3600 },
    { "BST", 1 * 3600 },
    // rfc 822 military timezones:
    // NOTE(review): RFC 5322, section 4.3 states that these one-letter
    // zones were defined in a non-standard way in RFC 822 and should be
    // treated as "-0000"; the offsets below are kept unchanged - confirm
    // whether they should still be honoured.
    { "Z", 0 },
    { "A", -1 * 3600 },
    { "B", -2 * 3600 },
    { "C", -3 * 3600 },
    { "D", -4 * 3600 },
    { "E", -5 * 3600 },
    { "F", -6 * 3600 },
    { "G", -7 * 3600 },
    { "H", -8 * 3600 },
    { "I", -9 * 3600 },
    // J is not used!
    { "K", -10 * 3600 },
    { "L", -11 * 3600 },
    { "M", -12 * 3600 },
    { "N", 1 * 3600 },
    { "O", 2 * 3600 },
    { "P", 3 * 3600 },
    { "Q", 4 * 3600 },
    { "R", 5 * 3600 },
    { "S", 6 * 3600 },
    { "T", 7 * 3600 },
    { "U", 8 * 3600 },
    { "V", 9 * 3600 },
    { "W", 10 * 3600 },
    { "X", 11 * 3600 },
    { "Y", 12 * 3600 },
};
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
1708 
1709 static bool parseAlphaNumericTimeZone(const char *&scursor,
1710  const char *const send,
1711  long int &secsEastOfGMT,
1712  bool &timeZoneKnown)
1713 {
1714  // allow the timezone to be wrapped in quotes; bug 260761
1715  if (scursor < send && *scursor == '"') {
1716  scursor++;
1717 
1718  if (scursor == send) {
1719  return false;
1720  }
1721  }
1722 
1723  QPair<const char *, int> maybeTimeZone(nullptr, 0);
1724  if (!parseToken(scursor, send, maybeTimeZone, ParseTokenNoFlag)) {
1725  return false;
1726  }
1727  for (int i = 0 ; i < timeZonesLen ; ++i) {
1728  if (qstrnicmp(timeZones[i].tzName,
1729  maybeTimeZone.first, maybeTimeZone.second) == 0) {
1730  scursor += maybeTimeZone.second;
1731  secsEastOfGMT = timeZones[i].secsEastOfGMT;
1732  timeZoneKnown = true;
1733 
1734  if (scursor < send && *scursor == '"') {
1735  scursor++;
1736  }
1737 
1738  return true;
1739  }
1740  }
1741 
1742  // don't choke just because we don't happen to know the time zone
1743  KMIME_WARN_UNKNOWN(time zone,
1744  QByteArray(maybeTimeZone.first, maybeTimeZone.second));
1745  secsEastOfGMT = 0;
1746  timeZoneKnown = false;
1747  return true;
1748 }
1749 
// Parses a run of decimal digits starting at scursor.
//
// @param scursor in/out cursor; left on the first non-digit character
//                (or at send)
// @param send    end of the input
// @param result  receives the parsed value; 0 if there were no digits
// @return the number of digits consumed (0 if scursor is not on a digit)
//
// NOTE: the value is accumulated in an int without an overflow check, so
// callers must not rely on result for very long digit runs.
int parseDigits(const char *&scursor, const char *const send, int &result)
{
    result = 0;
    int digits = 0;
    // isdigit() must not be called with a negative value, which a plain
    // char holding an 8-bit byte can be; go through unsigned char.
    for (; scursor != send && isdigit(static_cast<unsigned char>(*scursor)) ; scursor++, digits++) {
        result *= 10;
        result += int(*scursor - '0');
    }
    return digits;
}
1761 
1762 static bool parseTimeOfDay(const char *&scursor, const char *const send,
1763  int &hour, int &min, int &sec, bool isCRLF = false)
1764 {
1765  // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
1766 
1767  //
1768  // 2DIGIT representing "hour":
1769  //
1770  if (!parseDigits(scursor, send, hour)) {
1771  return false;
1772  }
1773 
1774  eatCFWS(scursor, send, isCRLF);
1775  if (scursor == send || *scursor != ':') {
1776  return false;
1777  }
1778  scursor++; // eat ':'
1779 
1780  eatCFWS(scursor, send, isCRLF);
1781  if (scursor == send) {
1782  return false;
1783  }
1784 
1785  //
1786  // 2DIGIT representing "minute":
1787  //
1788  if (!parseDigits(scursor, send, min)) {
1789  return false;
1790  }
1791 
1792  eatCFWS(scursor, send, isCRLF);
1793  if (scursor == send) {
1794  return true; // seconds are optional
1795  }
1796 
1797  //
1798  // let's see if we have a 2DIGIT representing "second":
1799  //
1800  if (*scursor == ':') {
1801  // yepp, there are seconds:
1802  scursor++; // eat ':'
1803  eatCFWS(scursor, send, isCRLF);
1804  if (scursor == send) {
1805  return false;
1806  }
1807 
1808  if (!parseDigits(scursor, send, sec)) {
1809  return false;
1810  }
1811  } else {
1812  sec = 0;
1813  }
1814 
1815  return true;
1816 }
1817 
1818 bool parseTime(const char *&scursor, const char *send,
1819  int &hour, int &min, int &sec, long int &secsEastOfGMT,
1820  bool &timeZoneKnown, bool isCRLF)
1821 {
1822  // time := time-of-day CFWS ( zone / obs-zone )
1823  //
1824  // obs-zone := "UT" / "GMT" /
1825  // "EST" / "EDT" / ; -0500 / -0400
1826  // "CST" / "CDT" / ; -0600 / -0500
1827  // "MST" / "MDT" / ; -0700 / -0600
1828  // "PST" / "PDT" / ; -0800 / -0700
1829  // "A"-"I" / "a"-"i" /
1830  // "K"-"Z" / "k"-"z"
1831 
1832  eatCFWS(scursor, send, isCRLF);
1833  if (scursor == send) {
1834  return false;
1835  }
1836 
1837  if (!parseTimeOfDay(scursor, send, hour, min, sec, isCRLF)) {
1838  return false;
1839  }
1840 
1841  eatCFWS(scursor, send, isCRLF);
1842  // there might be no timezone but a year following
1843  if ((scursor == send) || isdigit(*scursor)) {
1844  timeZoneKnown = false;
1845  secsEastOfGMT = 0;
1846  return true; // allow missing timezone
1847  }
1848 
1849  timeZoneKnown = true;
1850  if (*scursor == '+' || *scursor == '-') {
1851  // remember and eat '-'/'+':
1852  const char sign = *scursor++;
1853  // numerical timezone:
1854  int maybeTimeZone;
1855  const int tzDigits = parseDigits(scursor, send, maybeTimeZone);
1856  if (tzDigits != 4) {
1857  // Allow timezones in 02:00 format
1858  if (tzDigits == 2 && scursor != send && *scursor == ':') {
1859  scursor++;
1860  int maybeTimeZone2;
1861  if (parseDigits(scursor, send, maybeTimeZone2) != 2) {
1862  return false;
1863  }
1864  maybeTimeZone = maybeTimeZone * 100 + maybeTimeZone2;
1865  } else {
1866  return false;
1867  }
1868  }
1869  secsEastOfGMT = 60 * (maybeTimeZone / 100 * 60 + maybeTimeZone % 100);
1870  if (sign == '-') {
1871  secsEastOfGMT *= -1;
1872  if (secsEastOfGMT == 0) {
1873  timeZoneKnown = false; // -0000 means indetermined tz
1874  }
1875  }
1876  } else {
1877  // maybe alphanumeric timezone:
1878  if (!parseAlphaNumericTimeZone(scursor, send, secsEastOfGMT, timeZoneKnown)) {
1879  return false;
1880  }
1881  }
1882  return true;
1883 }
1884 
1885 bool parseDateTime(const char *&scursor, const char *const send,
1886  QDateTime &result, bool isCRLF)
1887 {
1888  // Parsing date-time; strict mode:
1889  //
1890  // date-time := [ [CFWS] day-name [CFWS] "," ] ; wday
1891  // (expanded) [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
1892  // time
1893  //
1894  // day-name := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
1895  // month-name := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
1896  // "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
1897 
1898  result = QDateTime();
1899 
1900  eatCFWS(scursor, send, isCRLF);
1901  if (scursor == send) {
1902  return false;
1903  }
1904 
1905  //
1906  // let's see if there's a day-of-week:
1907  //
1908  if (parseDayName(scursor, send)) {
1909  eatCFWS(scursor, send, isCRLF);
1910  if (scursor == send) {
1911  return false;
1912  }
1913  // day-name should be followed by ',' but we treat it as optional:
1914  if (*scursor == ',') {
1915  scursor++; // eat ','
1916  eatCFWS(scursor, send, isCRLF);
1917  }
1918  }
1919 
1920  int maybeMonth = -1;
1921  bool asctimeFormat = false;
1922 
1923  // ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993
1924  if (!isdigit(*scursor) && parseMonthName(scursor, send, maybeMonth)) {
1925  asctimeFormat = true;
1926  eatCFWS(scursor, send, isCRLF);
1927  }
1928 
1929  //
1930  // 1*2DIGIT representing "day" (of month):
1931  //
1932  int maybeDay;
1933  if (!parseDigits(scursor, send, maybeDay)) {
1934  return false;
1935  }
1936 
1937  eatCFWS(scursor, send, isCRLF);
1938  if (scursor == send) {
1939  return false;
1940  }
1941 
1942  // ignore ","; bug 54098
1943  if (*scursor == ',') {
1944  scursor++;
1945  }
1946 
1947  //
1948  // month-name:
1949  //
1950  if (!asctimeFormat && !parseMonthName(scursor, send, maybeMonth)) {
1951  return false;
1952  }
1953  if (scursor == send) {
1954  return false;
1955  }
1956  assert(maybeMonth >= 0); assert(maybeMonth <= 11);
1957  ++maybeMonth; // 0-11 -> 1-12
1958 
1959  eatCFWS(scursor, send, isCRLF);
1960  if (scursor == send) {
1961  return false;
1962  }
1963 
1964  // check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS")
1965  bool timeAfterYear = true;
1966  if ((send - scursor > 3) && ((scursor[1] == ':') || (scursor[2] == ':'))) {
1967  timeAfterYear = false; // first read time, then year
1968  }
1969 
1970  //
1971  // 2*DIGIT representing "year":
1972  //
1973  int maybeYear = 0;
1974 
1975  if (timeAfterYear && !parseDigits(scursor, send, maybeYear)) {
1976  return false;
1977  }
1978 
1979  eatCFWS(scursor, send, isCRLF);
1980  if (scursor == send) {
1981  return false;
1982  }
1983 
1984  //
1985  // time
1986  //
1987  int maybeHour, maybeMinute, maybeSecond;
1988  long int secsEastOfGMT;
1989  bool timeZoneKnown = true;
1990 
1991  if (!parseTime(scursor, send,
1992  maybeHour, maybeMinute, maybeSecond,
1993  secsEastOfGMT, timeZoneKnown, isCRLF)) {
1994  return false;
1995  }
1996 
1997  // in asctime() the year follows the time
1998  if (!timeAfterYear) {
1999  eatCFWS(scursor, send, isCRLF);
2000  if (scursor == send) {
2001  return false;
2002  }
2003 
2004  if (!parseDigits(scursor, send, maybeYear)) {
2005  return false;
2006  }
2007  }
2008 
2009  // RFC 2822 4.3 processing:
2010  if (maybeYear < 50) {
2011  maybeYear += 2000;
2012  } else if (maybeYear < 1000) {
2013  maybeYear += 1900;
2014  }
2015  // else keep as is
2016  if (maybeYear < 1900) {
2017  return false; // rfc2822, 3.3
2018  }
2019 
2020  const QDate maybeDate = QDate(maybeYear, maybeMonth, maybeDay);
2021  const QTime maybeTime = QTime(maybeHour, maybeMinute, maybeSecond);
2022 
2023  if (!maybeDate.isValid() || !maybeTime.isValid()) {
2024  return false;
2025  }
2026 
2027  result = QDateTime(maybeDate, maybeTime, Qt::OffsetFromUTC, secsEastOfGMT);
2028  //result = QDateTime( maybeDateTime, QDateTime::Spec( QDateTime::OffsetFromUTC, secsEastOfGMT ) );
2029  if (!result.isValid()) {
2030  return false;
2031  }
2032  return true;
2033 }
2034 
2035 namespace {
2036 
2037 Headers::Base *extractHeader(const QByteArray &head, const int headerStart, int &endOfFieldBody)
2038 {
2039  Headers::Base *header = {};
2040 
2041  int startOfFieldBody = head.indexOf(':', headerStart);
2042  if (startOfFieldBody < 0) {
2043  return nullptr;
2044  }
2045 
2046  const char *rawType = head.constData() + headerStart;
2047  const size_t rawTypeLen = startOfFieldBody - headerStart;
2048 
2049  startOfFieldBody++; //skip the ':'
2050  if (startOfFieldBody < head.size() - 1 && head[startOfFieldBody] == ' ') { // skip the space after the ':', if there's any
2051  startOfFieldBody++;
2052  }
2053 
2054  bool folded = false;
2055  endOfFieldBody = findHeaderLineEnd(head, startOfFieldBody, &folded);
2056 
2057  // We might get an invalid mail without a field name, don't crash on that.
2058  if (rawTypeLen > 0) {
2059  header = HeaderFactory::createHeader(rawType, rawTypeLen);
2060  }
2061  if (!header) {
2062  //qCWarning(KMIME_LOG)() << "Returning Generic header of type" << rawType;
2063  header = new Headers::Generic(rawType, rawTypeLen);
2064  }
2065  if (folded) {
2066  const auto unfoldedBody = unfoldHeader(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2067  header->from7BitString(unfoldedBody);
2068  } else {
2069  header->from7BitString(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2070  }
2071 
2072  return header;
2073 }
2074 
2075 }
2076 
2077 Headers::Base *extractFirstHeader(QByteArray &head)
2078 {
2079  int endOfFieldBody = 0;
2080  auto header = extractHeader(head, 0, endOfFieldBody);
2081  if (header) {
2082  head.remove(0, endOfFieldBody + 1);
2083  } else {
2084  head.clear();
2085  }
2086 
2087  return header;
2088 }
2089 
2090 void extractHeaderAndBody(const QByteArray &content, QByteArray &header, QByteArray &body)
2091 {
2092  header.clear();
2093  body.clear();
2094 
2095  // empty header
2096  if (content.startsWith('\n')) {
2097  body = content.right(content.length() - 1);
2098  return;
2099  }
2100 
2101  int pos = content.indexOf("\n\n", 0);
2102  if (pos > -1) {
2103  header = content.left(++pos); //header *must* end with "\n" !!
2104  body = content.mid(pos + 1);
2105  if (body.startsWith("\n")) {
2106  body = "\n" + body;
2107  }
2108  } else {
2109  header = content;
2110  }
2111 }
2112 
2113 QVector<Headers::Base*> parseHeaders(const QByteArray &head)
2114 {
2116 
2117  int cursor = 0;
2118  while (cursor < head.size()) {
2119  const int headerStart = cursor;
2120  int endOfFieldBody;
2121  if (auto header = extractHeader(head, headerStart, endOfFieldBody)) {
2122  ret << header;
2123  cursor = endOfFieldBody + 1;
2124  } else {
2125  break;
2126  }
2127  }
2128 
2129  return ret;
2130 }
2131 
2132 } // namespace HeaderParsing
2133 
2134 } // namespace KMime
QTextCodec * codecForName(const QString &name) const
void clear()
void append(const T &value)
Represents an (email address, display name) pair according to RFC 2822, section 3.4. ...
Definition: kmime_types.h:37
virtual int maxDecodedSizeFor(int insize, NewlineType newline=NewlineLF) const =0
This file is part of the API for handling MIME data and defines the DateFormatter class...
int size() const const
KCODECS_EXPORT QString decodeRFC2047String(const QString &text)
bool isValid() const const
virtual void from7BitString(const char *s, size_t len)
Parses the given string.
bool startsWith(const QByteArray &ba) const const
virtual Decoder * makeDecoder(NewlineType newline=NewlineLF) const =0
QTextStream & endl(QTextStream &stream)
int length() const const
void chop(int n)
virtual bool decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)=0
bool isNull() const const
OffsetFromUTC
void clear()
void resize(int size)
int indexOf(char ch, int from) const const
void append(const T &value)
Represents an arbitrary header, that can contain any header-field.
void setAddress(const AddrSpec &addr)
Sets the email address.
Baseclass of all header-classes.
const char * constData() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
QByteArray right(int len) const const
This file is part of the API for handling MIME data and defines the various header classes: ...
bool isValid() const const
QByteArray::iterator begin()
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QMap::iterator end()
QByteArray mid(int pos, int len) const const
QMap::iterator begin()
QString toLower() const const
static KCharsets * charsets()
bool isValid() const const
const Key key(const T &value, const Key &defaultKey) const const
const QList< QKeySequence > & end()
QByteArray left(int len) const const
static Codec * codecForName(const char *name)
QString mid(int position, int n) const const
char * data()
QString fromLatin1(const char *str, int size)
virtual const char * name() const =0
QMap::iterator insert(const Key &key, const T &value)
void setName(const QString &name)
Sets the name.
bool isEmpty() const const
int size() const const
QByteArray & remove(int pos, int len)
QString toUnicode(const QByteArray &a) const const
QByteArray::iterator end()
QTextStream & dec(QTextStream &stream)
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Wed Jun 23 2021 23:13:18 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.