KMime

kmime_header_parsing.cpp
1 /* -*- c++ -*-
2  kmime_header_parsing.cpp
3 
4  KMime, the KDE Internet mail/usenet news message library.
5  SPDX-FileCopyrightText: 2001-2002 Marc Mutz <[email protected]>
6 
7  SPDX-License-Identifier: LGPL-2.0-or-later
8 */
9 
10 #include "kmime_header_parsing.h"
11 
12 #include "kmime_headerfactory_p.h"
13 #include "kmime_headers.h"
14 #include "kmime_util.h"
15 #include "kmime_util_p.h"
16 #include "kmime_codecs.h"
17 #include "kmime_dateformatter.h"
18 #include "kmime_debug.h"
19 #include "kmime_warning.h"
20 
21 #include <KCharsets>
22 
23 #include <KCodecs>
24 
25 #include <QTextCodec>
26 #include <QMap>
27 #include <QStringList>
28 
29 #include <ctype.h> // for isdigit
30 #include <cassert>
31 
32 using namespace KMime;
33 using namespace KMime::Types;
34 
35 namespace KMime
36 {
37 
38  namespace Types
39  {
40  // Optimization to avoid allocating QStrings when the value isn't encoded
41  struct KMIME_EXPORT QStringOrQPair {
42  QStringOrQPair() : qstring(), qpair(nullptr, 0) {}
43  QString qstring;
45  };
46  } // namespace Types
47 
48 namespace HeaderParsing
49 {
50 
51 // parse the encoded-word (scursor points to after the initial '=')
52 bool parseEncodedWord(const char *&scursor, const char *const send,
53  QString &result, QByteArray &language,
54  QByteArray &usedCS, const QByteArray &defaultCS,
55  bool forceCS)
56 {
57  // make sure the caller already did a bit of the work.
58  assert(*(scursor - 1) == '=');
59 
60  //
61  // STEP 1:
62  // scan for the charset/language portion of the encoded-word
63  //
64 
65  char ch = *scursor++;
66 
67  if (ch != '?') {
68  // qCDebug(KMIME_LOG) << "first";
69  //KMIME_WARN_PREMATURE_END_OF( EncodedWord );
70  return false;
71  }
72 
73  // remember start of charset (ie. just after the initial "=?") and
74  // language (just after the first '*') fields:
75  const char *charsetStart = scursor;
76  const char *languageStart = nullptr;
77 
78  // find delimiting '?' (and the '*' separating charset and language
79  // tags, if any):
80  for (; scursor != send ; scursor++) {
81  if (*scursor == '?') {
82  break;
83  } else if (*scursor == '*' && languageStart == nullptr) {
84  languageStart = scursor + 1;
85  }
86  }
87 
88  // not found? can't be an encoded-word!
89  if (scursor == send || *scursor != '?') {
90  // qCDebug(KMIME_LOG) << "second";
91  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
92  return false;
93  }
94 
95  // extract the language information, if any (if languageStart is 0,
96  // language will be null, too):
97  QByteArray maybeLanguage(languageStart, scursor - languageStart);
98  // extract charset information (keep in mind: the size given to the
99  // ctor is one off due to the \0 terminator):
100  QByteArray maybeCharset(charsetStart,
101  (languageStart ? languageStart - 1 : scursor) - charsetStart);
102 
103  //
104  // STEP 2:
105  // scan for the encoding portion of the encoded-word
106  //
107 
108  // remember start of encoding (just _after_ the second '?'):
109  scursor++;
110  const char *encodingStart = scursor;
111 
112  // find next '?' (ending the encoding tag):
113  for (; scursor != send ; scursor++) {
114  if (*scursor == '?') {
115  break;
116  }
117  }
118 
119  // not found? Can't be an encoded-word!
120  if (scursor == send || *scursor != '?') {
121  // qCDebug(KMIME_LOG) << "third";
122  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
123  return false;
124  }
125 
126  // extract the encoding information:
127  QByteArray maybeEncoding(encodingStart, scursor - encodingStart);
128 
129  // qCDebug(KMIME_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset
130  // << "\"; language == \"" << maybeLanguage
131  // << "\"; encoding == \"" << maybeEncoding << "\"";
132 
133  //
134  // STEP 3:
135  // scan for encoded-text portion of encoded-word
136  //
137 
138  // remember start of encoded-text (just after the third '?'):
139  scursor++;
140  const char *encodedTextStart = scursor;
141 
142  // find the '?=' sequence (ending the encoded-text):
143  for (; scursor != send ; scursor++) {
144  if (*scursor == '?') {
145  if (scursor + 1 != send) {
146  if (*(scursor + 1) != '=') { // We expect a '=' after the '?', but we got something else; ignore
147  KMIME_WARN << "Stray '?' in q-encoded word, ignoring this.";
148  continue;
149  } else { // yep, found a '?=' sequence
150  scursor += 2;
151  break;
152  }
153  } else { // The '?' is the last char, but we need a '=' after it!
154  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
155  return false;
156  }
157  }
158  }
159 
160  if (*(scursor - 2) != '?' || *(scursor - 1) != '=' ||
161  scursor < encodedTextStart + 2) {
162  KMIME_WARN_PREMATURE_END_OF(EncodedWord);
163  return false;
164  }
165 
166  // set end sentinel for encoded-text:
167  const char *const encodedTextEnd = scursor - 2;
168 
169  //
170  // STEP 4:
171  // setup decoders for the transfer encoding and the charset
172  //
173 
174  // try if there's a codec for the encoding found:
175  KCodecs::Codec *codec = KCodecs::Codec::codecForName(maybeEncoding);
176  if (!codec) {
177  KMIME_WARN_UNKNOWN(Encoding, maybeEncoding);
178  return false;
179  }
180 
181  // get an instance of a corresponding decoder:
182  KCodecs::Decoder *dec = codec->makeDecoder();
183  assert(dec);
184 
185  // try if there's a (text)codec for the charset found:
186  bool matchOK = false;
187  QTextCodec *textCodec = nullptr;
188  if (forceCS || maybeCharset.isEmpty()) {
189  textCodec = KCharsets::charsets()->codecForName(QLatin1String(defaultCS), matchOK);
190  usedCS = cachedCharset(defaultCS);
191  } else {
192  textCodec = KCharsets::charsets()->codecForName(QLatin1String(maybeCharset), matchOK);
193  if (!matchOK) { //no suitable codec found => use default charset
194  textCodec = KCharsets::charsets()->codecForName(QLatin1String(defaultCS), matchOK);
195  usedCS = cachedCharset(defaultCS);
196  } else {
197  usedCS = cachedCharset(maybeCharset);
198  }
199  }
200 
201  if (!matchOK || !textCodec) {
202  KMIME_WARN_UNKNOWN(Charset, maybeCharset);
203  delete dec;
204  return false;
205  };
206 
207  // qCDebug(KMIME_LOG) << "mimeName(): \"" << textCodec->name() << "\"";
208 
209  // allocate a temporary buffer to store the 8bit text:
210  int encodedTextLength = encodedTextEnd - encodedTextStart;
211  QByteArray buffer;
212  buffer.resize(codec->maxDecodedSizeFor(encodedTextLength));
213  char *bbegin = buffer.data();
214  char *bend = bbegin + buffer.length();
215 
216  //
217  // STEP 5:
218  // do the actual decoding
219  //
220 
221  if (!dec->decode(encodedTextStart, encodedTextEnd, bbegin, bend)) {
222  KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
223  << encodedTextLength << ")\nresult may be truncated";
224  }
225 
226  result = textCodec->toUnicode(buffer.data(), bbegin - buffer.data());
227 
228  // qCDebug(KMIME_LOG) << "result now: \"" << result << "\"";
229  // cleanup:
230  delete dec;
231  language = maybeLanguage;
232 
233  return true;
234 }
235 
// Advances scursor over any run of space, tab, CR and LF characters.
// Stops at send or at the first character that is none of those.
static inline void eatWhiteSpace(const char *&scursor, const char *const send)
{
    for (; scursor != send; ++scursor) {
        switch (*scursor) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            continue; // still whitespace: move on to the next char
        default:
            return;
        }
    }
}
244 
245 bool parseAtom(const char*&scursor, const char *const send,
246  QByteArray &result, bool allow8Bit)
247 {
248  QPair<const char *, int> maybeResult;
249 
250  if (parseAtom(scursor, send, maybeResult, allow8Bit)) {
251  result = QByteArray(maybeResult.first, maybeResult.second);
252  return true;
253  }
254 
255  return false;
256 }
257 
258 bool parseAtom(const char*&scursor, const char *const send,
259  QPair<const char *, int> &result, bool allow8Bit)
260 {
261  bool success = false;
262  const char *start = scursor;
263 
264  while (scursor != send) {
265  signed char ch = *scursor++;
266  if (ch > 0 && isAText(ch)) {
267  // AText: OK
268  success = true;
269  } else if (allow8Bit && ch < 0) {
270  // 8bit char: not OK, but be tolerant.
271  KMIME_WARN_8BIT(ch);
272  success = true;
273  } else {
274  // CTL or special - marking the end of the atom:
275  // re-set sursor to point to the offending
276  // char and return:
277  scursor--;
278  break;
279  }
280  }
281  result.first = start;
282  result.second = scursor - start;
283  return success;
284 }
285 
286 bool parseToken(const char*&scursor, const char *const send,
287  QByteArray &result, ParseTokenFlags flags)
288 {
289  QPair<const char *, int> maybeResult;
290 
291  if (parseToken(scursor, send, maybeResult, flags)) {
292  result = QByteArray(maybeResult.first, maybeResult.second);
293  return true;
294  }
295 
296  return false;
297 }
298 
299 bool parseToken(const char*&scursor, const char *const send,
300  QPair<const char *, int> &result, ParseTokenFlags flags)
301 {
302  bool success = false;
303  const char *start = scursor;
304 
305  while (scursor != send) {
306  signed char ch = *scursor++;
307  if (ch > 0 && isTText(ch)) {
308  // TText: OK
309  success = true;
310  } else if ((flags & ParseTokenAllow8Bit) && ch < 0) {
311  // 8bit char: not OK, but be tolerant.
312  KMIME_WARN_8BIT(ch);
313  success = true;
314  } else if ((flags & ParseTokenRelaxedTText) && ch == '/') {
315  success = true;
316  } else {
317  // CTL or tspecial - marking the end of the atom:
318  // re-set sursor to point to the offending
319  // char and return:
320  scursor--;
321  break;
322  }
323  }
324  result.first = start;
325  result.second = scursor - start;
326  return success;
327 }
328 
329 #define READ_ch_OR_FAIL if ( scursor == send ) { \
330  KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
331  return false; \
332  } else { \
333  ch = *scursor++; \
334  }
335 
336 // known issues:
337 //
338 // - doesn't handle quoted CRLF
339 
340 bool parseGenericQuotedString(const char *&scursor, const char *const send,
341  QString &result, bool isCRLF,
342  const char openChar, const char closeChar)
343 {
344  // We are in a quoted-string or domain-literal or comment and the
345  // cursor points to the first char after the openChar.
346  // We will apply unfolding and quoted-pair removal.
347  // We return when we either encounter the end or unescaped openChar
348  // or closeChar.
349  assert(*(scursor - 1) == openChar || *(scursor - 1) == closeChar);
350 
351  while (scursor != send) {
352  char ch = *scursor++;
353 
354  if (ch == closeChar || ch == openChar) {
355  // end of quoted-string or another opening char:
356  // let caller decide what to do.
357  return true;
358  }
359 
360  switch (ch) {
361  case '\\': // quoted-pair
362  // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
363  READ_ch_OR_FAIL;
364  KMIME_WARN_IF_8BIT(ch);
365  result += QLatin1Char(ch);
366  break;
367  case '\r':
368  // ###
369  // The case of lonely '\r' is easy to solve, as they're
370  // not part of Unix Line-ending conventions.
371  // But I see a problem if we are given Unix-native
372  // line-ending-mails, where we cannot determine anymore
373  // whether a given '\n' was part of a CRLF or was occurring
374  // on it's own.
375  READ_ch_OR_FAIL;
376  if (ch != '\n') {
377  // CR on it's own...
378  KMIME_WARN_LONE(CR);
379  result += QLatin1Char('\r');
380  scursor--; // points to after the '\r' again
381  } else {
382  // CRLF encountered.
383  // lookahead: check for folding
384  READ_ch_OR_FAIL;
385  if (ch == ' ' || ch == '\t') {
386  // correct folding;
387  // position cursor behind the CRLF WSP (unfolding)
388  // and add the WSP to the result
389  result += QLatin1Char(ch);
390  } else {
391  // this is the "shouldn't happen"-case. There is a CRLF
392  // inside a quoted-string without it being part of FWS.
393  // We take it verbatim.
394  KMIME_WARN_NON_FOLDING(CRLF);
395  result += QLatin1String("\r\n");
396  // the cursor is decremented again, so's we need not
397  // duplicate the whole switch here. "ch" could've been
398  // everything (incl. openChar or closeChar).
399  scursor--;
400  }
401  }
402  break;
403  case '\n':
404  // Note: CRLF has been handled above already!
405  // ### LF needs special treatment, depending on whether isCRLF
406  // is true (we can be sure a lonely '\n' was meant this way) or
407  // false ('\n' alone could have meant LF or CRLF in the original
408  // message. This parser assumes CRLF iff the LF is followed by
409  // either WSP (folding) or NULL (premature end of quoted-string;
410  // Should be fixed, since NULL is allowed as per rfc822).
411  READ_ch_OR_FAIL;
412  if (!isCRLF && (ch == ' ' || ch == '\t')) {
413  // folding
414  // correct folding
415  result += QLatin1Char(ch);
416  } else {
417  // non-folding
418  KMIME_WARN_LONE(LF);
419  result += QLatin1Char('\n');
420  // pos is decremented, so's we need not duplicate the whole
421  // switch here. ch could've been everything (incl. <">, "\").
422  scursor--;
423  }
424  break;
425  case '=': {
426  // ### Work around broken clients that send encoded words in quoted-strings
427  // For example, older KMail versions.
428  if (scursor == send) {
429  break;
430  }
431 
432  const char *oldscursor = scursor;
433  QString tmp;
434  QByteArray lang;
435  QByteArray charset;
436  if (*scursor++ == '?') {
437  --scursor;
438  if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
439  result += tmp;
440  //qDebug() << " tmp " << tmp;
441  if (scursor == send) {
442  break;
443  } else if (*scursor++ == ' ') { //Workaround Bug 362650 thunderbird add space for each new line
444  if (scursor == send) {
445  --scursor;
446  break;
447  } else if (*scursor++ == '=') {
448  if (scursor == send) {
449  --scursor;
450  --scursor;
451  break;
452  } else if (*scursor++ == '?') {
453  --scursor;
454  --scursor;
455  break;
456  }
457  } else {
458  --scursor;
459  --scursor;
460  }
461  } else {
462  --scursor;
463  }
464 
465  break;
466  } else {
467  scursor = oldscursor;
468  }
469  } else {
470  scursor = oldscursor;
471  }
472  // fall through
473  Q_FALLTHROUGH();
474  }
475  default:
476  KMIME_WARN_IF_8BIT(ch);
477  result += QLatin1Char(ch);
478  }
479  }
480 
481  return false;
482 }
483 
484 // known issues:
485 //
486 // - doesn't handle encoded-word inside comments.
487 
488 bool parseComment(const char *&scursor, const char *const send,
489  QString &result, bool isCRLF, bool reallySave)
490 {
491  int commentNestingDepth = 1;
492  const char *afterLastClosingParenPos = nullptr;
493  QString maybeCmnt;
494  const char *oldscursor = scursor;
495 
496  assert(*(scursor - 1) == '(');
497 
498  while (commentNestingDepth) {
499  QString cmntPart;
500  if (parseGenericQuotedString(scursor, send, cmntPart, isCRLF, '(', ')')) {
501  assert(*(scursor - 1) == ')' || *(scursor - 1) == '(');
502  // see the kdoc for above function for the possible conditions
503  // we have to check:
504  switch (*(scursor - 1)) {
505  case ')':
506  if (reallySave) {
507  // add the chunk that's now surely inside the comment.
508  result += maybeCmnt;
509  result += cmntPart;
510  if (commentNestingDepth > 1) {
511  // don't add the outermost ')'...
512  result += QLatin1Char(')');
513  }
514  maybeCmnt.clear();
515  }
516  afterLastClosingParenPos = scursor;
517  --commentNestingDepth;
518  break;
519  case '(':
520  if (reallySave) {
521  // don't add to "result" yet, because we might find that we
522  // are already outside the (broken) comment...
523  maybeCmnt += cmntPart;
524  maybeCmnt += QLatin1Char('(');
525  }
526  ++commentNestingDepth;
527  break;
528  default: assert(0);
529  } // switch
530  } else {
531  // !parseGenericQuotedString, ie. premature end
532  if (afterLastClosingParenPos) {
533  scursor = afterLastClosingParenPos;
534  } else {
535  scursor = oldscursor;
536  }
537  return false;
538  }
539  } // while
540 
541  return true;
542 }
543 
544 // known issues: none.
545 
546 bool parsePhrase(const char *&scursor, const char *const send,
547  QString &result, bool isCRLF)
548 {
549  enum {
550  None, Phrase, Atom, EncodedWord, QuotedString
551  } found = None;
552 
553  QString tmp;
554  QByteArray lang;
555  QByteArray charset;
556  QPair<const char *, int> tmpAtom;
557  const char *successfullyParsed = nullptr;
558  // only used by the encoded-word branch
559  const char *oldscursor;
560  // used to suppress whitespace between adjacent encoded-words
561  // (rfc2047, 6.2):
562  bool lastWasEncodedWord = false;
563 
564  while (scursor != send) {
565  char ch = *scursor++;
566  switch (ch) {
567  case '.': // broken, but allow for intorop's sake
568  if (found == None) {
569  --scursor;
570  return false;
571  } else {
572  if (scursor != send && (*scursor == ' ' || *scursor == '\t')) {
573  result += QLatin1String(". ");
574  } else {
575  result += QLatin1Char('.');
576  }
577  successfullyParsed = scursor;
578  }
579  break;
580  case '"': // quoted-string
581  tmp.clear();
582  if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
583  successfullyParsed = scursor;
584  assert(*(scursor - 1) == '"');
585  switch (found) {
586  case None:
587  found = QuotedString;
588  break;
589  case Phrase:
590  case Atom:
591  case EncodedWord:
592  case QuotedString:
593  found = Phrase;
594  result += QLatin1Char(' '); // rfc822, 3.4.4
595  break;
596  default:
597  assert(0);
598  }
599  lastWasEncodedWord = false;
600  result += tmp;
601  } else {
602  // premature end of quoted string.
603  // What to do? Return leading '"' as special? Return as quoted-string?
604  // We do the latter if we already found something, else signal failure.
605  if (found == None) {
606  return false;
607  } else {
608  result += QLatin1Char(' '); // rfc822, 3.4.4
609  result += tmp;
610  return true;
611  }
612  }
613  break;
614  case '(': // comment
615  // parse it, but ignore content:
616  tmp.clear();
617  if (parseComment(scursor, send, tmp, isCRLF,
618  false /*don't bother with the content*/)) {
619  successfullyParsed = scursor;
620  lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
621  } else {
622  if (found == None) {
623  return false;
624  } else {
625  scursor = successfullyParsed;
626  return true;
627  }
628  }
629  break;
630  case '=': // encoded-word
631  tmp.clear();
632  oldscursor = scursor;
633  lang.clear();
634  charset.clear();
635  if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
636  successfullyParsed = scursor;
637  switch (found) {
638  case None:
639  found = EncodedWord;
640  break;
641  case Phrase:
642  case EncodedWord:
643  case Atom:
644  case QuotedString:
645  if (!lastWasEncodedWord) {
646  result += QLatin1Char(' '); // rfc822, 3.4.4
647  }
648  found = Phrase;
649  break;
650  default: assert(0);
651  }
652  lastWasEncodedWord = true;
653  result += tmp;
654  break;
655  } else {
656  // parse as atom:
657  scursor = oldscursor;
658  }
659  Q_FALLTHROUGH();
660  // fall though...
661 
662  default: //atom
663  scursor--;
664  if (parseAtom(scursor, send, tmpAtom, true /* allow 8bit */)) {
665  successfullyParsed = scursor;
666  switch (found) {
667  case None:
668  found = Atom;
669  break;
670  case Phrase:
671  case Atom:
672  case EncodedWord:
673  case QuotedString:
674  found = Phrase;
675  result += QLatin1Char(' '); // rfc822, 3.4.4
676  break;
677  default:
678  assert(0);
679  }
680  lastWasEncodedWord = false;
681  result += QLatin1String(tmpAtom.first, tmpAtom.second);
682  } else {
683  if (found == None) {
684  return false;
685  } else {
686  scursor = successfullyParsed;
687  return true;
688  }
689  }
690  }
691  eatWhiteSpace(scursor, send);
692  }
693 
694  return found != None;
695 }
696 
697 bool parseDotAtom(const char *&scursor, const char *const send,
698  QByteArray &result, bool isCRLF)
699 {
700  eatCFWS(scursor, send, isCRLF);
701 
702  // always points to just after the last atom parsed:
703  const char *successfullyParsed;
704 
705  QByteArray tmp;
706  if (!parseAtom(scursor, send, tmp, false /* no 8bit */)) {
707  return false;
708  }
709  result += tmp;
710  successfullyParsed = scursor;
711 
712  while (scursor != send) {
713 
714  // end of header or no '.' -> return
715  if (scursor == send || *scursor != '.') {
716  return true;
717  }
718  scursor++; // eat '.'
719 
720  if (scursor == send || !isAText(*scursor)) {
721  // end of header or no AText, but this time following a '.'!:
722  // reset cursor to just after last successfully parsed char and
723  // return:
724  scursor = successfullyParsed;
725  return true;
726  }
727 
728  // try to parse the next atom:
729  QByteArray maybeAtom;
730  if (!parseAtom(scursor, send, maybeAtom, false /*no 8bit*/)) {
731  scursor = successfullyParsed;
732  return true;
733  }
734 
735  result += '.';
736  result += maybeAtom;
737  successfullyParsed = scursor;
738  }
739 
740  scursor = successfullyParsed;
741  return true;
742 }
743 
744 void eatCFWS(const char *&scursor, const char *const send, bool isCRLF)
745 {
746  QString dummy;
747 
748  while (scursor != send) {
749  const char *oldscursor = scursor;
750 
751  char ch = *scursor++;
752 
753  switch (ch) {
754  case ' ':
755  case '\t': // whitespace
756  case '\r':
757  case '\n': // folding
758  continue;
759 
760  case '(': // comment
761  if (parseComment(scursor, send, dummy, isCRLF, false /*don't save*/)) {
762  continue;
763  }
764  scursor = oldscursor;
765  return;
766 
767  default:
768  scursor = oldscursor;
769  return;
770  }
771  }
772 }
773 
774 bool parseDomain(const char *&scursor, const char *const send,
775  QString &result, bool isCRLF)
776 {
777  eatCFWS(scursor, send, isCRLF);
778  if (scursor == send) {
779  return false;
780  }
781 
782  // domain := dot-atom / domain-literal / atom *("." atom)
783  //
784  // equivalent to:
785  // domain = dot-atom / domain-literal,
786  // since parseDotAtom does allow CFWS between atoms and dots
787 
788  if (*scursor == '[') {
789  // domain-literal:
790  QString maybeDomainLiteral;
791  // eat '[':
792  scursor++;
793  while (parseGenericQuotedString(scursor, send, maybeDomainLiteral,
794  isCRLF, '[', ']')) {
795  if (scursor == send) {
796  // end of header: check for closing ']':
797  if (*(scursor - 1) == ']') {
798  // OK, last char was ']':
799  result = maybeDomainLiteral;
800  return true;
801  } else {
802  // not OK, domain-literal wasn't closed:
803  return false;
804  }
805  }
806  // we hit openChar in parseGenericQuotedString.
807  // include it in maybeDomainLiteral and keep on parsing:
808  if (*(scursor - 1) == '[') {
809  maybeDomainLiteral += QLatin1Char('[');
810  continue;
811  }
812  // OK, real end of domain-literal:
813  result = maybeDomainLiteral;
814  return true;
815  }
816  } else {
817  // dot-atom:
818  QByteArray maybeDotAtom;
819  if (parseDotAtom(scursor, send, maybeDotAtom, isCRLF)) {
820  // Domain may end with '.', if so preserve it'
821  if (scursor != send && *scursor == '.') {
822  maybeDotAtom += '.';
823  scursor++;
824  }
825  result = QString::fromLatin1(maybeDotAtom);
826  return true;
827  }
828  }
829  return false;
830 }
831 
832 bool parseObsRoute(const char *&scursor, const char *const send,
833  QStringList &result, bool isCRLF, bool save)
834 {
835  while (scursor != send) {
836  eatCFWS(scursor, send, isCRLF);
837  if (scursor == send) {
838  return false;
839  }
840 
841  // empty entry:
842  if (*scursor == ',') {
843  scursor++;
844  if (save) {
845  result.append(QString());
846  }
847  continue;
848  }
849 
850  // empty entry ending the list:
851  if (*scursor == ':') {
852  scursor++;
853  if (save) {
854  result.append(QString());
855  }
856  return true;
857  }
858 
859  // each non-empty entry must begin with '@':
860  if (*scursor != '@') {
861  return false;
862  } else {
863  scursor++;
864  }
865 
866  QString maybeDomain;
867  if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
868  return false;
869  }
870  if (save) {
871  result.append(maybeDomain);
872  }
873 
874  // eat the following (optional) comma:
875  eatCFWS(scursor, send, isCRLF);
876  if (scursor == send) {
877  return false;
878  }
879  if (*scursor == ':') {
880  scursor++;
881  return true;
882  }
883  if (*scursor == ',') {
884  scursor++;
885  }
886  }
887 
888  return false;
889 }
890 
891 bool parseAddrSpec(const char *&scursor, const char *const send,
892  AddrSpec &result, bool isCRLF)
893 {
894  //
895  // STEP 1:
896  // local-part := dot-atom / quoted-string / word *("." word)
897  //
898  // this is equivalent to:
899  // local-part := word *("." word)
900 
901  QString maybeLocalPart;
902  QString tmp;
903  QPair<const char *, int> tmpAtom;
904 
905  while (scursor != send) {
906  // first, eat any whitespace
907  eatCFWS(scursor, send, isCRLF);
908 
909  char ch = *scursor++;
910  switch (ch) {
911  case '.': // dot
912  maybeLocalPart += QLatin1Char('.');
913  break;
914 
915  case '@':
916  goto SAW_AT_SIGN;
917  break;
918 
919  case '"': // quoted-string
920  tmp.clear();
921  if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
922  maybeLocalPart += tmp;
923  } else {
924  return false;
925  }
926  break;
927 
928  default: // atom
929  scursor--; // re-set scursor to point to ch again
930  if (parseAtom(scursor, send, tmpAtom, false /* no 8bit */)) {
931  maybeLocalPart += QLatin1String(tmpAtom.first, tmpAtom.second);
932  } else {
933  return false; // parseAtom can only fail if the first char is non-atext.
934  }
935  break;
936  }
937  }
938 
939  return false;
940 
941  //
942  // STEP 2:
943  // domain
944  //
945 
946 SAW_AT_SIGN:
947 
948  assert(*(scursor - 1) == '@');
949 
950  QString maybeDomain;
951  if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
952  return false;
953  }
954 
955  result.localPart = maybeLocalPart;
956  result.domain = maybeDomain;
957 
958  return true;
959 }
960 
961 bool parseAngleAddr(const char *&scursor, const char *const send,
962  AddrSpec &result, bool isCRLF)
963 {
964  // first, we need an opening angle bracket:
965  eatCFWS(scursor, send, isCRLF);
966  if (scursor == send || *scursor != '<') {
967  return false;
968  }
969  scursor++; // eat '<'
970 
971  eatCFWS(scursor, send, isCRLF);
972  if (scursor == send) {
973  return false;
974  }
975 
976  if (*scursor == '@' || *scursor == ',') {
977  // obs-route: parse, but ignore:
978  KMIME_WARN << "obsolete source route found! ignoring.";
979  QStringList dummy;
980  if (!parseObsRoute(scursor, send, dummy,
981  isCRLF, false /* don't save */)) {
982  return false;
983  }
984  // angle-addr isn't complete until after the '>':
985  if (scursor == send) {
986  return false;
987  }
988  }
989 
990  // parse addr-spec:
991  AddrSpec maybeAddrSpec;
992  if (!parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
993  return false;
994  }
995 
996  eatCFWS(scursor, send, isCRLF);
997  if (scursor == send || *scursor != '>') {
998  return false;
999  }
1000  scursor++;
1001 
1002  result = maybeAddrSpec;
1003  return true;
1004 
1005 }
1006 
1007 static QString stripQuotes(const QString &input)
1008 {
1009  const QLatin1Char quotes('"');
1010  if (input.startsWith(quotes) && input.endsWith(quotes)) {
1011  QString stripped(input.mid(1, input.size() - 2));
1012  return stripped;
1013  } else {
1014  return input;
1015  }
1016 }
1017 
1018 bool parseMailbox(const char *&scursor, const char *const send,
1019  Mailbox &result, bool isCRLF)
1020 {
1021  eatCFWS(scursor, send, isCRLF);
1022  if (scursor == send) {
1023  return false;
1024  }
1025 
1026  AddrSpec maybeAddrSpec;
1027  QString maybeDisplayName;
1028 
1029  // first, try if it's a vanilla addr-spec:
1030  const char *oldscursor = scursor;
1031  if (parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
1032  result.setAddress(maybeAddrSpec);
1033  // check for the obsolete form of display-name (as comment):
1034  eatWhiteSpace(scursor, send);
1035  if (scursor != send && *scursor == '(') {
1036  scursor++;
1037  if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1038  return false;
1039  }
1040  }
1041  result.setName(stripQuotes(maybeDisplayName));
1042  return true;
1043  }
1044  scursor = oldscursor;
1045 
1046  // second, see if there's a display-name:
1047  if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1048  // failed: reset cursor, note absent display-name
1049  maybeDisplayName.clear();
1050  scursor = oldscursor;
1051  } else {
1052  // succeeded: eat CFWS
1053  eatCFWS(scursor, send, isCRLF);
1054  if (scursor == send) {
1055  return false;
1056  }
1057  }
1058 
1059  // third, parse the angle-addr:
1060  if (!parseAngleAddr(scursor, send, maybeAddrSpec, isCRLF)) {
1061  return false;
1062  }
1063 
1064  if (maybeDisplayName.isNull()) {
1065  // check for the obsolete form of display-name (as comment):
1066  eatWhiteSpace(scursor, send);
1067  if (scursor != send && *scursor == '(') {
1068  scursor++;
1069  if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1070  return false;
1071  }
1072  }
1073  }
1074 
1075  result.setName(stripQuotes(maybeDisplayName));
1076  result.setAddress(maybeAddrSpec);
1077  return true;
1078 }
1079 
1080 bool parseGroup(const char *&scursor, const char *const send,
1081  Address &result, bool isCRLF)
1082 {
1083  // group := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
1084  //
1085  // equivalent to:
1086  // group := display-name ":" [ obs-mbox-list ] ";"
1087 
1088  eatCFWS(scursor, send, isCRLF);
1089  if (scursor == send) {
1090  return false;
1091  }
1092 
1093  // get display-name:
1094  QString maybeDisplayName;
1095  if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1096  return false;
1097  }
1098 
1099  // get ":":
1100  eatCFWS(scursor, send, isCRLF);
1101  if (scursor == send || *scursor != ':') {
1102  return false;
1103  }
1104 
1105  // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that
1106  // automatically calls removeBidiControlChars
1107  result.displayName = removeBidiControlChars(maybeDisplayName);
1108 
1109  // get obs-mbox-list (may contain empty entries):
1110  scursor++;
1111  while (scursor != send) {
1112  eatCFWS(scursor, send, isCRLF);
1113  if (scursor == send) {
1114  return false;
1115  }
1116 
1117  // empty entry:
1118  if (*scursor == ',') {
1119  scursor++;
1120  continue;
1121  }
1122 
1123  // empty entry ending the list:
1124  if (*scursor == ';') {
1125  scursor++;
1126  return true;
1127  }
1128 
1129  Mailbox maybeMailbox;
1130  if (!parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1131  return false;
1132  }
1133  result.mailboxList.append(maybeMailbox);
1134 
1135  eatCFWS(scursor, send, isCRLF);
1136  // premature end:
1137  if (scursor == send) {
1138  return false;
1139  }
1140  // regular end of the list:
1141  if (*scursor == ';') {
1142  scursor++;
1143  return true;
1144  }
1145  // eat regular list entry separator:
1146  if (*scursor == ',') {
1147  scursor++;
1148  }
1149  }
1150  return false;
1151 }
1152 
1153 bool parseAddress(const char *&scursor, const char *const send,
1154  Address &result, bool isCRLF)
1155 {
1156  // address := mailbox / group
1157 
1158  eatCFWS(scursor, send, isCRLF);
1159  if (scursor == send) {
1160  return false;
1161  }
1162 
1163  // first try if it's a single mailbox:
1164  Mailbox maybeMailbox;
1165  const char *oldscursor = scursor;
1166  if (parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1167  // yes, it is:
1168  result.displayName.clear();
1169  result.mailboxList.append(maybeMailbox);
1170  return true;
1171  }
1172  scursor = oldscursor;
1173 
1174  Address maybeAddress;
1175 
1176  // no, it's not a single mailbox. Try if it's a group:
1177  if (!parseGroup(scursor, send, maybeAddress, isCRLF)) {
1178  return false;
1179  }
1180 
1181  result = maybeAddress;
1182  return true;
1183 }
1184 
1185 bool parseAddressList(const char *&scursor, const char *const send,
1186  AddressList &result, bool isCRLF)
1187 {
1188  while (scursor != send) {
1189  eatCFWS(scursor, send, isCRLF);
1190  // end of header: this is OK.
1191  if (scursor == send) {
1192  return true;
1193  }
1194  // empty entry: ignore:
1195  if (*scursor == ',') {
1196  scursor++;
1197  continue;
1198  }
1199  // broken clients might use ';' as list delimiter, accept that as well
1200  if (*scursor == ';') {
1201  scursor++;
1202  continue;
1203  }
1204 
1205  // parse one entry
1206  Address maybeAddress;
1207  if (!parseAddress(scursor, send, maybeAddress, isCRLF)) {
1208  return false;
1209  }
1210  result.append(maybeAddress);
1211 
1212  eatCFWS(scursor, send, isCRLF);
1213  // end of header: this is OK.
1214  if (scursor == send) {
1215  return true;
1216  }
1217  // comma separating entries: eat it.
1218  if (*scursor == ',') {
1219  scursor++;
1220  }
1221  }
1222  return true;
1223 }
1224 
1225 static bool parseParameter(const char *&scursor, const char *const send,
1226  QPair<QString, QStringOrQPair> &result, bool isCRLF)
1227 {
1228  // parameter = regular-parameter / extended-parameter
1229  // regular-parameter = regular-parameter-name "=" value
1230  // extended-parameter =
1231  // value = token / quoted-string
1232  //
1233  // note that rfc2231 handling is out of the scope of this function.
1234  // Therefore we return the attribute as QByteArray and the value as
1235  // (start,length) tuple if we see that the value is encoded
1236  // (trailing asterisk), for parseParameterList to decode...
1237 
1238  eatCFWS(scursor, send, isCRLF);
1239  if (scursor == send) {
1240  return false;
1241  }
1242 
1243  //
1244  // parse the parameter name:
1245  //
1246  QByteArray tmpAttr;
1247  if (!parseToken(scursor, send, tmpAttr, ParseTokenNoFlag)) {
1248  return false;
1249  }
1250  // FIXME: we could use QMap<QByteArray, ...> in the API for parameters
1251  QString maybeAttribute = QString::fromLatin1(tmpAttr);
1252 
1253  eatCFWS(scursor, send, isCRLF);
1254  // premature end: not OK (haven't seen '=' yet).
1255  if (scursor == send || *scursor != '=') {
1256  return false;
1257  }
1258  scursor++; // eat '='
1259 
1260  eatCFWS(scursor, send, isCRLF);
1261  if (scursor == send) {
1262  // don't choke on attribute=, meaning the value was omitted:
1263  if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1264  KMIME_WARN << "attribute ends with \"*\", but value is empty!"
1265  "Chopping away \"*\".";
1266  maybeAttribute.chop(1);
1267  }
1268  result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1269  return true;
1270  }
1271 
1272  const char *oldscursor = scursor;
1273 
1274  //
1275  // parse the parameter value:
1276  //
1277  QStringOrQPair maybeValue;
1278  if (*scursor == '"') {
1279  // value is a quoted-string:
1280  scursor++;
1281  if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1282  // attributes ending with "*" designate extended-parameters,
1283  // which cannot have quoted-strings as values. So we remove the
1284  // trailing "*" to not confuse upper layers.
1285  KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
1286  "Chopping away \"*\".";
1287  maybeAttribute.chop(1);
1288  }
1289 
1290  if (!parseGenericQuotedString(scursor, send, maybeValue.qstring, isCRLF)) {
1291  scursor = oldscursor;
1292  result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1293  return false; // this case needs further processing by upper layers!!
1294  }
1295  } else {
1296  // value is a token:
1297  if (!parseToken(scursor, send, maybeValue.qpair, ParseTokenRelaxedTText)) {
1298  scursor = oldscursor;
1299  result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1300  return false; // this case needs further processing by upper layers!!
1301  }
1302  }
1303 
1304  result = qMakePair(maybeAttribute.toLower(), maybeValue);
1305  return true;
1306 }
1307 
1308 static bool parseRawParameterList(const char *&scursor, const char *const send,
1310  bool isCRLF)
1311 {
1312  // we use parseParameter() consecutively to obtain a map of raw
1313  // attributes to raw values. "Raw" here means that we don't do
1314  // rfc2231 decoding and concatenation. This is left to
1315  // parseParameterList(), which will call this function.
1316  //
1317  // The main reason for making this chunk of code a separate
1318  // (private) method is that we can deal with broken parameters
1319  // _here_ and leave the rfc2231 handling solely to
1320  // parseParameterList(), which will still be enough work.
1321  while (scursor != send) {
1322  eatCFWS(scursor, send, isCRLF);
1323  // empty entry ending the list: OK.
1324  if (scursor == send) {
1325  return true;
1326  }
1327  // empty list entry: ignore.
1328  if (*scursor == ';') {
1329  scursor++;
1330  continue;
1331  }
1332  QPair<QString, QStringOrQPair> maybeParameter;
1333  if (!parseParameter(scursor, send, maybeParameter, isCRLF)) {
1334  // we need to do a bit of work if the attribute is not
1335  // NULL. These are the cases marked with "needs further
1336  // processing" in parseParameter(). Specifically, parsing of the
1337  // token or the quoted-string, which should represent the value,
1338  // failed. We take the easy way out and simply search for the
1339  // next ';' to start parsing again. (Another option would be to
1340  // take the text between '=' and ';' as value)
1341  if (maybeParameter.first.isNull()) {
1342  return false;
1343  }
1344  while (scursor != send) {
1345  if (*scursor++ == ';') {
1346  goto IS_SEMICOLON;
1347  }
1348  }
1349  // scursor == send case: end of list.
1350  return true;
1351  IS_SEMICOLON:
1352  // *scursor == ';' case: parse next entry.
1353  continue;
1354  }
1355  // successful parsing brings us here:
1356  result.insert(maybeParameter.first, maybeParameter.second);
1357 
1358  eatCFWS(scursor, send, isCRLF);
1359  // end of header: ends list.
1360  if (scursor == send) {
1361  return true;
1362  }
1363  // regular separator: eat it.
1364  if (*scursor == ';') {
1365  scursor++;
1366  }
1367  }
1368  return true;
1369 }
1370 
1371 static void decodeRFC2231Value(KCodecs::Codec *&rfc2231Codec,
1372  QTextCodec *&textcodec,
1373  bool isContinuation, QString &value,
1374  QPair<const char *, int> &source, QByteArray &charset)
1375 {
1376  //
1377  // parse the raw value into (charset,language,text):
1378  //
1379 
1380  const char *decBegin = source.first;
1381  const char *decCursor = decBegin;
1382  const char *decEnd = decCursor + source.second;
1383 
1384  if (!isContinuation) {
1385  // find the first single quote
1386  while (decCursor != decEnd) {
1387  if (*decCursor == '\'') {
1388  break;
1389  } else {
1390  decCursor++;
1391  }
1392  }
1393 
1394  if (decCursor == decEnd) {
1395  // there wasn't a single single quote at all!
1396  // take the whole value to be in latin-1:
1397  KMIME_WARN << "No charset in extended-initial-value."
1398  "Assuming \"iso-8859-1\".";
1399  value += QString::fromLatin1(decBegin, source.second);
1400  return;
1401  }
1402 
1403  charset = QByteArray(decBegin, decCursor - decBegin);
1404 
1405  const char *oldDecCursor = ++decCursor;
1406  // find the second single quote (we ignore the language tag):
1407  while (decCursor != decEnd) {
1408  if (*decCursor == '\'') {
1409  break;
1410  } else {
1411  decCursor++;
1412  }
1413  }
1414  if (decCursor == decEnd) {
1415  KMIME_WARN << "No language in extended-initial-value."
1416  "Trying to recover.";
1417  decCursor = oldDecCursor;
1418  } else {
1419  decCursor++;
1420  }
1421 
1422  // decCursor now points to the start of the
1423  // "extended-other-values":
1424 
1425  //
1426  // get the decoders:
1427  //
1428 
1429  bool matchOK = false;
1430  textcodec = KCharsets::charsets()->codecForName(QLatin1String(charset), matchOK);
1431  if (!matchOK) {
1432  textcodec = nullptr;
1433  KMIME_WARN_UNKNOWN(Charset, charset);
1434  }
1435  }
1436 
1437  if (!rfc2231Codec) {
1438  rfc2231Codec = KCodecs::Codec::codecForName("x-kmime-rfc2231");
1439  assert(rfc2231Codec);
1440  }
1441 
1442  if (!textcodec) {
1443  value += QString::fromLatin1(decCursor, decEnd - decCursor);
1444  return;
1445  }
1446 
1447  KCodecs::Decoder *dec = rfc2231Codec->makeDecoder();
1448  assert(dec);
1449 
1450  //
1451  // do the decoding:
1452  //
1453 
1454  QByteArray buffer;
1455  buffer.resize(rfc2231Codec->maxDecodedSizeFor(decEnd - decCursor));
1456  QByteArray::Iterator bit = buffer.begin();
1457  QByteArray::ConstIterator bend = buffer.end();
1458 
1459  if (!dec->decode(decCursor, decEnd, bit, bend)) {
1460  KMIME_WARN << rfc2231Codec->name()
1461  << "codec lies about its maxDecodedSizeFor()"
1462  << Qt::endl
1463  << "result may be truncated";
1464  }
1465 
1466  value += textcodec->toUnicode(buffer.begin(), bit - buffer.begin());
1467 
1468  // qCDebug(KMIME_LOG) << "value now: \"" << value << "\"";
1469  // cleanup:
1470  delete dec;
1471 }
1472 
1473 // known issues:
1474 // - permutes rfc2231 continuations when the total number of parts
1475 // exceeds 10 (other-sections then becomes *xy, ie. two digits)
1476 
1477 bool parseParameterListWithCharset(const char *&scursor,
1478  const char *const send,
1479  QMap<QString, QString> &result,
1480  QByteArray &charset, bool isCRLF)
1481 {
1482 // parse the list into raw attribute-value pairs:
1483  QMap<QString, QStringOrQPair> rawParameterList;
1484  if (!parseRawParameterList(scursor, send, rawParameterList, isCRLF)) {
1485  return false;
1486  }
1487 
1488  if (rawParameterList.isEmpty()) {
1489  return true;
1490  }
1491 
1492  // decode rfc 2231 continuations and alternate charset encoding:
1493 
1494  // NOTE: this code assumes that what QMapIterator delivers is sorted
1495  // by the key!
1496 
1497  KCodecs::Codec *rfc2231Codec = nullptr;
1498  QTextCodec *textcodec = nullptr;
1499  QString attribute;
1500  QString value;
1501  enum Mode {
1502  NoMode = 0x0, Continued = 0x1, Encoded = 0x2
1503  };
1504 
1505  enum EncodingMode {
1506  NoEncoding,
1507  RFC2047,
1508  RFC2231
1509  };
1510 
1512  QMap<QString, QStringOrQPair>::Iterator end = rawParameterList.end();
1513 
1514  for (it = rawParameterList.begin() ; it != end ; ++it) {
1515  if (attribute.isNull() || !it.key().startsWith(attribute)) {
1516  //
1517  // new attribute:
1518  //
1519 
1520  // store the last attribute/value pair in the result map now:
1521  if (!attribute.isNull()) {
1522  result.insert(attribute, value);
1523  }
1524  // and extract the information from the new raw attribute:
1525  value.clear();
1526  attribute = it.key();
1527  int mode = NoMode;
1528  EncodingMode encodingMode = NoEncoding;
1529 
1530  // is the value rfc2331-encoded?
1531  if (attribute.endsWith(QLatin1Char('*'))) {
1532  attribute.chop(1);
1533  mode |= Encoded;
1534  encodingMode = RFC2231;
1535  }
1536  // is the value rfc2047-encoded?
1537  if (!(*it).qstring.isNull() && (*it).qstring.contains(QLatin1String("=?"))) {
1538  mode |= Encoded;
1539  encodingMode = RFC2047;
1540  }
1541  // is the value continued?
1542  if (attribute.endsWith(QLatin1String("*0"))) {
1543  attribute.chop(2);
1544  mode |= Continued;
1545  }
1546  //
1547  // decode if necessary:
1548  //
1549  if (mode & Encoded) {
1550  if (encodingMode == RFC2231) {
1551  decodeRFC2231Value(rfc2231Codec, textcodec,
1552  false, /* isn't continuation */
1553  value, (*it).qpair, charset);
1554  } else if (encodingMode == RFC2047) {
1555  value += KCodecs::decodeRFC2047String((*it).qstring.toLatin1(), &charset);
1556  }
1557  } else {
1558  // not encoded.
1559  if ((*it).qpair.first) {
1560  value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1561  } else {
1562  value += (*it).qstring;
1563  }
1564  }
1565 
1566  //
1567  // shortcut-processing when the value isn't encoded:
1568  //
1569 
1570  if (!(mode & Continued)) {
1571  // save result already:
1572  result.insert(attribute, value);
1573  // force begin of a new attribute:
1574  attribute.clear();
1575  }
1576  } else { // it.key().startsWith( attribute )
1577  //
1578  // continuation
1579  //
1580 
1581  // ignore the section and trust QMap to have sorted the keys:
1582  if (it.key().endsWith(QLatin1Char('*'))) {
1583  // encoded
1584  decodeRFC2231Value(rfc2231Codec, textcodec,
1585  true, /* is continuation */
1586  value, (*it).qpair, charset);
1587  } else {
1588  // not encoded
1589  if ((*it).qpair.first) {
1590  value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1591  } else {
1592  value += (*it).qstring;
1593  }
1594  }
1595  }
1596  }
1597  // write last attr/value pair:
1598  if (!attribute.isNull()) {
1599  result.insert(attribute, value);
1600  }
1601 
1602  return true;
1603 }
1604 
1605 bool parseParameterList(const char *&scursor, const char *const send,
1606  QMap<QString, QString> &result, bool isCRLF)
1607 {
1608  QByteArray charset;
1609  return parseParameterListWithCharset(scursor, send, result, charset, isCRLF);
1610 }
1611 
1612 static const char stdDayNames[][4] = {
1613  "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
1614 };
1615 static const int stdDayNamesLen = sizeof stdDayNames / sizeof *stdDayNames;
1616 
1617 static bool parseDayName(const char *&scursor, const char *const send)
1618 {
1619  // check bounds:
1620  if (send - scursor < 3) {
1621  return false;
1622  }
1623 
1624  for (int i = 0 ; i < stdDayNamesLen ; ++i) {
1625  if (qstrnicmp(scursor, stdDayNames[i], 3) == 0) {
1626  scursor += 3;
1627  // qCDebug(KMIME_LOG) << "found" << stdDayNames[i];
1628  return true;
1629  }
1630  }
1631 
1632  return false;
1633 }
1634 
1635 static const char stdMonthNames[][4] = {
1636  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
1637  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
1638 };
1639 static const int stdMonthNamesLen =
1640  sizeof stdMonthNames / sizeof *stdMonthNames;
1641 
1642 static bool parseMonthName(const char *&scursor, const char *const send,
1643  int &result)
1644 {
1645  // check bounds:
1646  if (send - scursor < 3) {
1647  return false;
1648  }
1649 
1650  for (result = 0 ; result < stdMonthNamesLen ; ++result) {
1651  if (qstrnicmp(scursor, stdMonthNames[result], 3) == 0) {
1652  scursor += 3;
1653  return true;
1654  }
1655  }
1656 
1657  // not found:
1658  return false;
1659 }
1660 
// Table of alphabetic time zone names and their offsets from GMT in seconds
// (positive = east of GMT). Lookups take the first entry whose name matches,
// so every name must appear only once.
static const struct {
    const char tzName[5];
    long int secsEastOfGMT;
} timeZones[] = {
    // rfc 822 timezones:
    { "GMT", 0 },
    { "UT", 0 },
    { "EDT", -4 * 3600 },
    { "EST", -5 * 3600 },
    { "CDT", -5 * 3600 },
    { "CST", -6 * 3600 },
    { "MDT", -6 * 3600 },
    { "MST", -7 * 3600 },
    { "PDT", -7 * 3600 },
    { "PST", -8 * 3600 },
    // common, non-rfc-822 zones:
    { "CET", 1 * 3600 },
    { "MET", 1 * 3600 },
    { "UTC", 0 },
    { "CEST", 2 * 3600 },
    { "BST", 1 * 3600 },
    // rfc 822 military timezones:
    { "Z", 0 },
    { "A", -1 * 3600 },
    { "B", -2 * 3600 },
    { "C", -3 * 3600 },
    { "D", -4 * 3600 },
    { "E", -5 * 3600 },
    { "F", -6 * 3600 },
    { "G", -7 * 3600 },
    { "H", -8 * 3600 },
    { "I", -9 * 3600 },
    // J is not used!
    { "K", -10 * 3600 },
    { "L", -11 * 3600 },
    { "M", -12 * 3600 },
    { "N", 1 * 3600 },
    { "O", 2 * 3600 },
    { "P", 3 * 3600 },
    { "Q", 4 * 3600 },
    { "R", 5 * 3600 },
    { "S", 6 * 3600 },
    { "T", 7 * 3600 },
    { "U", 8 * 3600 },
    { "V", 9 * 3600 },
    { "W", 10 * 3600 },
    { "X", 11 * 3600 },
    { "Y", 12 * 3600 },
};
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
1711 
1712 static bool parseAlphaNumericTimeZone(const char *&scursor,
1713  const char *const send,
1714  long int &secsEastOfGMT,
1715  bool &timeZoneKnown)
1716 {
1717  // allow the timezone to be wrapped in quotes; bug 260761
1718  if (scursor < send && *scursor == '"') {
1719  scursor++;
1720 
1721  if (scursor == send) {
1722  return false;
1723  }
1724  }
1725 
1726  QPair<const char *, int> maybeTimeZone(nullptr, 0);
1727  if (!parseToken(scursor, send, maybeTimeZone, ParseTokenNoFlag)) {
1728  return false;
1729  }
1730  for (int i = 0 ; i < timeZonesLen ; ++i) {
1731  if (qstrnicmp(timeZones[i].tzName,
1732  maybeTimeZone.first, maybeTimeZone.second) == 0) {
1733  scursor += maybeTimeZone.second;
1734  secsEastOfGMT = timeZones[i].secsEastOfGMT;
1735  timeZoneKnown = true;
1736 
1737  if (scursor < send && *scursor == '"') {
1738  scursor++;
1739  }
1740 
1741  return true;
1742  }
1743  }
1744 
1745  // don't choke just because we don't happen to know the time zone
1746  KMIME_WARN_UNKNOWN(time zone,
1747  QByteArray(maybeTimeZone.first, maybeTimeZone.second));
1748  secsEastOfGMT = 0;
1749  timeZoneKnown = false;
1750  return true;
1751 }
1752 
// Parses a run of decimal digits starting at scursor.
//
// result receives the value of the digits (0 if there are none). Values that
// do not fit into an int are clamped to the largest int instead of
// overflowing. The cursor is moved behind the last digit consumed.
//
// Returns the number of digits consumed, i.e. 0 if the cursor does not point
// at a digit.
int parseDigits(const char *&scursor, const char *const send, int &result)
{
    // largest int value, spelled out via unsigned arithmetic so that no
    // additional header is needed:
    const int maxValue = int(~0u >> 1);

    result = 0;
    int digits = 0;
    // isdigit() is only defined for values representable as unsigned char,
    // and the input may contain 8-bit characters:
    for (; scursor != send && isdigit(static_cast<unsigned char>(*scursor)) ; scursor++, digits++) {
        const int digit = int(*scursor - '0');
        if (result > (maxValue - digit) / 10) {
            // result * 10 + digit would overflow: saturate.
            result = maxValue;
        } else {
            result = result * 10 + digit;
        }
    }
    return digits;
}
1764 
1765 static bool parseTimeOfDay(const char *&scursor, const char *const send,
1766  int &hour, int &min, int &sec, bool isCRLF = false)
1767 {
1768  // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
1769 
1770  //
1771  // 2DIGIT representing "hour":
1772  //
1773  if (!parseDigits(scursor, send, hour)) {
1774  return false;
1775  }
1776 
1777  eatCFWS(scursor, send, isCRLF);
1778  if (scursor == send || *scursor != ':') {
1779  return false;
1780  }
1781  scursor++; // eat ':'
1782 
1783  eatCFWS(scursor, send, isCRLF);
1784  if (scursor == send) {
1785  return false;
1786  }
1787 
1788  //
1789  // 2DIGIT representing "minute":
1790  //
1791  if (!parseDigits(scursor, send, min)) {
1792  return false;
1793  }
1794 
1795  eatCFWS(scursor, send, isCRLF);
1796  if (scursor == send) {
1797  return true; // seconds are optional
1798  }
1799 
1800  //
1801  // let's see if we have a 2DIGIT representing "second":
1802  //
1803  if (*scursor == ':') {
1804  // yepp, there are seconds:
1805  scursor++; // eat ':'
1806  eatCFWS(scursor, send, isCRLF);
1807  if (scursor == send) {
1808  return false;
1809  }
1810 
1811  if (!parseDigits(scursor, send, sec)) {
1812  return false;
1813  }
1814  } else {
1815  sec = 0;
1816  }
1817 
1818  return true;
1819 }
1820 
1821 bool parseTime(const char *&scursor, const char *send,
1822  int &hour, int &min, int &sec, long int &secsEastOfGMT,
1823  bool &timeZoneKnown, bool isCRLF)
1824 {
1825  // time := time-of-day CFWS ( zone / obs-zone )
1826  //
1827  // obs-zone := "UT" / "GMT" /
1828  // "EST" / "EDT" / ; -0500 / -0400
1829  // "CST" / "CDT" / ; -0600 / -0500
1830  // "MST" / "MDT" / ; -0700 / -0600
1831  // "PST" / "PDT" / ; -0800 / -0700
1832  // "A"-"I" / "a"-"i" /
1833  // "K"-"Z" / "k"-"z"
1834 
1835  eatCFWS(scursor, send, isCRLF);
1836  if (scursor == send) {
1837  return false;
1838  }
1839 
1840  if (!parseTimeOfDay(scursor, send, hour, min, sec, isCRLF)) {
1841  return false;
1842  }
1843 
1844  eatCFWS(scursor, send, isCRLF);
1845  // there might be no timezone but a year following
1846  if ((scursor == send) || isdigit(*scursor)) {
1847  timeZoneKnown = false;
1848  secsEastOfGMT = 0;
1849  return true; // allow missing timezone
1850  }
1851 
1852  timeZoneKnown = true;
1853  if (*scursor == '+' || *scursor == '-') {
1854  // remember and eat '-'/'+':
1855  const char sign = *scursor++;
1856  // numerical timezone:
1857  int maybeTimeZone;
1858  const int tzDigits = parseDigits(scursor, send, maybeTimeZone);
1859  if (tzDigits != 4) {
1860  // Allow timezones in 02:00 format
1861  if (tzDigits == 2 && scursor != send && *scursor == ':') {
1862  scursor++;
1863  int maybeTimeZone2;
1864  if (parseDigits(scursor, send, maybeTimeZone2) != 2) {
1865  return false;
1866  }
1867  maybeTimeZone = maybeTimeZone * 100 + maybeTimeZone2;
1868  } else {
1869  return false;
1870  }
1871  }
1872  secsEastOfGMT = 60 * (maybeTimeZone / 100 * 60 + maybeTimeZone % 100);
1873  if (sign == '-') {
1874  secsEastOfGMT *= -1;
1875  if (secsEastOfGMT == 0) {
1876  timeZoneKnown = false; // -0000 means indetermined tz
1877  }
1878  }
1879  } else {
1880  // maybe alphanumeric timezone:
1881  if (!parseAlphaNumericTimeZone(scursor, send, secsEastOfGMT, timeZoneKnown)) {
1882  return false;
1883  }
1884  }
1885  return true;
1886 }
1887 
1888 bool parseDateTime(const char *&scursor, const char *const send,
1889  QDateTime &result, bool isCRLF)
1890 {
1891  // Parsing date-time; strict mode:
1892  //
1893  // date-time := [ [CFWS] day-name [CFWS] "," ] ; wday
1894  // (expanded) [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
1895  // time
1896  //
1897  // day-name := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
1898  // month-name := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
1899  // "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
1900 
1901  result = QDateTime();
1902 
1903  eatCFWS(scursor, send, isCRLF);
1904  if (scursor == send) {
1905  return false;
1906  }
1907 
1908  //
1909  // let's see if there's a day-of-week:
1910  //
1911  if (parseDayName(scursor, send)) {
1912  eatCFWS(scursor, send, isCRLF);
1913  if (scursor == send) {
1914  return false;
1915  }
1916  // day-name should be followed by ',' but we treat it as optional:
1917  if (*scursor == ',') {
1918  scursor++; // eat ','
1919  eatCFWS(scursor, send, isCRLF);
1920  }
1921  }
1922 
1923  int maybeMonth = -1;
1924  bool asctimeFormat = false;
1925 
1926  // ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993
1927  if (!isdigit(*scursor) && parseMonthName(scursor, send, maybeMonth)) {
1928  asctimeFormat = true;
1929  eatCFWS(scursor, send, isCRLF);
1930  }
1931 
1932  //
1933  // 1*2DIGIT representing "day" (of month):
1934  //
1935  int maybeDay;
1936  if (!parseDigits(scursor, send, maybeDay)) {
1937  return false;
1938  }
1939 
1940  eatCFWS(scursor, send, isCRLF);
1941  if (scursor == send) {
1942  return false;
1943  }
1944 
1945  // ignore ","; bug 54098
1946  if (*scursor == ',') {
1947  scursor++;
1948  }
1949 
1950  //
1951  // month-name:
1952  //
1953  if (!asctimeFormat && !parseMonthName(scursor, send, maybeMonth)) {
1954  return false;
1955  }
1956  if (scursor == send) {
1957  return false;
1958  }
1959  assert(maybeMonth >= 0); assert(maybeMonth <= 11);
1960  ++maybeMonth; // 0-11 -> 1-12
1961 
1962  eatCFWS(scursor, send, isCRLF);
1963  if (scursor == send) {
1964  return false;
1965  }
1966 
1967  // check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS")
1968  bool timeAfterYear = true;
1969  if ((send - scursor > 3) && ((scursor[1] == ':') || (scursor[2] == ':'))) {
1970  timeAfterYear = false; // first read time, then year
1971  }
1972 
1973  //
1974  // 2*DIGIT representing "year":
1975  //
1976  int maybeYear = 0;
1977 
1978  if (timeAfterYear && !parseDigits(scursor, send, maybeYear)) {
1979  return false;
1980  }
1981 
1982  eatCFWS(scursor, send, isCRLF);
1983  if (scursor == send) {
1984  return false;
1985  }
1986 
1987  //
1988  // time
1989  //
1990  int maybeHour;
1991  int maybeMinute;
1992  int maybeSecond;
1993  long int secsEastOfGMT;
1994  bool timeZoneKnown = true;
1995 
1996  if (!parseTime(scursor, send,
1997  maybeHour, maybeMinute, maybeSecond,
1998  secsEastOfGMT, timeZoneKnown, isCRLF)) {
1999  return false;
2000  }
2001 
2002  // in asctime() the year follows the time
2003  if (!timeAfterYear) {
2004  eatCFWS(scursor, send, isCRLF);
2005  if (scursor == send) {
2006  return false;
2007  }
2008 
2009  if (!parseDigits(scursor, send, maybeYear)) {
2010  return false;
2011  }
2012  }
2013 
2014  // RFC 2822 4.3 processing:
2015  if (maybeYear < 50) {
2016  maybeYear += 2000;
2017  } else if (maybeYear < 1000) {
2018  maybeYear += 1900;
2019  }
2020  // else keep as is
2021  if (maybeYear < 1900) {
2022  return false; // rfc2822, 3.3
2023  }
2024 
2025  const QDate maybeDate = QDate(maybeYear, maybeMonth, maybeDay);
2026  const QTime maybeTime = QTime(maybeHour, maybeMinute, maybeSecond);
2027 
2028  if (!maybeDate.isValid() || !maybeTime.isValid()) {
2029  return false;
2030  }
2031 
2032  result = QDateTime(maybeDate, maybeTime, Qt::OffsetFromUTC, secsEastOfGMT);
2033  //result = QDateTime( maybeDateTime, QDateTime::Spec( QDateTime::OffsetFromUTC, secsEastOfGMT ) );
2034  if (!result.isValid()) {
2035  return false;
2036  }
2037  return true;
2038 }
2039 
2040 namespace {
2041 
2042 Headers::Base *extractHeader(const QByteArray &head, const int headerStart, int &endOfFieldBody)
2043 {
2044  Headers::Base *header = {};
2045 
2046  int startOfFieldBody = head.indexOf(':', headerStart);
2047  if (startOfFieldBody < 0) {
2048  return nullptr;
2049  }
2050 
2051  const char *rawType = head.constData() + headerStart;
2052  const size_t rawTypeLen = startOfFieldBody - headerStart;
2053 
2054  startOfFieldBody++; //skip the ':'
2055  if (startOfFieldBody < head.size() - 1 && head[startOfFieldBody] == ' ') { // skip the space after the ':', if there's any
2056  startOfFieldBody++;
2057  }
2058 
2059  bool folded = false;
2060  endOfFieldBody = findHeaderLineEnd(head, startOfFieldBody, &folded);
2061 
2062  // We might get an invalid mail without a field name, don't crash on that.
2063  if (rawTypeLen > 0) {
2064  header = HeaderFactory::createHeader(rawType, rawTypeLen);
2065  }
2066  if (!header) {
2067  //qCWarning(KMIME_LOG)() << "Returning Generic header of type" << rawType;
2068  header = new Headers::Generic(rawType, rawTypeLen);
2069  }
2070  if (folded) {
2071  const auto unfoldedBody = unfoldHeader(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2072  header->from7BitString(unfoldedBody);
2073  } else {
2074  header->from7BitString(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2075  }
2076 
2077  return header;
2078 }
2079 
2080 }
2081 
2082 Headers::Base *extractFirstHeader(QByteArray &head)
2083 {
2084  int endOfFieldBody = 0;
2085  auto header = extractHeader(head, 0, endOfFieldBody);
2086  if (header) {
2087  head.remove(0, endOfFieldBody + 1);
2088  } else {
2089  head.clear();
2090  }
2091 
2092  return header;
2093 }
2094 
2095 void extractHeaderAndBody(const QByteArray &content, QByteArray &header, QByteArray &body)
2096 {
2097  header.clear();
2098  body.clear();
2099 
2100  // empty header
2101  if (content.startsWith('\n')) {
2102  body = content.right(content.length() - 1);
2103  return;
2104  }
2105 
2106  int pos = content.indexOf("\n\n", 0);
2107  if (pos > -1) {
2108  header = content.left(++pos); //header *must* end with "\n" !!
2109  body = content.mid(pos + 1);
2110  if (body.startsWith("\n")) {
2111  body = "\n" + body;
2112  }
2113  } else {
2114  header = content;
2115  }
2116 }
2117 
2118 QVector<Headers::Base*> parseHeaders(const QByteArray &head)
2119 {
2121 
2122  int cursor = 0;
2123  while (cursor < head.size()) {
2124  const int headerStart = cursor;
2125  int endOfFieldBody;
2126  if (auto header = extractHeader(head, headerStart, endOfFieldBody)) {
2127  ret << header;
2128  cursor = endOfFieldBody + 1;
2129  } else {
2130  break;
2131  }
2132  }
2133 
2134  return ret;
2135 }
2136 
2137 } // namespace HeaderParsing
2138 
2139 } // namespace KMime
QTextCodec * codecForName(const QString &name) const
void clear()
void append(const T &value)
Represents an (email address, display name) pair according to RFC 2822, section 3.4. ...
Definition: kmime_types.h:37
virtual int maxDecodedSizeFor(int insize, NewlineType newline=NewlineLF) const =0
This file is part of the API for handling MIME data and defines the DateFormatter class...
int size() const const
KCODECS_EXPORT QString decodeRFC2047String(const QString &text)
bool isValid() const const
virtual void from7BitString(const char *s, size_t len)
Parses the given string.
bool startsWith(const QByteArray &ba) const const
virtual Decoder * makeDecoder(NewlineType newline=NewlineLF) const =0
QTextStream & endl(QTextStream &stream)
int length() const const
void chop(int n)
virtual bool decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend)=0
bool isNull() const const
OffsetFromUTC
void clear()
void resize(int size)
int indexOf(char ch, int from) const const
void append(const T &value)
Represents an arbitrary header, that can contain any header-field.
void setAddress(const AddrSpec &addr)
Sets the email address.
Baseclass of all header-classes.
const char * constData() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
QByteArray right(int len) const const
This file is part of the API for handling MIME data and defines the various header classes: ...
bool isValid() const const
QByteArray::iterator begin()
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QMap::iterator end()
QByteArray mid(int pos, int len) const const
QMap::iterator begin()
QString toLower() const const
static KCharsets * charsets()
bool isValid() const const
const Key key(const T &value, const Key &defaultKey) const const
const QList< QKeySequence > & end()
QByteArray left(int len) const const
static Codec * codecForName(const char *name)
QString mid(int position, int n) const const
char * data()
QString fromLatin1(const char *str, int size)
virtual const char * name() const =0
QMap::iterator insert(const Key &key, const T &value)
void setName(const QString &name)
Sets the name.
bool isEmpty() const const
int size() const const
QByteArray & remove(int pos, int len)
QString toUnicode(const QByteArray &a) const const
QByteArray::iterator end()
QTextStream & dec(QTextStream &stream)
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Sun Sep 26 2021 23:14:30 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.