• Skip to content
  • Skip to link menu
KDE 4.4 API Reference
  • KDE API Reference
  • KDE Support
  • Sitemap
  • Contact Us
 

strigi/src/streams

mailinputstream.cpp

Go to the documentation of this file.
00001 /* This file is part of Strigi Desktop Search
00002  *
00003  * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
00004  *
00005  * This library is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU Library General Public
00007  * License as published by the Free Software Foundation; either
00008  * version 2 of the License, or (at your option) any later version.
00009  *
00010  * This library is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013  * Library General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU Library General Public License
00016  * along with this library; see the file COPYING.LIB.  If not, write to
00017  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00018  * Boston, MA 02110-1301, USA.
00019  */
00020 
00021 #ifdef HAVE_CONFIG_H
00022 # include "config.h"
00023 #endif
00024 
00025 
00026 #include "mailinputstream.h"
00027 #include "subinputstream.h"
00028 #include "stringterminatedsubstream.h"
00029 #include "base64inputstream.h"
00030 #include "iconv.h"
00031 #include <cstring>
00032 #include <sstream>
00033 #include <iostream>
00034 
00035 #ifdef HAVE_STRINGS_H
00036 #include <strings.h>
00037 #endif
00038 
00039 #ifdef ICONV_SECOND_ARGUMENT_IS_CONST
00040      #define ICONV_CONST const
00041 #else
00042      #define ICONV_CONST
00043 #endif
00044 
00045 using namespace std;
00046 using namespace Strigi;
00047 
/**
 * Convert a single hexadecimal digit character to its numeric value.
 *
 * 'A'-'F' and 'a'-'f' map to 10-15. Every other character is treated as a
 * decimal digit and yields (h - '0'); the caller is expected to have
 * validated the character beforehand.
 */
char
decodeHex(char h) {
    int value;
    if ('A' <= h && h <= 'F') {
        value = h - 'A' + 10;
    } else if ('a' <= h && h <= 'f') {
        value = h - 'a' + 10;
    } else {
        value = h - '0';
    }
    return static_cast<char>(value);
}
00054 
/**
 * Converts text from a named character set to UTF-8 with iconv.
 *
 * One conversion descriptor is cached per encoding name (including failed
 * attempts, which are stored as (iconv_t)-1) and all of them are released
 * in the destructor. The scratch output buffer is kept between calls.
 */
class Decoder {
private:
    char* buffer;       // scratch output buffer managed with realloc/free
    size_t bufferlen;   // size of 'buffer' in bytes
    std::map<std::string, iconv_t> iconvs;  // encoding name -> descriptor

    // The object owns a malloc'ed buffer and iconv descriptors; a copy
    // would release them twice, so copying is disabled (declared only).
    Decoder(const Decoder&);
    Decoder& operator=(const Decoder&);
public:
    Decoder() :buffer(0), bufferlen(0) {}
    ~Decoder() {
        free(buffer);
        std::map<std::string, iconv_t>::const_iterator i;
        for (i = iconvs.begin(); i != iconvs.end(); ++i) {
            // descriptors of encodings that could not be opened are cached
            // as (iconv_t)-1 and must not be passed to iconv_close()
            if (i->second != (iconv_t)-1) {
                iconv_close(i->second);
            }
        }
    }
    /**
     * Convert 'data' in place from encoding 'enc' to UTF-8.
     */
    void decode(const std::string& enc, std::string& data);
};
00071 void
00072 Decoder::decode(const string& enc, string& data) {
00073     iconv_t conv;
00074     if (iconvs.find(enc) == iconvs.end()) {
00075         conv = iconvs[enc] = iconv_open("UTF-8", enc.c_str());
00076     } else {
00077         conv = iconvs[enc];
00078     }
00079     if (conv == (iconv_t)-1) return;
00080     ICONV_CONST char* in = (char*)data.c_str();
00081     size_t ilen = data.length();
00082     size_t olen = 4*ilen;
00083     if (olen > bufferlen) {
00084         bufferlen = olen;
00085         buffer = (char*)realloc(buffer, bufferlen);
00086     }
00087     if (olen > 0) {
00088         char* out = buffer;
00089         char* mem = out;
00090         size_t r = iconv(conv, &in, &ilen, &out, &olen);
00091         if (r != (size_t)-1) {
00092             data.assign(mem, out-mem);
00093         }
00094     }
00095 }
00096 
/**
 * Decoder for quoted-printable ('Q') encoded text.
 *
 * The decoded text is kept in a member string that is reused by every
 * call, so a returned reference is only valid until the next call.
 */
class QuotedPrintableDecoder {
public:
    /**
     * Decode 'len' bytes starting at 'v' and return a reference to the
     * internal result buffer.
     */
    std::string& decodeQuotedPrintable(const char* v, uint32_t len);
private:
    // result buffer of the most recent decodeQuotedPrintable() call
    std::string decoded;
};
00103 class HeaderDecoder : public QuotedPrintableDecoder, Decoder {
00104 private:
00105     string decoded;
00106 public:
00107     const string& decodedHeaderValue(const char* v, uint32_t len);
00108 };
00109 
00110 string&
00111 QuotedPrintableDecoder::decodeQuotedPrintable(const char* v, uint32_t len) {
00112     if (decoded.size() < len) {
00113         decoded.reserve(len);
00114     }
00115     decoded.resize(0);
00116     const char* pos = v;
00117     const char* end = v + len;
00118     char c;
00119     while (v < end) {
00120         if (*v == '=' && end - v > 2 && isxdigit(v[1]) && isxdigit(v[2])) {
00121             decoded.append(pos, v - pos);
00122             c = decodeHex(v[1])*16 + decodeHex(v[2]);
00123             decoded.append(&c, 1);
00124             pos = v = v + 3;
00125         } else if (*v == '_') {
00126             decoded.append(pos, v - pos);
00127             decoded.append(" ");
00128             pos = v = v + 1;
00129         } else {
00130             v++;
00131         }
00132     }
00133     if (pos < end) {
00134         decoded.append(pos, end-pos);
00135     }
00136     return decoded;
00137 }
00141 const string&
00142 HeaderDecoder::decodedHeaderValue(const char* v, uint32_t len) {
00143     if (decoded.size() < len) {
00144         decoded.reserve(len*2);
00145     }
00146     decoded.resize(0);
00147     const char* s = v;
00148     const char* p = v;
00149     const char* e = s + len;
00150     while (s < e) {
00151         if (e-s > 8 && *s == '=' && s[1] == '?') {
00152             // start of encoded data, find the next position of '?','?' and '?='
00153             const char *q1, *q2, *end;
00154             q1 = s+2;
00155             while (q1 < e && *q1 != '?') q1++;
00156             q2 = q1+1;
00157             while (q2 < e && *q2 != '?') q2++;
00158             end = q2+1;
00159             while (end < e && *end != '?') end++;
00160             if (e - end < 1 || end[1] != '=') {
00161                 s++;
00162                 continue;
00163             }
00164             // save the stuff from before the encoding
00165             decoded.append(p, s-p);
00166             s += 2;
00167             q1++;
00168             q2++;
00169             // find the end
00170             if (*q1 == 'b' || *q1 == 'B') {
00171                 string str(Base64InputStream::decode(q2, end-q2));
00172                 if (strncasecmp("utf-8", s, 5)) {
00173                     string encoding(s, q1-s-1);
00174                     decode(encoding, str);
00175                 }
00176                 decoded.append(str);
00177             } else if (*q1 == 'q' || *q1 =='Q') {
00178                 string& str(decodeQuotedPrintable(q2, end-q2));
00179                 if (strncasecmp("utf-8", s, 5) != 0) {
00180                     string encoding(s, q1-s-1);
00181                     decode(encoding, str);
00182                 }
00183                 decoded.append(str);
00184             } else {
00185                 s -= 1;
00186             }
00187 
00188             // continue after the quoted data
00189             s = p = end + 2;
00190         } else if (e-s > 3 && s[0] == 13 && s[1] == 10
00191                 && (s[2] == 9 || s[2] == 32)) {
00192             // skip the CRLF WSP used for folding the header lines
00193             decoded.append(p, s-p);
00194             s = p = s + 4;
00195         } else {
00196             s++;
00197         }
00198     }
00199     if (p < e) {
00200         decoded.append(p, e-p);
00201     }
00202     return decoded;
00203 }
/**
 * Check whether the text at 'data' starts with one of the header keys
 * "Received:" or "From:" (compared case-insensitively).
 *
 * @param data start of the text to check
 * @param left number of bytes available at 'data'
 * @return true if one of the keys fits into 'left' bytes and matches
 */
bool
checkHeaderKey(const char* data, int32_t left) {
    static const struct {
        const char* name;
        int32_t length;
    } keys[] = {
        { "Received:", 9 },
        { "From:", 5 }
    };
    const int count = sizeof(keys)/sizeof(keys[0]);
    for (int i = 0; i < count; ++i) {
        if (left >= keys[i].length
                && strncasecmp(keys[i].name, data, keys[i].length) == 0) {
            return true;
        }
    }
    return false;
}
00214 
00220 bool
00221 MailInputStream::checkHeader(const char* data, int32_t datasize) {
00222     // the fileheader should contain a required header and have at least 5
00223     // header lines
00224     // 'Received' or 'From' (case insensitive)
00225     int linecount = 1;
00226     bool key = true;
00227     bool slashr = false;
00228     int32_t pos = 0;
00229     bool reqheader = checkHeaderKey(data, datasize);
00230     char prevc = 0;
00231     while (pos < datasize) {
00232         unsigned char c = data[pos++];
00233         if (slashr) {
00234             slashr = false;
00235             if (c == '\n') {
00236                 if (!reqheader) {
00237                     reqheader = checkHeaderKey(data+pos, datasize-pos);
00238                 }
00239                 continue;
00240             }
00241         }
00242         if (key) {
00243             if (c == ':' || (isblank(c) && isspace(prevc))) {
00244                 // ':' signals the end of the key, a line starting with space
00245                 // is a continuation of the previous line's value
00246                 key = false;
00247             } else if ((c == '\n' || c == '\r') && reqheader && linecount >= 5
00248                     && (prevc == '\n' || prevc == '\r')) {
00249                 // if at least 5 header lines were read and an empty line is
00250                 // encountered, the mail header is valid
00251                 return true;
00252             } else if (c != '-' && c != '.' && c != '_' && !isalnum(c)
00253                 && c != '#') {
00254                 // an invalid character in the key
00255                 return false;
00256             }
00257         } else {
00258             // check that the text is 7-bit
00259             if (c == '\n' || c == '\r') {
00260                 // a new line starts, so a new key
00261                 key = true;
00262                 linecount++;
00263                 // enable reading of \r\n line endings
00264                 if (c == '\r') {
00265                     slashr = true;
00266                 } else if (!reqheader) {
00267                     reqheader = checkHeaderKey(data+pos, datasize-pos);
00268                 }
00269             }
00270         }
00271         prevc = c;
00272     }
00273     return reqheader && linecount >= 5;
00274 }
00275 class MailInputStream::Private {
00276 public:
00277     MailInputStream* const m;
00278     int64_t nextLineStartPosition;
00279     // variables that record the current read state
00280     int32_t entrynumber;
00281     int maxlinesize;
00282     const char* linestart;
00283     const char* lineend;
00284 
00285     StringTerminatedSubStream* substream;
00286     std::string m_contenttransferencoding;
00287     std::string m_contentdisposition;
00288 
00289     std::stack<std::string> boundary;
00290 
00291     HeaderDecoder decoder;
00292 
00293     void readHeaderLine();
00294     void readHeader();
00295     void scanBody();
00296     void handleHeaderLine();
00297     bool handleBodyLine();
00298     bool lineIsEndOfBlock();
00299     bool checkHeaderLine() const;
00300     void clearHeaders();
00301     void ensureFileName();
00302     std::string value(const char* n, const std::string& headerline) const;
00303 
00304     Private(MailInputStream* mail);
00305     ~Private();
00306 };
00307 MailInputStream::Private::Private(MailInputStream* mail) :m(mail) {
00308     substream = 0;
00309     entrynumber = 0;
00310     nextLineStartPosition = 0;
00311 }
00312 MailInputStream::Private::~Private() {
00313     if (substream && substream != m->m_entrystream) {
00314         delete substream;
00315     }
00316 }
00317 MailInputStream::MailInputStream(InputStream* input)
00318         : SubStreamProvider(input), p(new Private(this)) {
00319     // parse the header and store the imporant header fields
00320     p->readHeader();
00321     if (m_status != Ok) {
00322         fprintf(stderr, "no valid header\n");
00323         return;
00324     }
00325 }
00326 MailInputStream::~MailInputStream() {
00327     delete p;
00328 }
00337 void
00338 MailInputStream::Private::readHeaderLine() {
00339     // state: 0 -> ok, 1 -> '\r', 2 -> '\n', 3 -> '\r\n'
00340     char state = 0;
00341     int32_t nread;
00342     int32_t linepos = 0;
00343     bool completeLine = false;
00344     char c = 0;
00345 
00346     m->m_input->reset(nextLineStartPosition);
00347     do {
00348         nread = m->m_input->read(linestart, linepos+1, maxlinesize);
00349         if (nread < linepos+1) {
00350             completeLine = true;
00351             lineend = linestart + nread;
00352             m->m_status = Eof;
00353             return;
00354         }
00355         m->m_input->reset(nextLineStartPosition);
00356         if (m->m_input->status() == Error) {
00357             m->m_status = Error;
00358             m->m_error = m->m_input->error();
00359             return;
00360         } else if (linepos >= maxlinesize) {
00361             // error line is too long
00362             m->m_status = Error;
00363             ostringstream out;
00364             out << "mail header line is too long: more than " << linepos
00365                 << " bytes.";
00366             m->m_error = out.str();
00367             return;
00368         } else {
00369             while (linepos < nread) {
00370                 c = linestart[linepos];
00371                 if (state == 0) {
00372                     if (c == '\r') {
00373                         state = 1;
00374                     } else if (c == '\n') {
00375                         state = 2;
00376                     }
00377                 } else if (state == 1) { // '\r'
00378                     if (c == '\n') {
00379                         state = 3;
00380                     } else if (c == '\r' || !isspace(c)) { // end
00381                         completeLine = true;
00382                         lineend = linestart + linepos - 1;
00383                         break;
00384                     } else {
00385                         state = 0;
00386                     }
00387                 } else if (state == 2) { // '\n'
00388                     if (c == '\n' || !isspace(c)) { // end
00389                         completeLine = true;
00390                         lineend = linestart + linepos - 1;
00391                         break;
00392                     } else {
00393                         state = 0;
00394                     }
00395                 } else { // state == 3   '\r\n'
00396                     if (c == '\r' || linepos == 2 || !isspace(c)) { // end
00397                         completeLine = true;
00398                         lineend = linestart + linepos - 2;
00399                         break;
00400                     } else {
00401                         state = 0;
00402                     }
00403                 }
00404                 linepos++;
00405             }
00406         }
00407     } while (!completeLine);
00408     nextLineStartPosition += linepos;
00409 }
00410 string
00411 MailInputStream::Private::value(const char* n, const string& headerline) const {
00412     size_t nl = strlen(n);
00413     string value;
00414     // get the value
00415     const char* hl = headerline.c_str();
00416     const char* v = strcasestr(hl, n);
00417     if (v == 0) {
00418         // so far we just scan for a value attribute
00419         return value;
00420     }
00421     v += nl;
00422     v += strspn(v, "= \n\r");
00423     const char* vend = strchr(v, ';');
00424     if (vend == 0) {
00425         vend = hl + headerline.length();
00426     }
00427     if (*v == '"' && vend-v > 2) {
00428         value.assign(v+1, vend-v-2);
00429     } else {
00430         value.assign(v, vend-v);
00431     }
00432     return value;
00433 }
00434 void
00435 MailInputStream::Private::readHeader() {
00436     maxlinesize = 1024*1024;
00437 
00438     readHeaderLine();
00439     while (m->m_status == Ok && linestart != lineend) {
00440         handleHeaderLine();
00441         readHeaderLine();
00442     }
00443 }
00448 void
00449 MailInputStream::Private::scanBody() {
00450     while (m->m_status == Ok) {
00451         readHeaderLine();
00452         int32_t len = lineend - linestart;
00453         if (len > 2 && strncmp("--", linestart, 2) == 0) {
00454             int32_t blen = boundary.top().length();
00455             if (len == blen + 4 && strncmp(linestart + 2 + blen, "--", 2) == 0
00456                     && strncmp(linestart + 2, boundary.top().c_str(), blen)
00457                         == 0) {
00458                 // check if this is the end of a multipart
00459                 boundary.pop();
00460                 if (boundary.size() == 0) {
00461                     m->m_status = Eof;
00462                 }
00463             } else if (len == blen + 2
00464                     && strncmp(linestart + 2, boundary.top().c_str(), blen)
00465                         == 0) {
00466                 if (handleBodyLine()) {
00467                     break;
00468                 }
00469             }
00470         }
00471     }
00472 }
00473 void
00474 MailInputStream::Private::handleHeaderLine() {
00475     static const char* subject = "Subject:";
00476     static const char* contenttype = "Content-Type:";
00477     static const char* to = "To:";
00478     static const char* from = "From:";
00479     static const char* cc = "Cc:";
00480     static const char* bcc = "Bcc:";
00481     static const char* messageid = "Message-ID:";
00482     static const char* inreplyto = "In-Reply-To:";
00483     static const char* references = "References:";
00484     static const char* contenttransferencoding = "Content-Transfer-Encoding:";
00485     static const char* contentdisposition = "Content-Disposition:";
00486     int32_t len = lineend - linestart;
00487     if (len < 2) return;
00488     if (len < 8) {
00489         return;
00490     } else if (strncasecmp(linestart, subject, 8) == 0) {
00491         int32_t offset = 8;
00492         while (offset < len && isspace(linestart[offset])) offset++;
00493         m->m_subject = decoder.decodedHeaderValue(linestart+offset, len-offset);
00494     } else if (strncasecmp(linestart, to, 3) == 0) {
00495         int32_t offset = 3;
00496         // FIXME: should split for ','
00497         while (offset < len && isspace(linestart[offset])) offset++;
00498         m->m_to = decoder.decodedHeaderValue(linestart+offset, len-offset);
00499     } else if (strncasecmp(linestart, from, 5) == 0) {
00500         int32_t offset = 5;
00501         while (offset < len && isspace(linestart[offset])) offset++;
00502         m->m_from = decoder.decodedHeaderValue(linestart+offset, len-offset);
00503     } else if (strncasecmp(linestart, cc, 3) == 0) {
00504         int32_t offset = 3;
00505         while (offset < len && isspace(linestart[offset])) offset++;
00506         m->m_cc = decoder.decodedHeaderValue(linestart+offset, len-offset);
00507     } else if (strncasecmp(linestart, bcc, 4) == 0) {
00508         int32_t offset = 4;
00509         while (offset < len && isspace(linestart[offset])) offset++;
00510         m->m_bcc = decoder.decodedHeaderValue(linestart+offset, len-offset);
00511     } else if (strncasecmp(linestart, messageid, 11) == 0) {
00512         int32_t offset = 11;
00513         while (offset < len && isspace(linestart[offset])) offset++;
00514         m->m_messageid = decoder.decodedHeaderValue(linestart+offset, len-offset);
00515     } else if (strncasecmp(linestart, inreplyto, 12) == 0) {
00516         int32_t offset = 12;
00517         while (offset < len && isspace(linestart[offset])) offset++;
00518         m->m_inreplyto = decoder.decodedHeaderValue(linestart+offset, len-offset);
00519     } else if (strncasecmp(linestart, references, 11) == 0) {
00520         int32_t offset = 11;
00521         while (offset < len && isspace(linestart[offset])) offset++;
00522         m->m_references = decoder.decodedHeaderValue(linestart+offset, len-offset);
00523     } else if (strncasecmp(linestart, contenttype, 13) == 0) {
00524         int32_t offset = 13;
00525         while (offset < len && isspace(linestart[offset])) offset++;
00526         m->m_contenttype = std::string(linestart+offset, len-offset);
00527         // get the boundary
00528         string b = value("boundary", m->m_contenttype);
00529         if (b.size()) {
00530             boundary.push(b);
00531         }
00532     } else if (strncasecmp(linestart, contenttransferencoding, 26) == 0) {
00533         m_contenttransferencoding = std::string(linestart, len);
00534     } else if (strncasecmp(linestart, contentdisposition, 20) == 0) {
00535         m_contentdisposition = std::string(linestart, len);
00536     }
00537 }
00538 bool
00539 MailInputStream::Private::checkHeaderLine() const {
00540     assert(lineend - linestart >= 0);
00541     bool validheader = linestart < lineend;
00542     if (validheader) {
00543         const char* colpos = linestart;
00544         while (*colpos != ':' && ++colpos != lineend) {}
00545         validheader = colpos != lineend || isblank(*linestart);
00546     }
00547     return validheader;
00548 }
00552 bool
00553 MailInputStream::Private::handleBodyLine() {
00554     clearHeaders();
00555 
00556     // start of new block
00557     // read part header
00558     bool validheader;
00559     size_t n = boundary.size();
00560     do {
00561         readHeaderLine();
00562         validheader = m->m_status == Ok && checkHeaderLine();
00563         if (validheader) {
00564             handleHeaderLine();
00565         }
00566     } while (m->m_status == Ok && validheader);
00567     if (boundary.size() > n) {
00568         return false;
00569     }
00570     readHeaderLine();
00571     if (m->m_status != Ok) {
00572         return false;
00573     }
00574 
00575     // get the filename
00576     m->m_entryinfo.filename = value("filename", m_contentdisposition);
00577     if (m->m_entryinfo.filename.length() == 0) {
00578         m->m_entryinfo.filename = value("name", m->m_contenttype);
00579     }
00580 
00581     // create a stream that's limited to the content
00582     substream = new StringTerminatedSubStream(m->m_input, "--"+boundary.top());
00583     // set a reasonable buffer size
00584     if (strcasestr(m_contenttransferencoding.c_str(), "base64")) {
00585         m->m_entrystream = new Base64InputStream(substream);
00586     } else {
00587         m->m_entrystream = substream;
00588     }
00589     return true;
00590 }
00596 void
00597 MailInputStream::Private::ensureFileName() {
00598     entrynumber++;
00599     if (m->m_entryinfo.filename.length() == 0) {
00600         ostringstream o;
00601         o << entrynumber;
00602         m->m_entryinfo.filename = o.str();
00603     }
00604     m->m_entryinfo.type = EntryInfo::File;
00605 }
00606 InputStream*
00607 MailInputStream::nextEntry() {
00608     if (m_status != Ok) return 0;
00609     // if the mail does not consist of multiple parts, we give a pointer to
00610     // the input stream
00611     if (p->boundary.size() == 0) {
00612         // signal eof because we only return eof once
00613         m_status = Eof;
00614         m_entrystream = new SubInputStream(m_input);
00615         m_entryinfo.filename = "body";
00616         return m_entrystream;
00617     }
00618     // read anything that's left over in the previous stream
00619     if (p->substream) {
00620         const char* dummy;
00621         while (p->substream->status() == Ok) {
00622             p->substream->read(dummy, 1, 0);
00623         }
00624         if (p->substream->status() == Error) {
00625             m_status = Error;
00626         } else {
00627             p->nextLineStartPosition = p->substream->offset()
00628                 + p->substream->size();
00629         }
00630         if (p->substream && p->substream != m_entrystream) {
00631             delete p->substream;
00632         }
00633         p->substream = 0;
00634         delete m_entrystream;
00635         m_entrystream = 0;
00636 
00637         if (m_status != Ok) {
00638             return 0;
00639         }
00640     }
00641     p->scanBody();
00642 
00643     if (m_entrystream == 0) {
00644         m_status = Eof;
00645     }
00646     p->ensureFileName();
00647     return m_entrystream;
00648 }
00649 void
00650 MailInputStream::Private::clearHeaders() {
00651     m->m_contenttype.resize(0);
00652     m_contenttransferencoding.resize(0);
00653     m_contentdisposition.resize(0);
00654 }

strigi/src/streams

Skip menu "strigi/src/streams"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members

KDE Support

Skip menu "KDE Support"
  • akonadi
  • Decibel
  • grantlee
  • kdewin
  • phonon
  •     Backend
  • polkit-qt
  • qca
  • qimageblitz
  • soprano
  • strigi
  •     searchclient
  •     streamanalyzer
  •     streams
Generated for KDE Support by doxygen 1.5.9-20090814
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal