kioslaves

rfcdecoder.cc

00001 /**********************************************************************
00002  *
00003  *   rfcdecoder.cc  - handler for various rfc/mime encodings
00004  *   Copyright (C) 2000 s.carstens@gmx.de
00005  *
00006  *   This program is free software; you can redistribute it and/or modify
00007  *   it under the terms of the GNU General Public License as published by
00008  *   the Free Software Foundation; either version 2 of the License, or
00009  *   (at your option) any later version.
00010  *
00011  *   This program is distributed in the hope that it will be useful,
00012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  *   GNU General Public License for more details.
00015  *
00016  *   You should have received a copy of the GNU General Public License
00017  *   along with this program; if not, write to the Free Software
00018  *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
00019  *
00020  *   Send comments and bug fixes to s.carstens@gmx.de
00021  *
00022  *********************************************************************/
00023 #include "rfcdecoder.h"
00024 
00025 #include <ctype.h>
00026 #include <sys/types.h>
00027 
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 
00031 #include <qtextcodec.h>
00032 #include <qbuffer.h>
00033 #include <qregexp.h>
00034 #include <kmdcodec.h>
00035 
00036 // This part taken from rfc 2192 IMAP URL Scheme. C. Newman. September 1997.
00037 // adapted to QT-Toolkit by Sven Carstens <s.carstens@gmx.de> 2000
00038 
/* Alphabet of the modified base64 used for IMAP mailbox names: identical to
 * standard base64 except that ',' takes the place of '/'.  The array also
 * holds the string literal's terminating NUL, so sizeof() is 65. */
static unsigned char base64chars[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789+,";

/* decoding-table marker for "not a base64 character" */
#define UNDEFINED 64
#define MAXLINE  76

/* UTF16 definitions: surrogate ranges and the constants used to combine
 * or split a surrogate pair */
#define UTF16MASK       0x03FFUL
#define UTF16SHIFT      10
#define UTF16BASE       0x10000UL
#define UTF16HIGHSTART  0xD800UL
#define UTF16HIGHEND    0xDBFFUL
#define UTF16LOSTART    0xDC00UL
#define UTF16LOEND      0xDFFFUL
00052 
00053 /* Convert an IMAP mailbox to a Unicode path
00054  */
00055 QString rfcDecoder::fromIMAP (const QString & inSrc)
00056 {
00057   unsigned char c, i, bitcount;
00058   unsigned long ucs4, utf16, bitbuf;
00059   unsigned char base64[256], utf8[6];
00060   unsigned long srcPtr = 0;
00061   QCString dst;
00062   QCString src = inSrc.ascii ();
00063   uint srcLen = inSrc.length();
00064 
00065   /* initialize modified base64 decoding table */
00066   memset (base64, UNDEFINED, sizeof (base64));
00067   for (i = 0; i < sizeof (base64chars); ++i)
00068   {
00069     base64[(int)base64chars[i]] = i;
00070   }
00071 
00072   /* loop until end of string */
00073   while (srcPtr < srcLen)
00074   {
00075     c = src[srcPtr++];
00076     /* deal with literal characters and &- */
00077     if (c != '&' || src[srcPtr] == '-')
00078     {
00079       /* encode literally */
00080       dst += c;
00081       /* skip over the '-' if this is an &- sequence */
00082       if (c == '&')
00083         srcPtr++;
00084     }
00085     else
00086     {
00087       /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
00088       bitbuf = 0;
00089       bitcount = 0;
00090       ucs4 = 0;
00091       while ((c = base64[(unsigned char) src[srcPtr]]) != UNDEFINED)
00092       {
00093         ++srcPtr;
00094         bitbuf = (bitbuf << 6) | c;
00095         bitcount += 6;
00096         /* enough bits for a UTF-16 character? */
00097         if (bitcount >= 16)
00098         {
00099           bitcount -= 16;
00100           utf16 = (bitcount ? bitbuf >> bitcount : bitbuf) & 0xffff;
00101           /* convert UTF16 to UCS4 */
00102           if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND)
00103           {
00104             ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
00105             continue;
00106           }
00107           else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND)
00108           {
00109             ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
00110           }
00111           else
00112           {
00113             ucs4 = utf16;
00114           }
00115           /* convert UTF-16 range of UCS4 to UTF-8 */
00116           if (ucs4 <= 0x7fUL)
00117           {
00118             utf8[0] = ucs4;
00119             i = 1;
00120           }
00121           else if (ucs4 <= 0x7ffUL)
00122           {
00123             utf8[0] = 0xc0 | (ucs4 >> 6);
00124             utf8[1] = 0x80 | (ucs4 & 0x3f);
00125             i = 2;
00126           }
00127           else if (ucs4 <= 0xffffUL)
00128           {
00129             utf8[0] = 0xe0 | (ucs4 >> 12);
00130             utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
00131             utf8[2] = 0x80 | (ucs4 & 0x3f);
00132             i = 3;
00133           }
00134           else
00135           {
00136             utf8[0] = 0xf0 | (ucs4 >> 18);
00137             utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
00138             utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
00139             utf8[3] = 0x80 | (ucs4 & 0x3f);
00140             i = 4;
00141           }
00142           /* copy it */
00143           for (c = 0; c < i; ++c)
00144           {
00145             dst += utf8[c];
00146           }
00147         }
00148       }
00149       /* skip over trailing '-' in modified UTF-7 encoding */
00150       if (src[srcPtr] == '-')
00151         ++srcPtr;
00152     }
00153   }
00154   return QString::fromUtf8 (dst.data ());
00155 }
00156 
00157 /* replace " with \" and \ with \\ " and \ characters */
00158 QString rfcDecoder::quoteIMAP(const QString &src)
00159 {
00160   uint len = src.length();
00161   QString result;
00162   result.reserve(2 * len);
00163   for (unsigned int i = 0; i < len; i++)
00164   {
00165     if (src[i] == '"' || src[i] == '\\')
00166       result += '\\';
00167     result += src[i];
00168   }
00169   //result.squeeze(); - unnecessary and slow
00170   return result;
00171 }
00172 
00173 /* Convert Unicode path to modified UTF-7 IMAP mailbox
00174  */
00175 QString rfcDecoder::toIMAP (const QString & inSrc)
00176 {
00177   unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag;
00178   unsigned long ucs4, bitbuf;
00179   QCString src = inSrc.utf8 ();
00180   QString dst;
00181 
00182   ulong srcPtr = 0;
00183   utf7mode = 0;
00184   utf8total = 0;
00185   bitstogo = 0;
00186   utf8pos = 0;
00187   bitbuf = 0;
00188   ucs4 = 0;
00189   while (srcPtr < src.length ())
00190   {
00191     c = (unsigned char) src[srcPtr++];
00192     /* normal character? */
00193     if (c >= ' ' && c <= '~')
00194     {
00195       /* switch out of UTF-7 mode */
00196       if (utf7mode)
00197       {
00198         if (bitstogo)
00199         {
00200           dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
00201           bitstogo = 0;
00202         }
00203         dst += '-';
00204         utf7mode = 0;
00205       }
00206       dst += c;
00207       /* encode '&' as '&-' */
00208       if (c == '&')
00209       {
00210         dst += '-';
00211       }
00212       continue;
00213     }
00214     /* switch to UTF-7 mode */
00215     if (!utf7mode)
00216     {
00217       dst += '&';
00218       utf7mode = 1;
00219     }
00220     /* Encode US-ASCII characters as themselves */
00221     if (c < 0x80)
00222     {
00223       ucs4 = c;
00224       utf8total = 1;
00225     }
00226     else if (utf8total)
00227     {
00228       /* save UTF8 bits into UCS4 */
00229       ucs4 = (ucs4 << 6) | (c & 0x3FUL);
00230       if (++utf8pos < utf8total)
00231       {
00232         continue;
00233       }
00234     }
00235     else
00236     {
00237       utf8pos = 1;
00238       if (c < 0xE0)
00239       {
00240         utf8total = 2;
00241         ucs4 = c & 0x1F;
00242       }
00243       else if (c < 0xF0)
00244       {
00245         utf8total = 3;
00246         ucs4 = c & 0x0F;
00247       }
00248       else
00249       {
00250         /* NOTE: can't convert UTF8 sequences longer than 4 */
00251         utf8total = 4;
00252         ucs4 = c & 0x03;
00253       }
00254       continue;
00255     }
00256     /* loop to split ucs4 into two utf16 chars if necessary */
00257     utf8total = 0;
00258     do
00259     {
00260       if (ucs4 >= UTF16BASE)
00261       {
00262         ucs4 -= UTF16BASE;
00263         bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART);
00264         ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
00265         utf16flag = 1;
00266       }
00267       else
00268       {
00269         bitbuf = (bitbuf << 16) | ucs4;
00270         utf16flag = 0;
00271       }
00272       bitstogo += 16;
00273       /* spew out base64 */
00274       while (bitstogo >= 6)
00275       {
00276         bitstogo -= 6;
00277         dst += base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf) & 0x3F];
00278       }
00279     }
00280     while (utf16flag);
00281   }
00282   /* if in UTF-7 mode, finish in ASCII */
00283   if (utf7mode)
00284   {
00285     if (bitstogo)
00286     {
00287       dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
00288     }
00289     dst += '-';
00290   }
00291   return quoteIMAP(dst);
00292 }
00293 
00294 //-----------------------------------------------------------------------------
00295 QString rfcDecoder::decodeQuoting(const QString &aStr)
00296 {
00297   QString result;
00298   unsigned int strLength(aStr.length());
00299   for (unsigned int i = 0; i < strLength ; i++)
00300   {
00301     if (aStr[i] == "\\") i++;
00302     result += aStr[i];
00303   }
00304   return result;
00305 }
00306 
00307 //-----------------------------------------------------------------------------
00308 QTextCodec *
00309 rfcDecoder::codecForName (const QString & _str)
00310 {
00311   if (_str.isEmpty ())
00312     return NULL;
00313   return QTextCodec::codecForName (_str.lower ().
00314                                    replace ("windows", "cp").latin1 ());
00315 }
00316 
00317 //-----------------------------------------------------------------------------
00318 const QString
00319 rfcDecoder::decodeRFC2047String (const QString & _str)
00320 {
00321   QString throw_away;
00322 
00323   return decodeRFC2047String (_str, throw_away);
00324 }
00325 
00326 //-----------------------------------------------------------------------------
00327 const QString
00328 rfcDecoder::decodeRFC2047String (const QString & _str, QString & charset)
00329 {
00330   QString throw_away;
00331 
00332   return decodeRFC2047String (_str, charset, throw_away);
00333 }
00334 
00335 //-----------------------------------------------------------------------------
00336 const QString
00337 rfcDecoder::decodeRFC2047String (const QString & _str, QString & charset,
00338                                  QString & language)
00339 {
00340   //do we have a rfc string
00341   if (_str.find("=?") < 0)
00342     return _str;
00343 
00344   QCString aStr = _str.ascii ();  // QString.length() means Unicode chars
00345   QCString result;
00346   char *pos, *beg, *end, *mid = NULL;
00347   QCString str;
00348   char encoding = 0, ch;
00349   bool valid;
00350   const int maxLen = 200;
00351   int i;
00352 
00353 //  result.truncate(aStr.length());
00354   for (pos = aStr.data (); *pos; pos++)
00355   {
00356     if (pos[0] != '=' || pos[1] != '?')
00357     {
00358       result += *pos;
00359       continue;
00360     }
00361     beg = pos + 2;
00362     end = beg;
00363     valid = TRUE;
00364     // parse charset name
00365     for (i = 2, pos += 2;
00366          i < maxLen && (*pos != '?' && (ispunct (*pos) || isalnum (*pos)));
00367          i++)
00368       pos++;
00369     if (*pos != '?' || i < 4 || i >= maxLen)
00370       valid = FALSE;
00371     else
00372     {
00373       charset = QCString (beg, i - 1);  // -2 + 1 for the zero
00374       int pt = charset.findRev('*');
00375       if (pt != -1)
00376       {
00377         // save language for later usage
00378         language = charset.right (charset.length () - pt - 1);
00379 
00380         // tie off language as defined in rfc2047
00381         charset.truncate(pt);
00382       }
00383       // get encoding and check delimiting question marks
00384       encoding = toupper (pos[1]);
00385       if (pos[2] != '?'
00386           || (encoding != 'Q' && encoding != 'B' && encoding != 'q'
00387               && encoding != 'b'))
00388         valid = FALSE;
00389       pos += 3;
00390       i += 3;
00391 //    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - charset " << charset << " - language " << language << " - '" << pos << "'" << endl;
00392     }
00393     if (valid)
00394     {
00395       mid = pos;
00396       // search for end of encoded part
00397       while (i < maxLen && *pos && !(*pos == '?' && *(pos + 1) == '='))
00398       {
00399         i++;
00400         pos++;
00401       }
00402       end = pos + 2;            //end now points to the first char after the encoded string
00403       if (i >= maxLen || !*pos)
00404         valid = FALSE;
00405     }
00406     if (valid)
00407     {
00408       ch = *pos;
00409       *pos = '\0';
00410       str = QCString (mid).left ((int) (mid - pos - 1));
00411       if (encoding == 'Q')
00412       {
00413         // decode quoted printable text
00414         for (i = str.length () - 1; i >= 0; i--)
00415           if (str[i] == '_')
00416             str[i] = ' ';
00417 //    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - before QP '" << str << "'" << endl;
00418 
00419         str = KCodecs::quotedPrintableDecode(str);
00420 //    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - after QP '" << str << "'" << endl;
00421       }
00422       else
00423       {
00424         // decode base64 text
00425         str = KCodecs::base64Decode(str);
00426       }
00427       *pos = ch;
00428       int len = str.length();
00429       for (i = 0; i < len; i++)
00430         result += (char) (QChar) str[i];
00431 
00432       pos = end - 1;
00433     }
00434     else
00435     {
00436 //    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - invalid" << endl;
00437       //result += "=?";
00438       //pos = beg -1; // because pos gets increased shortly afterwards
00439       pos = beg - 2;
00440       result += *pos++;
00441       result += *pos;
00442     }
00443   }
00444   if (!charset.isEmpty ())
00445   {
00446     QTextCodec *aCodec = codecForName (charset.ascii ());
00447     if (aCodec)
00448     {
00449 //    kdDebug(7116) << "Codec is " << aCodec->name() << endl;
00450       return aCodec->toUnicode (result);
00451     }
00452   }
00453   return result;
00454 }
00455 
00456 
00457 //-----------------------------------------------------------------------------
00458 const char especials[17] = "()<>@,;:\"/[]?.= ";
00459 
00460 const QString
00461 rfcDecoder::encodeRFC2047String (const QString & _str)
00462 {
00463   if (_str.isEmpty ())
00464     return _str;
00465   const signed char *latin = reinterpret_cast<const signed char *>(_str.latin1()), *l, *start, *stop;
00466   char hexcode;
00467   int numQuotes, i;
00468   int rptr = 0;
00469   // My stats show this number results in 12 resize() out of 73,000
00470   int resultLen = 3 * _str.length() / 2;
00471   QCString result(resultLen);
00472   
00473   while (*latin)
00474   {
00475     l = latin;
00476     start = latin;
00477     while (*l)
00478     {
00479       if (*l == 32)
00480         start = l + 1;
00481       if (*l < 0)
00482         break;
00483       l++;
00484     }
00485     if (*l)
00486     {
00487       numQuotes = 1;
00488       while (*l)
00489       {
00490         /* The encoded word must be limited to 75 character */
00491         for (i = 0; i < 16; i++)
00492           if (*l == especials[i])
00493             numQuotes++;
00494         if (*l < 0)
00495           numQuotes++;
00496         /* Stop after 58 = 75 - 17 characters or at "<user@host..." */
00497         if (l - start + 2 * numQuotes >= 58 || *l == 60)
00498           break;
00499         l++;
00500       }
00501       if (*l)
00502       {
00503         stop = l - 1;
00504         while (stop >= start && *stop != 32)
00505           stop--;
00506         if (stop <= start)
00507           stop = l;
00508       }
00509       else
00510         stop = l;
00511       if (resultLen - rptr - 1 <= start -  latin + 1 + 16 /* =?iso-88... */) {
00512         resultLen += (start - latin + 1) * 2 + 20; // more space
00513     result.resize(resultLen);
00514       }
00515       while (latin < start)
00516       {
00517         result[rptr++] = *latin;
00518         latin++;
00519       }
00520       strcpy(&result[rptr], "=?iso-8859-1?q?"); rptr += 15;
00521       if (resultLen - rptr - 1 <= 3*(stop - latin + 1)) {
00522         resultLen += (stop - latin + 1) * 4 + 20; // more space
00523     result.resize(resultLen);
00524       }
00525       while (latin < stop) // can add up to 3 chars/iteration
00526       {
00527         numQuotes = 0;
00528         for (i = 0; i < 16; i++)
00529           if (*latin == especials[i])
00530             numQuotes = 1;
00531         if (*latin < 0)
00532           numQuotes = 1;
00533         if (numQuotes)
00534         {
00535           result[rptr++] = '=';
00536           hexcode = ((*latin & 0xF0) >> 4) + 48;
00537           if (hexcode >= 58)
00538             hexcode += 7;
00539           result[rptr++] = hexcode;
00540           hexcode = (*latin & 0x0F) + 48;
00541           if (hexcode >= 58)
00542             hexcode += 7;
00543           result[rptr++] = hexcode;
00544         }
00545         else
00546         {
00547           result[rptr++] = *latin;
00548         }
00549         latin++;
00550       }
00551       result[rptr++] = '?';
00552       result[rptr++] = '=';
00553     }
00554     else
00555     {
00556       while (*latin)
00557       {
00558         if (rptr == resultLen - 1) {
00559           resultLen += 30;
00560           result.resize(resultLen);
00561         }
00562         result[rptr++] = *latin;
00563         latin++;
00564       }
00565     }
00566   }
00567   result[rptr] = 0;
00568   //free (latinStart);
00569   return result;
00570 }
00571 
00572 
00573 //-----------------------------------------------------------------------------
00574 const QString
00575 rfcDecoder::encodeRFC2231String (const QString & _str)
00576 {
00577   if (_str.isEmpty ())
00578     return _str;
00579   signed char *latin = (signed char *) calloc (1, _str.length () + 1);
00580   char *latin_us = (char *) latin;
00581   strcpy (latin_us, _str.latin1 ());
00582   signed char *l = latin;
00583   char hexcode;
00584   int i;
00585   bool quote;
00586   while (*l)
00587   {
00588     if (*l < 0)
00589       break;
00590     l++;
00591   }
00592   if (!*l) {
00593     free(latin);
00594     return _str.ascii ();
00595   }
00596   QCString result;
00597   l = latin;
00598   while (*l)
00599   {
00600     quote = *l < 0;
00601     for (i = 0; i < 16; i++)
00602       if (*l == especials[i])
00603         quote = true;
00604     if (quote)
00605     {
00606       result += "%";
00607       hexcode = ((*l & 0xF0) >> 4) + 48;
00608       if (hexcode >= 58)
00609         hexcode += 7;
00610       result += hexcode;
00611       hexcode = (*l & 0x0F) + 48;
00612       if (hexcode >= 58)
00613         hexcode += 7;
00614       result += hexcode;
00615     }
00616     else
00617     {
00618       result += *l;
00619     }
00620     l++;
00621   }
00622   free (latin);
00623   return result;
00624 }
00625 
00626 
00627 //-----------------------------------------------------------------------------
00628 const QString
00629 rfcDecoder::decodeRFC2231String (const QString & _str)
00630 {
00631   int p = _str.find ('\'');
00632 
00633   //see if it is an rfc string
00634   if (p < 0)
00635     return _str;
00636 
00637   int l = _str.findRev ('\'');
00638 
00639   //second is language
00640   if (p >= l)
00641     return _str;
00642 
00643   //first is charset or empty
00644   QString charset = _str.left (p);
00645   QString st = _str.mid (l + 1);
00646   QString language = _str.mid (p + 1, l - p - 1);
00647 
00648   //kdDebug(7116) << "Charset: " << charset << " Language: " << language << endl;
00649 
00650   char ch, ch2;
00651   p = 0;
00652   while (p < (int) st.length ())
00653   {
00654     if (st.at (p) == 37)
00655     {
00656       ch = st.at (p + 1).latin1 () - 48;
00657       if (ch > 16)
00658         ch -= 7;
00659       ch2 = st.at (p + 2).latin1 () - 48;
00660       if (ch2 > 16)
00661         ch2 -= 7;
00662       st.at (p) = ch * 16 + ch2;
00663       st.remove (p + 1, 2);
00664     }
00665     p++;
00666   }
00667   return st;
00668 }
kioslaves

rfcdecoder.cc

kioslaves

API Reference