parsinghelpers.cpp Source File

KIOSlave

 /* This file is part of the KDE libraries
     Copyright (C) 2008 Andreas Hartmetz <ahartmetz@gmail.com>
     Copyright (C) 2010,2011 Rolf Eike Beer <kde@opensource.sf-tec.de>
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Library General Public
     License as published by the Free Software Foundation; either
     version 2 of the License, or (at your option) any later version.
 
     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Library General Public License for more details.
 
     You should have received a copy of the GNU Library General Public License
     along with this library; see the file COPYING.LIB.  If not, write to
     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     Boston, MA 02110-1301, USA.
 */
 
 #include <ctype.h>
 
 #include <QDir>
 #include <QMap>
 #include <QTextCodec>
 #include <QUrl>
 
 #include <kcodecs.h>
 #include <kdebug.h>
 
 // Advance *pos beyond spaces / tabs
 static void skipSpace(const char input[], int *pos, int end)
 {
     int idx = *pos;
     while (idx < end && (input[idx] == ' ' || input[idx] == '\t')) {
         idx++;
     }
     *pos = idx;
     return;
 }
 
 // Advance *pos to start of next line while being forgiving about line endings.
 // Return false if the end of the header has been reached, true otherwise.
 static bool nextLine(const char input[], int *pos, int end)
 {
     int idx = *pos;
     while (idx < end && input[idx] != '\r' && input[idx] != '\n') {
         idx++;
     }
     int rCount = 0;
     int nCount = 0;
     while (idx < end && qMax(rCount, nCount) < 2 && (input[idx] == '\r' || input[idx] == '\n')) {
         input[idx] == '\r' ? rCount++ : nCount++;
         idx++;
     }
     if (idx < end && qMax(rCount, nCount) == 2 && qMin(rCount, nCount) == 1) {
         // if just one of the others is missing eat it too.
         // this ensures that conforming headers using the proper
         // \r\n sequence (and also \n\r) will be parsed correctly.
         if ((rCount == 1 && input[idx] == '\r') || (nCount == 1 && input[idx] == '\n')) {
             idx++;
         }
     }
 
     *pos = idx;
     return idx < end && rCount < 2 && nCount < 2;
 }
 
 // QByteArray::fromPercentEncoding() does not notify us about encoding errors so we need
 // to check here if this is valid at all.
 static bool isValidPercentEncoding(const QByteArray &data)
 {
     int i = 0;
     const int last = data.length() - 1;
     const char *d = data.constData();
 
     while ( (i = data.indexOf('%', i)) != -1) {
         if ( i >= last - 2 )
             return false;
         if ( ! isxdigit(d[i + 1]) )
             return false;
         if ( ! isxdigit(d[i + 2]) )
             return false;
         i++;
     }
 
     return true;
 }
 
 QByteArray TokenIterator::next()
 {
     QPair<int, int> token = m_tokens[m_currentToken++];
     //fromRawData brings some speed advantage but also the requirement to keep the text buffer
     //around. this together with implicit sharing (you don't know where copies end up)
     //is dangerous!
     //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first);
     return QByteArray(&m_buffer[token.first], token.second - token.first);
 }
 
 QByteArray TokenIterator::current() const
 {
     QPair<int, int> token = m_tokens[m_currentToken - 1];
     //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first);
     return QByteArray(&m_buffer[token.first], token.second - token.first);
 }
 
 QList<QByteArray> TokenIterator::all() const
 {
     QList<QByteArray> ret;
     for (int i = 0; i < m_tokens.count(); i++) {
         QPair<int, int> token = m_tokens[i];
         ret.append(QByteArray(&m_buffer[token.first], token.second - token.first));
     }
     return ret;
 }
 
 
 HeaderTokenizer::HeaderTokenizer(char *buffer)
     : m_buffer(buffer)
 {
     // add information about available headers and whether they have one or multiple,
     // comma-separated values.
 
     //The following response header fields are from RFC 2616 unless otherwise specified.
     //Hint: search the web for e.g. 'http "accept-ranges header"' to find information about
     //a header field.
     static const HeaderFieldTemplate headerFieldTemplates[] = {
         {"accept-ranges", false},
         {"age", false},
         {"cache-control", true},
         {"connection", true},
         {"content-disposition", false}, //is multi-valued in a way, but with ";" separator!
         {"content-encoding", true},
         {"content-language", true},
         {"content-length", false},
         {"content-location", false},
         {"content-md5", false},
         {"content-type", false},
         {"date", false},
         {"dav", true}, //RFC 2518
         {"etag", false},
         {"expires", false},
         {"keep-alive", true}, //RFC 2068
         {"last-modified", false},
         {"link", false}, //RFC 2068, multi-valued with ";" separator
         {"location", false},
         {"p3p", true}, // http://www.w3.org/TR/P3P/
         {"pragma", true},
         {"proxy-authenticate", false}, //complicated multi-valuedness: quoted commas don't separate
                                        //multiple values. we handle this at a higher level.
         {"proxy-connection", true}, //inofficial but well-known; to avoid misunderstandings
                                     //when using "connection" when talking to a proxy.
         {"refresh", false}, //not sure, only found some mailing list posts mentioning it
         {"set-cookie", false}, //RFC 2109; the multi-valuedness seems to be usually achieved
                                //by sending several instances of this field as opposed to
                                //usually comma-separated lists with maybe multiple instances.
         {"transfer-encoding", true},
         {"upgrade", true},
         {"warning", true},
         {"www-authenticate", false} //see proxy-authenticate
     };
 
     for (uint i = 0; i < sizeof(headerFieldTemplates) / sizeof(HeaderFieldTemplate); i++) {
         const HeaderFieldTemplate &ft = headerFieldTemplates[i];
         insert(QByteArray(ft.name), HeaderField(ft.isMultiValued));
     }
 }
 
 int HeaderTokenizer::tokenize(int begin, int end)
 {
     char *buf = m_buffer;  //keep line length in check :/
     int idx = begin;
     int startIdx = begin; //multi-purpose start of current token
     bool multiValuedEndedWithComma = false; //did the last multi-valued line end with a comma?
     QByteArray headerKey;
     do {
 
         if (buf[idx] == ' ' || buf [idx] == '\t') {
             // line continuation; preserve startIdx except (see below)
             if (headerKey.isEmpty()) {
                 continue;
             }
             // turn CR/LF into spaces for later parsing convenience
             int backIdx = idx - 1;
             while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) {
                 buf[backIdx--] = ' ';
             }
 
             // multiple values, comma-separated: add new value or continue previous?
             if (operator[](headerKey).isMultiValued) {
                 if (multiValuedEndedWithComma) {
                     // start new value; this is almost like no line continuation
                     skipSpace(buf, &idx, end);
                     startIdx = idx;
                 } else {
                     // continue previous value; this is tricky. unit tests to the rescue!
                     if (operator[](headerKey).beginEnd.last().first == startIdx) {
                         // remove entry, it will be re-added because already idx != startIdx
                         operator[](headerKey).beginEnd.removeLast();
                     } else {
                         // no comma, no entry: the prev line was whitespace only - start new value
                         skipSpace(buf, &idx, end);
                         startIdx = idx;
                     }
                 }
             }
 
         } else {
             // new field
             startIdx = idx;
             // also make sure that there is at least one char after the colon
             while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') {
                 buf[idx] = tolower(buf[idx]);
                 idx++;
             }
             if (buf[idx] != ':') {
                 //malformed line: no colon
                 headerKey.clear();
                 continue;
             }
             headerKey = QByteArray(&buf[startIdx], idx - startIdx);
             if (!contains(headerKey)) {
                 //we don't recognize this header line
                 headerKey.clear();
                 continue;
             }
             // skip colon & leading whitespace
             idx++;
             skipSpace(buf, &idx, end);
             startIdx = idx;
         }
 
         // we have the name/key of the field, now parse the value
         if (!operator[](headerKey).isMultiValued) {
 
             // scan to end of line
             while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') {
                 idx++;
             }
             if (!operator[](headerKey).beginEnd.isEmpty()) {
                 // there already is an entry; are we just in a line continuation?
                 if (operator[](headerKey).beginEnd.last().first == startIdx) {
                     // line continuation: delete previous entry and later insert a new, longer one.
                     operator[](headerKey).beginEnd.removeLast();
                 }
             }
             operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx));
 
         } else {
 
             // comma-separated list
             while (true) {
                 //skip one value
                 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') {
                     idx++;
                 }
                 if (idx != startIdx) {
                     operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx));
                 }
                 multiValuedEndedWithComma = buf[idx] == ',';
                 //skip comma(s) and leading whitespace, if any respectively
                 while (idx < end && buf[idx] == ',') {
                     idx++;
                 }
                 skipSpace(buf, &idx, end);
                 //next value or end-of-line / end of header?
                 if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') {
                     break;
                 }
                 //next value
                 startIdx = idx;
             }
         }
     } while (nextLine(buf, &idx, end));
     return idx;
 }
 
 
 TokenIterator HeaderTokenizer::iterator(const char *key) const
 {
     QByteArray keyBa = QByteArray::fromRawData(key, strlen(key));
     if (contains(keyBa)) {
         return TokenIterator(value(keyBa).beginEnd, m_buffer);
     } else {
         return TokenIterator(m_nullTokens, m_buffer);
     }
 }
 
 static void skipLWS(const QString &str, int &pos)
 {
     while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) {
         ++pos;
     }
 }
 
 // Sets of characters that make an unquoted token invalid (checked by specialChar()):
 // typeSpecials is applied to the disposition type, attrSpecials to parameter names
 // and valueSpecials to parameter values. Each set is a suffix of the one before it.
 // keep the common ending, this allows the compiler to join them
 static const char typeSpecials[] =  "{}*'%()<>@,;:\\\"/[]?=";
 static const char attrSpecials[] =     "'%()<>@,;:\\\"/[]?=";
 static const char valueSpecials[] =      "()<>@,;:\\\"/[]?=";
 
 static bool specialChar(const QChar &ch, const char *specials)
 {
     // WORKAROUND: According to RFC 2616, any character other than ascii
     // characters should NOT be allowed in unquoted content-disposition file
     // names. However, since none of the major browsers follow this rule, we do
     // the same thing here and allow all printable unicode characters. See
     // https://bugs.kde.org/show_bug.cgi?id=261223 for the detials.
     if (!ch.isPrint()) {
         return true;
     }
 
     for (int i = qstrlen(specials) - 1; i >= 0; i--) {
         if (ch == QLatin1Char(specials[i])) {
             return true;
         }
     }
 
     return false;
 }
 
 static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials)
 {
     QString out;
     skipLWS(str, pos);
     bool valid = true;
 
     while (pos < str.length() && (str[pos] != term)) {
         out += str[pos];
         valid = (valid && !specialChar(str[pos], specials));
         ++pos;
     }
 
     if (pos < str.length()) { // Stopped due to finding term
         ++pos;
     }
 
     if (!valid) {
         return QString();
     }
 
     // Remove trailing linear whitespace...
     while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t'))) {
         out.chop(1);
     }
 
     if (out.contains(QLatin1Char(' '))) {
         out.clear();
     }
 
     return out;
 }
 
 // As above, but also handles quotes..
 // pos is set to -1 on parse error
 static QString extractMaybeQuotedUntil(const QString &str, int &pos)
 {
     const QChar term = QLatin1Char(';');
 
     skipLWS(str, pos);
 
     // Are we quoted?
     if (pos < str.length() && str[pos] == QLatin1Char('"')) {
         QString out;
 
         // Skip the quote...
         ++pos;
 
         // when quoted we also need an end-quote
         bool endquote = false;
 
         // Parse until trailing quote...
         while (pos < str.length()) {
             if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) {
                 // quoted-pair = "\" CHAR
                 out += str[pos + 1];
                 pos += 2; // Skip both...
             } else if (str[pos] == QLatin1Char('"')) {
                 ++pos;
                 endquote = true;
                 break;
             } else if (!str[pos].isPrint()) { // Don't allow CTL's RFC 2616 sec 2.2
                 break;
             } else {
                 out += str[pos];
                 ++pos;
             }
         }
 
         if (!endquote) {
             pos = -1;
             return QString();
         }
 
         // Skip until term..
         while (pos < str.length() && (str[pos] != term)) {
             if ((str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t'))) {
                 pos = -1;
                 return QString();
             }
             ++pos;
         }
 
         if (pos < str.length()) {  // Stopped due to finding term
             ++pos;
         }
 
         return out;
     } else {
         return extractUntil(str, term, pos, valueSpecials);
     }
 }
 
 static QMap<QString, QString> contentDispositionParserInternal(const QString &disposition)
 {
     kDebug(7113) << "disposition: " << disposition;
     int pos = 0;
     const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower();
 
     QMap<QString, QString> parameters;
     QMap<QString, QString> contparams;   // all parameters that contain continuations
     QMap<QString, QString> encparams;    // all parameters that have character encoding
 
     // the type is invalid, the complete header is junk
     if (strDisposition.isEmpty()) {
         return parameters;
     }
 
     parameters.insert(QLatin1String("type"), strDisposition);
 
     while (pos < disposition.length()) {
         QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower();
 
         if (key.isEmpty()) {
             // parse error in this key: do not parse more, but add up
             // everything we already got
             kDebug(7113) << "parse error in key, abort parsing";
             break;
         }
 
         QString val;
         if (key.endsWith(QLatin1Char('*'))) {
             val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials);
         } else {
             val = extractMaybeQuotedUntil(disposition, pos);
         }
 
         if (val.isEmpty()) {
             if (pos == -1) {
                 kDebug(7113) << "parse error in value, abort parsing";
                 break;
             }
             continue;
         }
 
         const int spos = key.indexOf(QLatin1Char('*'));
         if (spos == key.length() - 1) {
             key.chop(1);
             encparams.insert(key, val);
         } else if (spos >= 0) {
             contparams.insert(key, val);
         } else if (parameters.contains(key)) {
             kDebug(7113) << "duplicate key" << key << "found, ignoring everything more";
             parameters.remove(key);
             return parameters;
         } else {
             parameters.insert(key, val);
         }
     }
 
     QMap<QString, QString>::iterator i = contparams.begin();
     while (i != contparams.end()) {
         QString key = i.key();
         int spos = key.indexOf(QLatin1Char('*'));
         bool hasencoding = false;
 
         if (key.at(spos + 1) != QLatin1Char('0')) {
             ++i;
             continue;
         }
 
         // no leading zeros allowed, so delete the junk
         int klen = key.length();
         if (klen > spos + 2) {
             // nothing but continuations and encodings may insert * into parameter name
             if ((klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*')))) {
                 kDebug(7113) << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2);
                 i = contparams.erase(i);
                 continue;
             }
             hasencoding = true;
         }
 
         int seqnum = 1;
         QMap<QString, QString>::iterator partsi;
         // we do not need to care about encoding specifications: only the first
         // part is allowed to have one
         QString val = i.value();
 
         key.chop(hasencoding ? 2 : 1);
 
         while ((partsi = contparams.find(key + QString::number(seqnum))) != contparams.end()) {
             val += partsi.value();
             contparams.erase(partsi);
         }
 
         i = contparams.erase(i);
 
         key.chop(1);
         if (hasencoding) {
             encparams.insert(key, val);
         } else {
             if (parameters.contains(key)) {
                 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more";
                 parameters.remove(key);
                 return parameters;
             }
 
             parameters.insert(key, val);
         }
     }
 
     for (QMap<QString, QString>::iterator i = encparams.begin(); i != encparams.end(); ++i) {
         QString val = i.value();
 
         // RfC 2231 encoded character set in filename
         int spos = val.indexOf(QLatin1Char('\''));
         if (spos == -1) {
             continue;
         }
         int npos = val.indexOf(QLatin1Char('\''), spos + 1);
         if (npos == -1) {
             continue;
         }
 
         const QString charset = val.left(spos);
         const QString lang = val.mid(spos + 1, npos - spos - 1);
         const QByteArray encodedVal = val.mid(npos + 1).toLatin1();
 
         if ( ! isValidPercentEncoding(encodedVal) )
             continue;
 
         const QByteArray rawval = QByteArray::fromPercentEncoding(encodedVal);
 
         if (charset.isEmpty() || (charset == QLatin1String("us-ascii"))) {
             bool valid = true;
             for (int j = rawval.length() - 1; (j >= 0) && valid; j--) {
                 valid = (rawval.at(j) >= 32);
             }
 
             if (!valid)
                 continue;
             val = QString::fromLatin1(rawval.constData());
         } else {
             QTextCodec *codec = QTextCodec::codecForName(charset.toLatin1());
             if (!codec)
                 continue;
             val = codec->toUnicode(rawval);
         }
 
         parameters.insert(i.key(), val);
     }
 
     return parameters;
 }
 
 static QMap<QString, QString> contentDispositionParser(const QString &disposition)
 {
     QMap<QString, QString> parameters = contentDispositionParserInternal(disposition);
 
     const QLatin1String fn("filename");
     if (parameters.contains(fn)) {
         // Content-Disposition is not allowed to dictate directory
         // path, thus we extract the filename only.
         const QString val = QDir::toNativeSeparators(parameters[fn]);
         int slpos = val.lastIndexOf(QDir::separator());
 
         if (slpos > -1) {
             parameters.insert(fn, val.mid(slpos + 1));
         }
     }
 
     return parameters;
 }
KDE's Doxygen guidelines are available online.
KIOSlave

KIOSlave

kdelibs API Reference

Search