regexp.cpp

00001 // -*- c-basic-offset: 2 -*-
00002 /*
00003  *  This file is part of the KDE libraries
00004  *  Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
00005  *  Copyright (C) 2003,2004 Apple Computer, Inc.
00006  *  Copyright (C) 2006      Maksim Orlovich (maksim@kde.org)
00007  *
00008  *  This library is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU Lesser General Public
00010  *  License as published by the Free Software Foundation; either
00011  *  version 2 of the License, or (at your option) any later version.
00012  *
00013  *  This library is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  *  Lesser General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU Lesser General Public
00019  *  License along with this library; if not, write to the Free Software
00020  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00021  *
00022  */
00023 
00024 #include "regexp.h"
00025 
00026 #include "lexer.h"
00027 #include <assert.h>
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 
00032 using namespace KJS;
00033 
00034 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
00035 
00036 RegExp::RegExp(const UString &p, int f)
00037   : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
00038 {
00039   // Determine whether libpcre has unicode support if need be..
00040   if (utf8Support == Unknown) {
00041     int supported;
00042     pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
00043     utf8Support = supported ? Supported : Unsupported;
00044   }
00045 
00046   nrSubPatterns = 0; // determined in match() with POSIX regex.
00047 
00048   // JS regexps can contain Unicode escape sequences (\uxxxx) which
00049   // are rather uncommon elsewhere. As our regexp libs don't understand
00050   // them we do the unescaping ourselves internally.
00051   // Also make sure to expand out any nulls as pcre_compile 
00052   // expects null termination..
00053   UString intern;
00054   const char* const nil = "\\x00";
00055   if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
00056     bool escape = false;
00057     for (int i = 0; i < p.size(); ++i) {
00058       UChar c = p[i];
00059       if (escape) {
00060         escape = false;
00061         // we only care about \uxxxx
00062         if (c == 'u' && i + 4 < p.size()) {
00063           int c0 = p[i+1].unicode();
00064           int c1 = p[i+2].unicode();
00065           int c2 = p[i+3].unicode();
00066           int c3 = p[i+4].unicode();
00067           if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00068               Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00069             c = Lexer::convertUnicode(c0, c1, c2, c3);
00070             if (c.unicode() == 0) {
00071                 // Make sure to encode 0, to avoid terminating the string
00072                 intern += UString(nil);
00073             } else {
00074                 intern += UString(&c, 1);
00075             }
00076             i += 4;
00077             continue;
00078           }
00079         }
00080         intern += UString('\\');
00081         intern += UString(&c, 1);
00082       } else {
00083         if (c == '\\')
00084           escape = true;
00085         else if (c == '\0')
00086           intern += UString(nil);
00087         else
00088           intern += UString(&c, 1);
00089       }
00090     }
00091   } else {
00092     intern = p;
00093   }
00094 
00095 #ifdef HAVE_PCREPOSIX
00096   int pcreflags = 0;
00097   const char *perrormsg;
00098   int errorOffset;
00099 
00100   if (flgs & IgnoreCase)
00101     pcreflags |= PCRE_CASELESS;
00102 
00103   if (flgs & Multiline)
00104     pcreflags |= PCRE_MULTILINE;
00105 
00106   if (utf8Support == Supported)
00107     pcreflags |= PCRE_UTF8;
00108 
00109   // Fill our buffer with an encoded version, whether utf-8, or, 
00110   // if PCRE is incapable, truncated.
00111   prepareMatch(intern);
00112 
00113   pcregex = pcre_compile(buffer, pcreflags,
00114              &perrormsg, &errorOffset, NULL);
00115   doneMatch(); // Cleanup buffers
00116   if (!pcregex) {
00117 #ifndef NDEBUG
00118     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00119 #endif
00120     valid = false;
00121     return;
00122   }
00123 
00124 #ifdef PCRE_INFO_CAPTURECOUNT
00125   // Get number of subpatterns that will be returned
00126   int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00127   if (rc != 0)
00128 #endif
00129     nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
00130 
00131 #else /* HAVE_PCREPOSIX */
00132 
00133   int regflags = 0;
00134 #ifdef REG_EXTENDED
00135   regflags |= REG_EXTENDED;
00136 #endif
00137 #ifdef REG_ICASE
00138   if ( f & IgnoreCase )
00139     regflags |= REG_ICASE;
00140 #endif
00141 
00142   //NOTE: Multiline is not feasible with POSIX regex.
00143   //if ( f & Multiline )
00144   //    ;
00145   // Note: the Global flag is already handled by RegExpProtoFunc::execute
00146 
00147   int errorCode = regcomp(&preg, intern.ascii(), regflags);
00148   if (errorCode != 0) {
00149 #ifndef NDEBUG
00150     char errorMessage[80];
00151     regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
00152     fprintf(stderr, "KJS: regcomp failed with '%s'", errorMessage);
00153 #endif
00154     valid = false;
00155   }
00156 #endif
00157 }
00158 
00159 RegExp::~RegExp()
00160 {
00161   doneMatch(); // Be 100% sure buffers are freed
00162 #ifdef HAVE_PCREPOSIX
00163   if (pcregex)
00164     pcre_free(pcregex);
00165 #else
00166   /* TODO: is this really okay after an error ? */
00167   regfree(&preg);
00168 #endif
00169 }
00170 
00171 void RegExp::prepareUtf8(const UString& s)
00172 {
00173   // Allocate a buffer big enough to hold all the characters plus \0
00174   const int length = s.size();
00175   buffer = new char[length * 3 + 1];
00176 
00177   // Also create buffer for positions. We need one extra character in there,
00178   // even past the \0 since the non-empty handling may jump one past the end
00179   originalPos = new int[length * 3 + 2];
00180 
00181   // Convert to runs of 8-bit characters, and generate indeces
00182   // Note that we do NOT combine surrogate pairs here, as 
00183   // regexps operate on them as separate characters
00184   char *p      = buffer;
00185   int  *posOut = originalPos;
00186   const UChar *d = s.data();
00187   for (int i = 0; i != length; ++i) {
00188     unsigned short c = d[i].unicode();
00189 
00190     int sequenceLen;
00191     if (c < 0x80) {
00192       *p++ = (char)c;
00193       sequenceLen = 1;
00194     } else if (c < 0x800) {
00195       *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
00196       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
00197       sequenceLen = 2;
00198     } else {
00199       *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
00200       *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
00201       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
00202       sequenceLen = 3;
00203     }
00204 
00205     while (sequenceLen > 0) {
00206       *posOut = i;
00207       ++posOut;
00208       --sequenceLen;
00209     }
00210   }
00211 
00212   bufferSize = p - buffer;
00213 
00214   *p++ = '\0';
00215 
00216   // Record positions for \0, and the fictional character after that.
00217   *posOut     = length;
00218   *(posOut+1) = length+1;
00219 }
00220 
00221 void RegExp::prepareASCII (const UString& s)
00222 {
00223   originalPos = 0;
00224 
00225   // Best-effort attempt to get something done
00226   // when we don't have utf 8 available -- use 
00227   // truncated version, and pray for the best 
00228   CString truncated = s.cstring();
00229   buffer = new char[truncated.size() + 1];
00230   memcpy(buffer, truncated.c_str(), truncated.size());
00231   buffer[truncated.size()] = '\0'; // For _compile use
00232   bufferSize = truncated.size();
00233 }
00234 
00235 void RegExp::prepareMatch(const UString &s)
00236 {
00237   delete[] originalPos; // Just to be sure..
00238   delete[] buffer;
00239   if (utf8Support == Supported)
00240     prepareUtf8(s);
00241   else
00242     prepareASCII(s);
00243 
00244 #ifndef NDEBUG
00245   originalS = s;
00246 #endif
00247 }
00248 
00249 void RegExp::doneMatch() 
00250 {
00251   delete[] originalPos; originalPos = 0;
00252   delete[] buffer;      buffer      = 0;
00253 }
00254 
00255 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00256 {
00257 #ifndef NDEBUG
00258   assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
00259 #endif
00260   assert(valid);
00261 
00262   if (i < 0)
00263     i = 0;
00264   if (ovector)
00265     *ovector = 0L;
00266   int dummyPos;
00267   if (!pos)
00268     pos = &dummyPos;
00269   *pos = -1;
00270   if (i > s.size() || s.isNull())
00271     return UString::null;
00272 
00273 #ifdef HAVE_PCREPOSIX
00274   int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
00275   if (ovector) *ovector = new int[ovecsize];
00276   if (!pcregex)
00277     return UString::null;
00278 
00279   int startPos;
00280   int nextPos;
00281 
00282   if (utf8Support == Supported) {
00283     startPos = i;
00284     while (originalPos[startPos] < i)
00285       ++startPos;
00286 
00287     nextPos = startPos;
00288     while (originalPos[nextPos] < (i + 1))
00289       ++nextPos;
00290   } else {
00291     startPos = i;
00292     nextPos  = i + 1;
00293   }
00294 
00295   if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
00296                 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man pcretest
00297                 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00298   {
00299     // Failed to match.
00300     if ((flgs & Global) && m_notEmpty && ovector)
00301     {
00302       // We set m_notEmpty ourselves, to look for a non-empty match
00303       // (see man pcretest or pcretest.c for details).
00304       // So we don't stop here, we want to try again at i+1.
00305 #ifdef KJS_VERBOSE
00306       fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00307 #endif
00308       m_notEmpty = 0;
00309       if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, 0,
00310                     ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00311         return UString::null;
00312     }
00313     else // done
00314       return UString::null;
00315   }
00316 
00317   // Got a match, proceed with it.
00318   // But fix up the ovector if need be..
00319   if (ovector && originalPos) {
00320     for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
00321       if ((*ovector)[c] != -1)
00322         (*ovector)[c] = originalPos[(*ovector)[c]];
00323     }
00324   }
00325 
00326   if (!ovector)
00327     return UString::null; // don't rely on the return value if you pass ovector==0
00328 #else
00329   const uint maxMatch = 10;
00330   regmatch_t rmatch[maxMatch];
00331 
00332   char *str = strdup(s.ascii()); // TODO: why ???
00333   if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00334     free(str);
00335     return UString::null;
00336   }
00337   free(str);
00338 
00339   if (!ovector) {
00340     *pos = rmatch[0].rm_so + i;
00341     return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00342   }
00343 
00344   // map rmatch array to ovector used in PCRE case
00345   nrSubPatterns = 0;
00346   for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
00347     nrSubPatterns++;
00348     // if the nonEmpty flag is set, return a failed match if any of the
00349     // subMatches happens to be an empty string.
00350     if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo) 
00351       return UString::null;
00352   }
00353   // Allow an ovector slot to return the (failed) match result.
00354   if (nrSubPatterns == 0) nrSubPatterns = 1;
00355   
00356   int ovecsize = (nrSubPatterns)*3; // see above
00357   *ovector = new int[ovecsize];
00358   for (uint j = 0; j < nrSubPatterns; j++) {
00359       (*ovector)[2*j] = rmatch[j].rm_so + i;
00360       (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00361   }
00362 #endif
00363 
00364   *pos = (*ovector)[0];
00365   if ( *pos == (*ovector)[1] && (flgs & Global) )
00366   {
00367     // empty match, next try will be with m_notEmpty=true
00368     m_notEmpty=true;
00369   }
00370   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00371 }
00372 
00373 #if 0 // unused
00374 bool RegExp::test(const UString &s, int)
00375 {
00376 #ifdef HAVE_PCREPOSIX
00377   int ovector[300];
00378   CString buffer(s.cstring());
00379 
00380   if (s.isNull() ||
00381       pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
00382         0, ovector, 300) == PCRE_ERROR_NOMATCH)
00383     return false;
00384   else
00385     return true;
00386 
00387 #else
00388 
00389   char *str = strdup(s.ascii());
00390   int r = regexec(&preg, str, 0, 0, 0);
00391   free(str);
00392 
00393   return r == 0;
00394 #endif
00395 }
00396 #endif
KDE Home | KDE Accessibility Home | Description of Access Keys