00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "regexp.h"
00025
00026 #include "lexer.h"
00027 #include <assert.h>
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031
00032 using namespace KJS;
00033
00034 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
00035
00036 RegExp::RegExp(const UString &p, int f)
00037 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
00038 {
00039
00040 if (utf8Support == Unknown) {
00041 int supported;
00042 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
00043 utf8Support = supported ? Supported : Unsupported;
00044 }
00045
00046 nrSubPatterns = 0;
00047
00048
00049
00050
00051
00052
00053 UString intern;
00054 const char* const nil = "\\x00";
00055 if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
00056 bool escape = false;
00057 for (int i = 0; i < p.size(); ++i) {
00058 UChar c = p[i];
00059 if (escape) {
00060 escape = false;
00061
00062 if (c == 'u' && i + 4 < p.size()) {
00063 int c0 = p[i+1].unicode();
00064 int c1 = p[i+2].unicode();
00065 int c2 = p[i+3].unicode();
00066 int c3 = p[i+4].unicode();
00067 if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00068 Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00069 c = Lexer::convertUnicode(c0, c1, c2, c3);
00070 if (c.unicode() == 0) {
00071
00072 intern += UString(nil);
00073 } else {
00074 intern += UString(&c, 1);
00075 }
00076 i += 4;
00077 continue;
00078 }
00079 }
00080 intern += UString('\\');
00081 intern += UString(&c, 1);
00082 } else {
00083 if (c == '\\')
00084 escape = true;
00085 else if (c == '\0')
00086 intern += UString(nil);
00087 else
00088 intern += UString(&c, 1);
00089 }
00090 }
00091 } else {
00092 intern = p;
00093 }
00094
00095 #ifdef HAVE_PCREPOSIX
00096 int pcreflags = 0;
00097 const char *perrormsg;
00098 int errorOffset;
00099
00100 if (flgs & IgnoreCase)
00101 pcreflags |= PCRE_CASELESS;
00102
00103 if (flgs & Multiline)
00104 pcreflags |= PCRE_MULTILINE;
00105
00106 if (utf8Support == Supported)
00107 pcreflags |= PCRE_UTF8;
00108
00109
00110
00111 prepareMatch(intern);
00112
00113 pcregex = pcre_compile(buffer, pcreflags,
00114 &perrormsg, &errorOffset, NULL);
00115 doneMatch();
00116 if (!pcregex) {
00117 #ifndef NDEBUG
00118 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00119 #endif
00120 valid = false;
00121 return;
00122 }
00123
00124 #ifdef PCRE_INFO_CAPTURECOUNT
00125
00126 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00127 if (rc != 0)
00128 #endif
00129 nrSubPatterns = 0;
00130
00131 #else
00132
00133 int regflags = 0;
00134 #ifdef REG_EXTENDED
00135 regflags |= REG_EXTENDED;
00136 #endif
00137 #ifdef REG_ICASE
00138 if ( f & IgnoreCase )
00139 regflags |= REG_ICASE;
00140 #endif
00141
00142
00143
00144
00145
00146
00147 int errorCode = regcomp(&preg, intern.ascii(), regflags);
00148 if (errorCode != 0) {
00149 #ifndef NDEBUG
00150 char errorMessage[80];
00151 regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
00152 fprintf(stderr, "KJS: regcomp failed with '%s'", errorMessage);
00153 #endif
00154 valid = false;
00155 }
00156 #endif
00157 }
00158
00159 RegExp::~RegExp()
00160 {
00161 doneMatch();
00162 #ifdef HAVE_PCREPOSIX
00163 if (pcregex)
00164 pcre_free(pcregex);
00165 #else
00166
00167 regfree(&preg);
00168 #endif
00169 }
00170
00171 void RegExp::prepareUtf8(const UString& s)
00172 {
00173
00174 const int length = s.size();
00175 buffer = new char[length * 3 + 1];
00176
00177
00178
00179 originalPos = new int[length * 3 + 2];
00180
00181
00182
00183
00184 char *p = buffer;
00185 int *posOut = originalPos;
00186 const UChar *d = s.data();
00187 for (int i = 0; i != length; ++i) {
00188 unsigned short c = d[i].unicode();
00189
00190 int sequenceLen;
00191 if (c < 0x80) {
00192 *p++ = (char)c;
00193 sequenceLen = 1;
00194 } else if (c < 0x800) {
00195 *p++ = (char)((c >> 6) | 0xC0);
00196 *p++ = (char)((c | 0x80) & 0xBF);
00197 sequenceLen = 2;
00198 } else {
00199 *p++ = (char)((c >> 12) | 0xE0);
00200 *p++ = (char)(((c >> 6) | 0x80) & 0xBF);
00201 *p++ = (char)((c | 0x80) & 0xBF);
00202 sequenceLen = 3;
00203 }
00204
00205 while (sequenceLen > 0) {
00206 *posOut = i;
00207 ++posOut;
00208 --sequenceLen;
00209 }
00210 }
00211
00212 bufferSize = p - buffer;
00213
00214 *p++ = '\0';
00215
00216
00217 *posOut = length;
00218 *(posOut+1) = length+1;
00219 }
00220
00221 void RegExp::prepareASCII (const UString& s)
00222 {
00223 originalPos = 0;
00224
00225
00226
00227
00228 CString truncated = s.cstring();
00229 buffer = new char[truncated.size() + 1];
00230 memcpy(buffer, truncated.c_str(), truncated.size());
00231 buffer[truncated.size()] = '\0';
00232 bufferSize = truncated.size();
00233 }
00234
00235 void RegExp::prepareMatch(const UString &s)
00236 {
00237 delete[] originalPos;
00238 delete[] buffer;
00239 if (utf8Support == Supported)
00240 prepareUtf8(s);
00241 else
00242 prepareASCII(s);
00243
00244 #ifndef NDEBUG
00245 originalS = s;
00246 #endif
00247 }
00248
00249 void RegExp::doneMatch()
00250 {
00251 delete[] originalPos; originalPos = 0;
00252 delete[] buffer; buffer = 0;
00253 }
00254
00255 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00256 {
00257 #ifndef NDEBUG
00258 assert(s.data() == originalS.data());
00259 #endif
00260 assert(valid);
00261
00262 if (i < 0)
00263 i = 0;
00264 if (ovector)
00265 *ovector = 0L;
00266 int dummyPos;
00267 if (!pos)
00268 pos = &dummyPos;
00269 *pos = -1;
00270 if (i > s.size() || s.isNull())
00271 return UString::null;
00272
00273 #ifdef HAVE_PCREPOSIX
00274 int ovecsize = (nrSubPatterns+1)*3;
00275 if (ovector) *ovector = new int[ovecsize];
00276 if (!pcregex)
00277 return UString::null;
00278
00279 int startPos;
00280 int nextPos;
00281
00282 if (utf8Support == Supported) {
00283 startPos = i;
00284 while (originalPos[startPos] < i)
00285 ++startPos;
00286
00287 nextPos = startPos;
00288 while (originalPos[nextPos] < (i + 1))
00289 ++nextPos;
00290 } else {
00291 startPos = i;
00292 nextPos = i + 1;
00293 }
00294
00295 if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
00296 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0,
00297 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00298 {
00299
00300 if ((flgs & Global) && m_notEmpty && ovector)
00301 {
00302
00303
00304
00305 #ifdef KJS_VERBOSE
00306 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00307 #endif
00308 m_notEmpty = 0;
00309 if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, 0,
00310 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00311 return UString::null;
00312 }
00313 else
00314 return UString::null;
00315 }
00316
00317
00318
00319 if (ovector && originalPos) {
00320 for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
00321 if ((*ovector)[c] != -1)
00322 (*ovector)[c] = originalPos[(*ovector)[c]];
00323 }
00324 }
00325
00326 if (!ovector)
00327 return UString::null;
00328 #else
00329 const uint maxMatch = 10;
00330 regmatch_t rmatch[maxMatch];
00331
00332 char *str = strdup(s.ascii());
00333 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00334 free(str);
00335 return UString::null;
00336 }
00337 free(str);
00338
00339 if (!ovector) {
00340 *pos = rmatch[0].rm_so + i;
00341 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00342 }
00343
00344
00345 nrSubPatterns = 0;
00346 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
00347 nrSubPatterns++;
00348
00349
00350 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
00351 return UString::null;
00352 }
00353
00354 if (nrSubPatterns == 0) nrSubPatterns = 1;
00355
00356 int ovecsize = (nrSubPatterns)*3;
00357 *ovector = new int[ovecsize];
00358 for (uint j = 0; j < nrSubPatterns; j++) {
00359 (*ovector)[2*j] = rmatch[j].rm_so + i;
00360 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00361 }
00362 #endif
00363
00364 *pos = (*ovector)[0];
00365 if ( *pos == (*ovector)[1] && (flgs & Global) )
00366 {
00367
00368 m_notEmpty=true;
00369 }
00370 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00371 }
00372
00373 #if 0 // unused
00374 bool RegExp::test(const UString &s, int)
00375 {
00376 #ifdef HAVE_PCREPOSIX
00377 int ovector[300];
00378 CString buffer(s.cstring());
00379
00380 if (s.isNull() ||
00381 pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
00382 0, ovector, 300) == PCRE_ERROR_NOMATCH)
00383 return false;
00384 else
00385 return true;
00386
00387 #else
00388
00389 char *str = strdup(s.ascii());
00390 int r = regexec(&preg, str, 0, 0, 0);
00391 free(str);
00392
00393 return r == 0;
00394 #endif
00395 }
00396 #endif