Changeset 4482 in webkit for trunk/JavaScriptCore/kjs/regexp.cpp


Ignore:
Timestamp:
Jun 4, 2003, 5:11:22 PM (22 years ago)
Author:
darin
Message:

Reviewed by Dave.

  • fixed 3224031 -- can't search at rakuten.co.jp b/c of extra characters inserted by regexp replace (8-bit char)

Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.

  • kjs/regexp.h: Redo field names, remove some unused stuff.
  • kjs/regexp.cpp: (convertToUTF8): Added. (compareStringOffsets): Added. (createSortedOffsetsArray): Added. (convertCharacterOffsetsToUTF8ByteOffsets): Added. (convertUTF8ByteOffsetsToCharacterOffsets): Added. (RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of using ascii() on it. (RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is). (RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the correct position and matched string.
  • JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/kjs/regexp.cpp

    r4206 r4482  
    2626#include <string.h>
    2727
    28 using namespace KJS;
    29 
    30 RegExp::RegExp(const UString &p, int f)
    31   : pattern(p), flgs(f)
    32 {
     28using KJS::CString;
     29using KJS::RegExp;
     30using KJS::UString;
     31
    3332#ifdef HAVE_PCREPOSIX
    34   int pcreflags = 0;
    35   const char *perrormsg;
     33
     34static CString convertToUTF8(const UString &s)
     35{
     36    // Allocate a buffer big enough to hold all the characters.
     37    const int length = s.size();
     38    const unsigned bufferSize = length * 3 + 1;
     39    char fixedSizeBuffer[1024];
     40    char *buffer;
     41    if (bufferSize > sizeof(fixedSizeBuffer)) {
     42        buffer = new char [bufferSize];
     43    } else {
     44        buffer = fixedSizeBuffer;
     45    }
     46
     47    // Convert to runs of 8-bit characters.
     48    char *p = buffer;
     49    for (int i = 0; i != length; ++i) {
     50        unsigned short c = s[i].unicode();
     51        if (c < 0x80) {
     52            *p++ = (char)c;
     53        } else if (c < 0x800) {
     54            *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
     55            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
     56        } else {
     57            *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
     58            *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
     59            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
     60        }
     61    }
     62    *p = 0;
     63
     64    // Return the result as a C string.
     65    CString result(buffer);
     66    if (buffer != fixedSizeBuffer) {
     67        delete [] buffer;
     68    }
     69    return result;
     70}
     71
     72struct StringOffset {
     73    int offset;
     74    int locationInOffsetsArray;
     75};
     76
     77static int compareStringOffsets(const void *a, const void *b)
     78{
     79    const StringOffset *oa = static_cast<const StringOffset *>(a);
     80    const StringOffset *ob = static_cast<const StringOffset *>(b);
     81   
     82    if (oa->offset < ob->offset) {
     83        return -1;
     84    }
     85    if (oa->offset > ob->offset) {
     86        return +1;
     87    }
     88    return 0;
     89}
     90
     91const int sortedOffsetsFixedBufferSize = 128;
     92
     93static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets,
     94    StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize])
     95{
     96    // Allocate the sorted offsets.
     97    StringOffset *sortedOffsets;
     98    if (numOffsets <= sortedOffsetsFixedBufferSize) {
     99        sortedOffsets = sortedOffsetsFixedBuffer;
     100    } else {
     101        sortedOffsets = new StringOffset [numOffsets];
     102    }
     103
     104    // Copy offsets.
     105    for (int i = 0; i != numOffsets; ++i) {
     106        sortedOffsets[i].offset = offsets[i];
     107        sortedOffsets[i].locationInOffsetsArray = i;
     108    }
     109
     110    // Sort them.
     111    qsort(sortedOffsets, numOffsets, sizeof(StringOffset), compareStringOffsets);
     112
     113    return sortedOffsets;
     114}
     115
     116static void convertCharacterOffsetsToUTF8ByteOffsets(const char *s, int *offsets, int numOffsets)
     117{
     118    // Allocate buffer.
     119    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
     120    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
     121
     122    // Walk through sorted offsets and string, adjusting all the offests.
     123    // Offsets that are off the ends of the string map to the edges of the string.
     124    int characterOffset = 0;
     125    const char *p = s;
     126    for (int oi = 0; oi != numOffsets; ++oi) {
     127        const int nextOffset = sortedOffsets[oi].offset;
     128        while (*p && characterOffset < nextOffset) {
     129            // Skip to the next character.
     130            ++characterOffset;
     131            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
     132        }
     133        offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s;
     134    }
     135
     136    // Free buffer.
     137    if (sortedOffsets != fixedBuffer) {
     138        delete [] sortedOffsets;
     139    }
     140}
     141
     142static void convertUTF8ByteOffsetsToCharacterOffsets(const char *s, int *offsets, int numOffsets)
     143{
     144    // Allocate buffer.
     145    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
     146    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
     147
     148    // Walk through sorted offsets and string, adjusting all the offests.
     149    // Offsets that are off the end of the string map to the edges of the string.
     150    int characterOffset = 0;
     151    const char *p = s;
     152    for (int oi = 0; oi != numOffsets; ++oi) {
     153        const int nextOffset = sortedOffsets[oi].offset;
     154        while (*p && (p - s) < nextOffset) {
     155            // Skip to the next character.
     156            ++characterOffset;
     157            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
     158        }
     159        offsets[sortedOffsets[oi].locationInOffsetsArray] = characterOffset;
     160    }
     161
     162    // Free buffer.
     163    if (sortedOffsets != fixedBuffer) {
     164        delete [] sortedOffsets;
     165    }
     166}
     167
     168#endif // HAVE_PCREPOSIX
     169
     170RegExp::RegExp(const UString &p, int flags)
     171  : _flags(flags), _numSubPatterns(0)
     172{
     173#ifdef HAVE_PCREPOSIX
     174
     175  int options = PCRE_UTF8;
     176  // Note: the Global flag is already handled by RegExpProtoFunc::execute.
     177  if (flags & IgnoreCase)
     178    options |= PCRE_CASELESS;
     179  if (flags & Multiline)
     180    options |= PCRE_MULTILINE;
     181
     182  const char *errorMessage;
    36183  int errorOffset;
    37 
    38   if (flgs & IgnoreCase)
    39     pcreflags |= PCRE_CASELESS;
    40 
    41   if (flgs & Multiline)
    42     pcreflags |= PCRE_MULTILINE;
    43 
    44   pcregex = pcre_compile(p.ascii(), pcreflags,
    45                          &perrormsg, &errorOffset, NULL);
     184  _regex = pcre_compile(convertToUTF8(p).c_str(), options, &errorMessage, &errorOffset, NULL);
     185  if (!_regex) {
    46186#ifndef NDEBUG
    47   if (!pcregex)
    48     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
    49 #endif
     187    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
     188#endif
     189    return;
     190  }
    50191
    51192#ifdef PCRE_INFO_CAPTURECOUNT
    52   // Get number of subpatterns that will be returned
    53   int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
    54   if (rc != 0)
    55 #endif
    56     nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
     193  // Get number of subpatterns that will be returned.
     194  pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
     195#endif
    57196
    58197#else /* HAVE_PCREPOSIX */
    59198
    60   nrSubPatterns = 0; // determined in match() with POSIX regex.
    61199  int regflags = 0;
    62200#ifdef REG_EXTENDED
     
    73211  // Note: the Global flag is already handled by RegExpProtoFunc::execute
    74212
    75   regcomp(&preg, p.ascii(), regflags);
     213  regcomp(&_regex, p.ascii(), regflags);
    76214  /* TODO check for errors */
    77 #endif
    78 
     215
     216#endif
    79217}
    80218
     
    82220{
    83221#ifdef HAVE_PCREPOSIX
    84   if (pcregex)
    85     pcre_free(pcregex);
     222  pcre_free(_regex);
    86223#else
    87224  /* TODO: is this really okay after an error ? */
    88   regfree(&preg);
     225  regfree(&_regex);
    89226#endif
    90227}
     
    94231  if (i < 0)
    95232    i = 0;
    96   if (ovector)
    97     *ovector = 0L;
    98233  int dummyPos;
    99234  if (!pos)
    100235    pos = &dummyPos;
    101236  *pos = -1;
     237  if (ovector)
     238    *ovector = 0;
     239
    102240  if (i > s.size() || s.isNull())
    103241    return UString::null();
    104242
    105243#ifdef HAVE_PCREPOSIX
    106   CString buffer(s.cstring());
    107   int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
    108   if (ovector) *ovector = new int[ovecsize];
    109 
    110   if (!pcregex || pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), i,
    111                   0, ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
     244
     245  if (!_regex)
    112246    return UString::null();
    113247
    114   if (!ovector)
    115     return UString::null(); // don't rely on the return value if you pass ovector==0
     248  // Set up the offset vector for the result.
     249  // First 2/3 used for result, the last third used by PCRE.
     250  int *offsetVector;
     251  int offsetVectorSize;
     252  int fixedSizeOffsetVector[3];
     253  if (!ovector) {
     254    offsetVectorSize = 3;
     255    offsetVector = fixedSizeOffsetVector;
     256  } else {
     257    offsetVectorSize = (_numSubPatterns + 1) * 3;
     258    offsetVector = new int [offsetVectorSize];
     259  }
     260
     261  const CString buffer(convertToUTF8(s));
     262  convertCharacterOffsetsToUTF8ByteOffsets(buffer.c_str(), &i, 1);
     263  const int numMatches = pcre_exec(_regex, NULL, buffer.c_str(), buffer.size(), i, 0, offsetVector, offsetVectorSize);
     264
     265  if (numMatches < 0) {
     266#ifndef NDEBUG
     267    if (numMatches != PCRE_ERROR_NOMATCH)
     268      fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
     269#endif
     270    if (offsetVector != fixedSizeOffsetVector)
     271      delete [] offsetVector;
     272    return UString::null();
     273  }
     274
     275  convertUTF8ByteOffsetsToCharacterOffsets(buffer.c_str(), offsetVector, (numMatches == 0 ? 1 : numMatches) * 2);
     276
     277  *pos = offsetVector[0];
     278  if (ovector)
     279    *ovector = offsetVector;
     280  return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
     281
    116282#else
     283
    117284  const uint maxMatch = 10;
    118285  regmatch_t rmatch[maxMatch];
    119286
    120287  char *str = strdup(s.ascii()); // TODO: why ???
    121   if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
     288  if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
    122289    free(str);
    123290    return UString::null();
     
    131298
    132299  // map rmatch array to ovector used in PCRE case
    133   nrSubPatterns = 0;
     300  _numSubPatterns = 0;
    134301  for(uint j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
    135       nrSubPatterns++;
    136   int ovecsize = (nrSubPatterns+1)*3; // see above
     302      _numSubPatterns++;
     303  int ovecsize = (_numSubPatterns+1)*3; // see above
    137304  *ovector = new int[ovecsize];
    138   for (uint j = 0; j < nrSubPatterns + 1; j++) {
     305  for (uint j = 0; j < _numSubPatterns + 1; j++) {
    139306    if (j>maxMatch)
    140307      break;
     
    142309    (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
    143310  }
    144 #endif
    145311
    146312  *pos = (*ovector)[0];
    147313  return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
    148 }
    149 
    150 #if 0 // unused
    151 bool RegExp::test(const UString &s, int)
    152 {
    153 #ifdef HAVE_PCREPOSIX
    154   int ovector[300];
    155   CString buffer(s.cstring());
    156 
    157   if (s.isNull() ||
    158       pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
    159                 0, ovector, 300) == PCRE_ERROR_NOMATCH)
    160     return false;
    161   else
    162     return true;
    163 
    164 #else
    165 
    166   char *str = strdup(s.ascii());
    167   int r = regexec(&preg, str, 0, 0, 0);
    168   free(str);
    169 
    170   return r == 0;
    171 #endif
    172 }
    173 #endif
     314
     315#endif
     316}
Note: See TracChangeset for help on using the changeset viewer.