Context Navigation

← Previous Change
Next Change →

regexp.cpp

Timestamp:

Jun 4, 2003, 5:11:22 PM (22 years ago)

Author:

darin

Message:

Reviewed by Dave.

fixed 3224031 -- can't search at rakuten.co.jp because of extra characters inserted by regexp replace (8-bit char)

Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.

kjs/regexp.h: Redo field names, remove some unused stuff.
kjs/regexp.cpp: (convertToUTF8): Added. (compareStringOffsets): Added. (createSortedOffsetsArray): Added. (convertCharacterOffsetsToUTF8ByteOffsets): Added. (convertUTF8ByteOffsetsToCharacterOffsets): Added. (RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of using ascii() on it. (RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is). (RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the correct position and matched string.

JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.

File:

: 1 edited

trunk/JavaScriptCore/kjs/regexp.cpp (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/JavaScriptCore/kjs/regexp.cpp

-              r4206
+              r4482
 #include <string.h>
+using namespace KJS;
+RegExp::RegExp(const UString &p, int f)
+  : pattern(p), flgs(f)
+{
+using KJS::CString;
+using KJS::RegExp;
+using KJS::UString;
 #ifdef HAVE_PCREPOSIX
+  int pcreflags = 0;
+  const char *perrormsg;
+static CString convertToUTF8(const UString &s)
+{
+    // Allocate a buffer big enough to hold all the characters.
+    const int length = s.size();
+    const unsigned bufferSize = length * 3 + 1;
+    char fixedSizeBuffer[1024];
+    char *buffer;
+    if (bufferSize > sizeof(fixedSizeBuffer)) {
+        buffer = new char [bufferSize];
+    } else {
+        buffer = fixedSizeBuffer;
+    }
+    // Convert to runs of 8-bit characters.
+    char *p = buffer;
+    for (int i = 0; i != length; ++i) {
+        unsigned short c = s[i].unicode();
+        if (c < 0x80) {
+            *p++ = (char)c;
+        } else if (c < 0x800) {
+            *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
+            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+        } else {
+            *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
+            *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
+            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+        }
+    }
+    *p = 0;
+    // Return the result as a C string.
+    CString result(buffer);
+    if (buffer != fixedSizeBuffer) {
+        delete [] buffer;
+    }
+    return result;
+}
+struct StringOffset { // one offset value tagged with the slot it came from, so offsets can be sorted and later written back
+    int offset; // the offset value; this is the key compareStringOffsets orders by
+    int locationInOffsetsArray; // index of this value in the caller's original offsets array
+};
+// qsort() comparator for StringOffset records: orders by ascending offset.
+// Returns -1, 0 or +1; locationInOffsetsArray is not consulted.
+static int compareStringOffsets(const void *a, const void *b)
+{
+    const int lhs = static_cast<const StringOffset *>(a)->offset;
+    const int rhs = static_cast<const StringOffset *>(b)->offset;
+    if (lhs == rhs) {
+        return 0;
+    }
+    return lhs < rhs ? -1 : +1;
+}
+// Number of records in the scratch array that callers pass to
+// createSortedOffsetsArray.
+const int sortedOffsetsFixedBufferSize = 128;
+// Builds an array of (offset, original index) records from offsets and
+// sorts it by offset. The records live in sortedOffsetsFixedBuffer when
+// numOffsets fits in it; otherwise they are allocated with new[], and the
+// caller must delete[] the result whenever it differs from the buffer
+// that was passed in.
+static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets,
+    StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize])
+{
+    // Choose storage: the caller's scratch array if it is large enough.
+    StringOffset *const records = numOffsets > sortedOffsetsFixedBufferSize
+        ? new StringOffset [numOffsets]
+        : sortedOffsetsFixedBuffer;
+    // Tag every offset with the slot it came from.
+    for (int index = 0; index != numOffsets; ++index) {
+        StringOffset &record = records[index];
+        record.offset = offsets[index];
+        record.locationInOffsetsArray = index;
+    }
+    // Order the records by offset value.
+    qsort(records, numOffsets, sizeof(StringOffset), compareStringOffsets);
+    return records;
+}
+static void convertCharacterOffsetsToUTF8ByteOffsets(const char *s, int *offsets, int numOffsets)
+{
+    // Allocate buffer.
+    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
+    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
+    // Walk through sorted offsets and string, adjusting all the offests.
+    // Offsets that are off the ends of the string map to the edges of the string.
+    int characterOffset = 0;
+    const char *p = s;
+    for (int oi = 0; oi != numOffsets; ++oi) {
+        const int nextOffset = sortedOffsets[oi].offset;
+        while (*p && characterOffset < nextOffset) {
+            // Skip to the next character.
+            ++characterOffset;
+            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
+        }
+        offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s;
+    }
+    // Free buffer.
+    if (sortedOffsets != fixedBuffer) {
+        delete [] sortedOffsets;
+    }
+}
+static void convertUTF8ByteOffsetsToCharacterOffsets(const char *s, int *offsets, int numOffsets)
+{
+    // Allocate buffer.
+    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
+    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
+    // Walk through sorted offsets and string, adjusting all the offests.
+    // Offsets that are off the end of the string map to the edges of the string.
+    int characterOffset = 0;
+    const char *p = s;
+    for (int oi = 0; oi != numOffsets; ++oi) {
+        const int nextOffset = sortedOffsets[oi].offset;
+        while (*p && (p - s) < nextOffset) {
+            // Skip to the next character.
+            ++characterOffset;
+            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
+        }
+        offsets[sortedOffsets[oi].locationInOffsetsArray] = characterOffset;
+    }
+    // Free buffer.
+    if (sortedOffsets != fixedBuffer) {
+        delete [] sortedOffsets;
+    }
+}
+#endif // HAVE_PCREPOSIX
+RegExp::RegExp(const UString &p, int flags)
+  : _flags(flags), _numSubPatterns(0)
+{
+#ifdef HAVE_PCREPOSIX
+  int options = PCRE_UTF8;
+  // Note: the Global flag is already handled by RegExpProtoFunc::execute.
+  if (flags & IgnoreCase)
+    options |= PCRE_CASELESS;
+  if (flags & Multiline)
+    options |= PCRE_MULTILINE;
+  const char *errorMessage;
   int errorOffset;
+  if (flgs & IgnoreCase)
+    pcreflags |= PCRE_CASELESS;
+  if (flgs & Multiline)
+    pcreflags |= PCRE_MULTILINE;
+  pcregex = pcre_compile(p.ascii(), pcreflags,
+                         &perrormsg, &errorOffset, NULL);
+  _regex = pcre_compile(convertToUTF8(p).c_str(), options, &errorMessage, &errorOffset, NULL);
+  if (!_regex) {
 #ifndef NDEBUG
+  if (!pcregex)
+    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
+#endif
+    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
+#endif
+    return;
+  }
 #ifdef PCRE_INFO_CAPTURECOUNT
+  // Get number of subpatterns that will be returned
+  int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
+  if (rc != 0)
+#endif
+    nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
+  // Get number of subpatterns that will be returned.
+  pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
+#endif
 #else /* HAVE_PCREPOSIX */
-  nrSubPatterns = 0; // determined in match() with POSIX regex.
   int regflags = 0;
 #ifdef REG_EXTENDED
 …
   // Note: the Global flag is already handled by RegExpProtoFunc::execute
   regcomp(&preg, p.ascii(), regflags);
+  regcomp(&_regex, p.ascii(), regflags);
   /* TODO check for errors */
+#endif
+#endif
+}
 …
+{
 #ifdef HAVE_PCREPOSIX
+  if (pcregex)
+    pcre_free(pcregex);
+  pcre_free(_regex);
 #else
   /* TODO: is this really okay after an error ? */
   regfree(&preg);
+  regfree(&_regex);
 #endif
+}
 …
   if (i < 0)
     i = 0;
-  if (ovector)
-    *ovector = 0L;
   int dummyPos;
   if (!pos)
     pos = &dummyPos;
   *pos = -1;
+  if (ovector)
+    *ovector = 0;
   if (i > s.size() || s.isNull())
     return UString::null();
 #ifdef HAVE_PCREPOSIX
+  CString buffer(s.cstring());
+  int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
+  if (ovector) *ovector = new int[ovecsize];
+  if (!pcregex || pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), i,
+, ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
+  if (!_regex)
     return UString::null();
+  if (!ovector)
+    return UString::null(); // don't rely on the return value if you pass ovector==0
+  // Set up the offset vector for the result.
+  // First 2/3 used for result, the last third used by PCRE.
+  int *offsetVector;
+  int offsetVectorSize;
+  int fixedSizeOffsetVector[3];
+  if (!ovector) {
+    offsetVectorSize = 3;
+    offsetVector = fixedSizeOffsetVector;
+  } else {
+    offsetVectorSize = (_numSubPatterns + 1) * 3;
+    offsetVector = new int [offsetVectorSize];
+  }
+  const CString buffer(convertToUTF8(s));
+  convertCharacterOffsetsToUTF8ByteOffsets(buffer.c_str(), &i, 1);
+  const int numMatches = pcre_exec(_regex, NULL, buffer.c_str(), buffer.size(), i, 0, offsetVector, offsetVectorSize);
+  if (numMatches < 0) {
+#ifndef NDEBUG
+    if (numMatches != PCRE_ERROR_NOMATCH)
+      fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
+#endif
+    if (offsetVector != fixedSizeOffsetVector)
+      delete [] offsetVector;
+    return UString::null();
+  }
+  convertUTF8ByteOffsetsToCharacterOffsets(buffer.c_str(), offsetVector, (numMatches == 0 ? 1 : numMatches) * 2);
+  *pos = offsetVector[0];
+  if (ovector)
+    *ovector = offsetVector;
+  return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
 #else
   const uint maxMatch = 10;
   regmatch_t rmatch[maxMatch];
   char *str = strdup(s.ascii()); // TODO: why ???
   if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
+  if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
     free(str);
     return UString::null();
 …
   // map rmatch array to ovector used in PCRE case
   nrSubPatterns = 0;
+  _numSubPatterns = 0;
   for(uint j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
       nrSubPatterns++;
   int ovecsize = (nrSubPatterns+1)*3; // see above
+      _numSubPatterns++;
+  int ovecsize = (_numSubPatterns+1)*3; // see above
   *ovector = new int[ovecsize];
   for (uint j = 0; j < nrSubPatterns + 1; j++) {
+  for (uint j = 0; j < _numSubPatterns + 1; j++) {
     if (j>maxMatch)
       break;
 …
     (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
+  }
-#endif
   *pos = (*ovector)[0];
   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
+}
+#if 0 // unused
+bool RegExp::test(const UString &s, int)
+{
+#ifdef HAVE_PCREPOSIX
+  int ovector[300];
+  CString buffer(s.cstring());
+  if (s.isNull() ||
+      pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
+, ovector, 300) == PCRE_ERROR_NOMATCH)
+    return false;
+  else
+    return true;
+#else
+  char *str = strdup(s.ascii());
+  int r = regexec(&preg, str, 0, 0, 0);
+  free(str);
+  return r == 0;
+#endif
+}
+#endif
+#endif
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 4482 in webkit for trunk/JavaScriptCore/kjs/regexp.cpp

Legend:

trunk/JavaScriptCore/kjs/regexp.cpp

Download in other formats: