Context Navigation

← Previous Change
Next Change →

regexp.cpp

Timestamp:

Nov 20, 2006, 12:24:22 PM (19 years ago)

Author:

Message:

2006-11-20 W. Andy Carrel <[email protected]>

Reviewed by Maciej.

http://bugs.webkit.org/show_bug.cgi?id=11501
REGRESSION: \u no longer escapes metacharacters in RegExps
http://bugs.webkit.org/show_bug.cgi?id=11502
Serializing RegExps doesn't preserve Unicode escapes

JavaScriptCore:

kjs/lexer.cpp: (Lexer::Lexer): (Lexer::setCode): (Lexer::shift): (Lexer::scanRegExp): Push \u parsing back down into the RegExp object rather than in the parser. This backs out r17354 in favor of a new fix that better matches the behavior of other browsers.

kjs/lexer.h:
kjs/regexp.cpp: (KJS::RegExp::RegExp): (KJS::sanitizePattern): (KJS::isHexDigit): (KJS::convertHex): (KJS::convertUnicode):
kjs/regexp.h: Translate \u escaped unicode characters for the benefit of pcre.

kjs/ustring.cpp: (KJS::UString::append): Fix failure to increment length on the first UChar appended to a UString that was copy-on-write.

tests/mozilla/ecma_2/RegExp/properties-001.js: Adjust tests back to the uniform standards.

LayoutTests:

fast/js/kde/RegExp-expected.txt:
fast/js/regexp-unicode-handling-expected.txt: Adjust these test results to passing as a result of other included changes in this revision.

File:

: 1 edited

trunk/JavaScriptCore/kjs/regexp.cpp (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/JavaScriptCore/kjs/regexp.cpp

-              r13203
+              r17862
   const char *errorMessage;
   int errorOffset;
+  UString nullTerminated(p);
+  char null(0);
+  nullTerminated.append(null);
+  _regex = pcre_compile(reinterpret_cast<const uint16_t *>(nullTerminated.data()), options, &errorMessage, &errorOffset, NULL);
+  if (!_regex)
+    return;
+  UString pattern(p);
+  pattern.append('\0');
+  _regex = pcre_compile(reinterpret_cast<const uint16_t*>(pattern.data()),
+                        options, &errorMessage, &errorOffset, NULL);
+  if (!_regex) {
+    // Try again, this time handle any \u we might find.
+    UString uPattern = sanitizePattern(pattern);
+    _regex = pcre_compile(reinterpret_cast<const uint16_t*>(uPattern.data()),
+                          options, &errorMessage, &errorOffset, NULL);
+    if (!_regex)
+      return;
+  }
 #ifdef PCRE_INFO_CAPTURECOUNT
 …
+}
+UString RegExp::sanitizePattern(const UString& p)
+{
+  UString newPattern;
+  int startPos = 0;
+  int pos = p.find("\\u", 0) + 2; // Skip the \u
+  while (pos != 1) { // p.find failing is -1 + 2 = 1
+    if (pos + 3 < p.size()) {
+      if (isHexDigit(p[pos]) && isHexDigit(p[pos + 1]) &&
+          isHexDigit(p[pos + 2]) && isHexDigit(p[pos + 3])) {
+        newPattern.append(p.substr(startPos, pos - startPos - 2));
+        UChar escapedUnicode(convertUnicode(p[pos], p[pos + 1],
+                                            p[pos + 2], p[pos + 3]));
+        // \u encoded characters should be treated as if they were escaped,
+        // so add an escape for certain characters that need it.
+        switch (escapedUnicode.unicode()) {
+          case '|':
+          case '+':
+          case '*':
+          case '(':
+          case ')':
+          case '[':
+          case ']':
+          case '{':
+          case '}':
+          case '?':
+          case '\\':
+            newPattern.append('\\');
+        }
+        newPattern.append(escapedUnicode);
+        startPos = pos + 4;
+      }
+    }
+    pos = p.find("\\u", pos) + 2;
+  }
+  newPattern.append(p.substr(startPos, p.size() - startPos));
+  return newPattern;
+}
+// Returns true when |uc| is one of the ASCII characters 0-9, a-f or A-F.
+bool RegExp::isHexDigit(UChar uc)
+{
+  const int code = uc.unicode();
+  if ('0' <= code && code <= '9')
+    return true;
+  if ('a' <= code && code <= 'f')
+    return true;
+  return 'A' <= code && code <= 'F';
+}
+// Converts one hex digit character code to its numeric value. Anything that
+// is not 0-9 or a-f is handled as if it were an upper-case hex digit, so
+// callers are expected to validate the input first.
+unsigned char RegExp::convertHex(int c)
+{
+  int value;
+  if ('0' <= c && c <= '9')
+    value = c - '0';
+  else if ('a' <= c && c <= 'f')
+    value = c - 'a' + 10;
+  else
+    value = c - 'A' + 10;
+  return static_cast<unsigned char>(value);
+}
+// Combines two hex digit character codes into one byte: |c1| supplies the
+// high nibble and |c2| the low nibble.
+unsigned char RegExp::convertHex(int c1, int c2)
+{
+  const int highNibble = convertHex(c1) << 4;
+  const int lowNibble = convertHex(c2);
+  return static_cast<unsigned char>(highNibble + lowNibble);
+}
+// Builds a UChar from four hex digit characters: |uc1| and |uc2| form the
+// first byte passed to the UChar constructor, |uc3| and |uc4| the second.
+UChar RegExp::convertUnicode(UChar uc1, UChar uc2, UChar uc3, UChar uc4)
+{
+  const unsigned char firstByte = convertHex(uc1.unicode(), uc2.unicode());
+  const unsigned char secondByte = convertHex(uc3.unicode(), uc4.unicode());
+  return UChar(firstByte, secondByte);
+}
 } // namespace KJS

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 17862 in webkit for trunk/JavaScriptCore/kjs/regexp.cpp

Legend:

trunk/JavaScriptCore/kjs/regexp.cpp

Download in other formats: