Changeset 17862 in webkit for trunk/JavaScriptCore/kjs/regexp.cpp


Ignore:
Timestamp:
Nov 20, 2006, 12:24:22 PM (19 years ago)
Author:
ap
Message:

2006-11-20 W. Andy Carrel <[email protected]>

Reviewed by Maciej.

http://bugs.webkit.org/show_bug.cgi?id=11501
REGRESSION: \u no longer escapes metacharacters in RegExps
http://bugs.webkit.org/show_bug.cgi?id=11502
Serializing RegExps doesn't preserve Unicode escapes

JavaScriptCore:

  • kjs/lexer.cpp: (Lexer::Lexer): (Lexer::setCode): (Lexer::shift): (Lexer::scanRegExp): Push \u parsing back down into the RegExp object rather than in the parser. This backs out r17354 in favor of a new fix that better matches the behavior of other browsers.
  • kjs/lexer.h:
  • kjs/regexp.cpp: (KJS::RegExp::RegExp): (KJS::sanitizePattern): (KJS::isHexDigit): (KJS::convertHex): (KJS::convertUnicode):
  • kjs/regexp.h: Translate \u escaped unicode characters for the benefit of pcre.
  • kjs/ustring.cpp: (KJS::UString::append): Fix failure to increment length on the first UChar appended to a UString that was copy-on-write.
  • tests/mozilla/ecma_2/RegExp/properties-001.js: Adjust tests back to the uniform standards.

LayoutTests:

  • fast/js/kde/RegExp-expected.txt:
  • fast/js/regexp-unicode-handling-expected.txt: Adjust these test results to passing as a result of other included changes in this revision.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/kjs/regexp.cpp

    r13203 r17862  
    4545  const char *errorMessage;
    4646  int errorOffset;
    47   UString nullTerminated(p);
    48   char null(0);
    49   nullTerminated.append(null);
    50   _regex = pcre_compile(reinterpret_cast<const uint16_t *>(nullTerminated.data()), options, &errorMessage, &errorOffset, NULL);
    51   if (!_regex)
    52     return;
     47 
     48  UString pattern(p);
     49 
     50  pattern.append('\0');
     51  _regex = pcre_compile(reinterpret_cast<const uint16_t*>(pattern.data()),
     52                        options, &errorMessage, &errorOffset, NULL);
     53  if (!_regex) {
     54    // Try again, this time handle any \u we might find.
     55    UString uPattern = sanitizePattern(pattern);
     56    _regex = pcre_compile(reinterpret_cast<const uint16_t*>(uPattern.data()),
     57                          options, &errorMessage, &errorOffset, NULL);
     58    if (!_regex)
     59      return;
     60  }
    5361
    5462#ifdef PCRE_INFO_CAPTURECOUNT
     
    174182}
    175183
     184UString RegExp::sanitizePattern(const UString& p)
     185{
     186  UString newPattern;
     187 
     188  int startPos = 0;
     189  int pos = p.find("\\u", 0) + 2; // Skip the \u
     190 
     191  while (pos != 1) { // p.find failing is -1 + 2 = 1
     192    if (pos + 3 < p.size()) {
     193      if (isHexDigit(p[pos]) && isHexDigit(p[pos + 1]) &&
     194          isHexDigit(p[pos + 2]) && isHexDigit(p[pos + 3])) {
     195        newPattern.append(p.substr(startPos, pos - startPos - 2));
     196        UChar escapedUnicode(convertUnicode(p[pos], p[pos + 1],
     197                                            p[pos + 2], p[pos + 3]));
     198        // \u encoded characters should be treated as if they were escaped,
     199        // so add an escape for certain characters that need it.
     200        switch (escapedUnicode.unicode()) {
     201          case '|':
     202          case '+':
     203          case '*':
     204          case '(':
     205          case ')':
     206          case '[':
     207          case ']':
     208          case '{':
     209          case '}':
     210          case '?':
     211          case '\\':
     212            newPattern.append('\\');
     213        }
     214        newPattern.append(escapedUnicode);
     215
     216        startPos = pos + 4;
     217      }
     218    }
     219    pos = p.find("\\u", pos) + 2;
     220  }
     221  newPattern.append(p.substr(startPos, p.size() - startPos));
     222
     223  return newPattern;
     224}
     225
     226bool RegExp::isHexDigit(UChar uc)
     227{
     228  int c = uc.unicode();
     229  return (c >= '0' && c <= '9' ||
     230          c >= 'a' && c <= 'f' ||
     231          c >= 'A' && c <= 'F');
     232}
     233
     234unsigned char RegExp::convertHex(int c)
     235{
     236  if (c >= '0' && c <= '9')
     237    return static_cast<unsigned char>(c - '0');
     238  if (c >= 'a' && c <= 'f')
     239    return static_cast<unsigned char>(c - 'a' + 10);
     240  return static_cast<unsigned char>(c - 'A' + 10);
     241}
     242
     243unsigned char RegExp::convertHex(int c1, int c2)
     244{
     245  return ((convertHex(c1) << 4) + convertHex(c2));
     246}
     247
     248UChar RegExp::convertUnicode(UChar uc1, UChar uc2, UChar uc3, UChar uc4)
     249{
     250  int c1 = uc1.unicode();
     251  int c2 = uc2.unicode();
     252  int c3 = uc3.unicode();
     253  int c4 = uc4.unicode();
     254  return UChar((convertHex(c1) << 4) + convertHex(c2),
     255               (convertHex(c3) << 4) + convertHex(c4));
     256}
     257
    176258} // namespace KJS
Note: See TracChangeset for help on using the changeset viewer.