Ignore:
Timestamp:
Jan 2, 2008, 10:39:50 PM (17 years ago)
Author:
Darin Adler
Message:

JavaScriptCore:

Reviewed by Geoff.

Test: fast/regex/early-acid3-86.html

The problem was with the cutoff point between backreferences and octal
escape sequences. We need to determine the cutoff point by counting the
total number of capturing brackets, which requires an extra pass through
the expression when compiling it.

  • pcre/pcre_compile.cpp:
      (CompileData::CompileData): Added numCapturingBrackets. Removed some unused fields.
      (compileBranch): Use numCapturingBrackets when calling checkEscape.
      (calculateCompiledPatternLength): Use numCapturingBrackets when calling checkEscape, and also store the bracket count at the end of the compile.
      (jsRegExpCompile): Call calculateCompiledPatternLength twice -- once to count the number of brackets and then a second time to calculate the length.

LayoutTests:

Reviewed by Geoff.

  • fast/regex/early-acid3-86-expected.txt: Added.
  • fast/regex/early-acid3-86.html: Added.
  • fast/regex/resources/early-acid3-86.js: Added.
  • fast/regex/test1-expected.txt: Updated for a few cases where we now fail. But these "failures" represent us replacing PCRE behavior with semantics that are correct for JavaScript regular expressions.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/pcre/pcre_compile.cpp

    r28796 r29110  
    136136struct CompileData {
    137137    CompileData() {
    138         start_code = 0;
    139         start_pattern = 0;
    140138        top_backref = 0;
    141139        backrefMap = 0;
    142140        req_varyopt = 0;
    143141        needOuterBracket = false;
     142        numCapturingBrackets = 0;
    144143    }
    145     const unsigned char* start_code;   /* The start of the compiled code */
    146     const UChar* start_pattern; /* The start of the pattern */
    147144    int top_backref;            /* Maximum back reference */
    148145    unsigned backrefMap;       /* Bitmap of low back refs */
    149146    int req_varyopt;            /* "After variable item" flag for reqbyte */
    150147    bool needOuterBracket;
     148    int numCapturingBrackets;
    151149};
    152150
     
    730728                   
    731729                    if (c == '\\') {
    732                         c = checkEscape(&ptr, patternEnd, errorcodeptr, *brackets, true);
     730                        c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
    733731                        if (c < 0) {
    734732                            class_charcount += 2;     /* Greater than 1 is what matters */
     
    797795                        if (d == '\\') {
    798796                            const UChar* oldptr = ptr;
    799                             d = checkEscape(&ptr, patternEnd, errorcodeptr, *brackets, true);
     797                            d = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
    800798                           
    801799                            /* \X is literal X; any other special means the '-' was literal */
     
    15651563            case '\\':
    15661564                tempptr = ptr;
    1567                 c = checkEscape(&ptr, patternEnd, errorcodeptr, *brackets, false);
     1565                c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, false);
    15681566               
    15691567                /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
     
    19881986}
    19891987
    1990 static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
     1988static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
    19911989    CompileData& cd, ErrorCode& errorcode)
    19921990{
    19931991    /* Make a pass over the pattern to compute the
    19941992     amount of store required to hold the compiled code. This does not have to be
    1995      perfect as long as errors are overestimates. At the same time we can detect any
    1996      flag settings right at the start, and extract them. Make an attempt to correct
    1997      for any counted white space if an "extended" flag setting appears late in the
    1998      pattern. We can't be so clever for #-comments. */
     1993     perfect as long as errors are overestimates. */
    19991994   
    20001995    int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
     
    20182013
    20192014            case '\\':
    2020                 c = checkEscape(&ptr, patternEnd, &errorcode, bracount, false);
     2015                c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
    20212016                if (errorcode != 0)
    20222017                    return -1;
     
    21512146                   
    21522147                    if (*ptr == '\\') {
    2153                         c = checkEscape(&ptr, patternEnd, &errorcode, bracount, true);
     2148                        c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
    21542149                        if (errorcode != 0)
    21552150                            return -1;
     
    21862181                            if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
    21872182                                ptr++;
    2188                                 d = checkEscape(&ptr, patternEnd, &errorcode, bracount, true);
     2183                                d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
    21892184                                if (errorcode != 0)
    21902185                                    return -1;
     
    24692464   
    24702465    length += 2 + LINK_SIZE;    /* For final KET and END */
     2466
     2467    cd.numCapturingBrackets = bracount;
    24712468    return length;
    24722469}
     
    25132510   
    25142511    ErrorCode errorcode = ERR0;
    2515     int length = calculateCompiledPatternLengthAndFlags(pattern, patternLength, ignoreCase, cd, errorcode);
     2512    /* Call this once just to count the brackets. */
     2513    calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
     2514    /* Call it again to compute the length. */
     2515    int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
    25162516    if (errorcode)
    25172517        return returnError(errorcode, errorptr);
     
    25322532   
    25332533    const unsigned char* codeStart = (const unsigned char*)(re + 1);
    2534     cd.start_code = codeStart;
    2535     cd.start_pattern = (const UChar*)pattern;
    25362534   
    25372535    /* Set up a starting, non-extracting bracket, then compile the expression. On
Note: See TracChangeset for help on using the changeset viewer.