Ignore:
Timestamp:
Nov 29, 2010, 10:52:16 AM (15 years ago)
Author:
[email protected]
Message:

Bug 48100 - YARR allows what seems like a bogus character-class range

Reviewed by Sam Weinig.

JavaScriptCore:

Per ECMA-262, a character class containing a character range whose
endpoint is itself a character class (such as \d) is invalid, e.g.:

/[\d-x]/
/[x-\d]/
/[\d-\d]/

These should throw a syntax error.

  • yarr/RegexParser.h:

LayoutTests:

Add/update layout test results.

  • fast/js/regexp-overflow-expected.txt:
  • fast/js/regexp-ranges-and-escaped-hyphens-expected.txt:
  • fast/js/script-tests/regexp-overflow.js:
  • fast/js/script-tests/regexp-ranges-and-escaped-hyphens.js:
  • fast/regex/invalid-range-in-class-expected.txt: Added.
  • fast/regex/invalid-range-in-class.html: Added.
  • fast/regex/script-tests/invalid-range-in-class.js: Added.
  • fast/regex/test1-expected.txt:
  • fast/regex/test4-expected.txt:
  • fast/regex/testinput4:
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/yarr/RegexParser.h

    r72489 r72813  
    5757        ParenthesesTypeInvalid,
    5858        CharacterClassUnmatched,
     59        CharacterClassInvalidRange,
    5960        CharacterClassOutOfOrder,
    6061        EscapeUnterminated,
     
    7677            : m_delegate(delegate)
    7778            , m_err(err)
    78             , m_state(empty)
     79            , m_state(Empty)
    7980        {
    8081        }
     
    9192
    9293        /*
    93          * atomPatternCharacterUnescaped():
     94         * atomPatternCharacter():
    9495         *
    95          * This method is called directly from parseCharacterClass(), to report a new
    96          * pattern character token.  This method differs from atomPatternCharacter(),
    97          * which will be called from parseEscape(), since a hypen provided via this
    98          * method may be indicating a character range, but a hyphen parsed by
    99          * parseEscape() cannot be interpreted as doing so.
     96         * This method is called either from parseCharacterClass() (for an unescaped
     97         * character in a character class), or from parseEscape(). In the former case
     98         * the value true will be passed for the argument 'hyphenIsRange', and in this
     99         * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
     100         * is different to /[a\-z]/).
    100101         */
    101         void atomPatternCharacterUnescaped(UChar ch)
     102        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
    102103        {
    103104            switch (m_state) {
    104             case empty:
     105            case AfterCharacterClass:
     106                // Following a builtin character class we need to look out for a hyphen.
     107                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
     108                // If we see a hyphen following a character class then unlike usual
     109                // we'll report it to the delegate immediately, and put ourselves into
     110                // a poisoned state. Any following calls to add another character or
     111                // character class will result in an error. (A hyphen following a
     112                // character class is itself valid, but only at the end of the class).
     113                if (hyphenIsRange && ch == '-') {
     114                    m_delegate.atomCharacterClassAtom('-');
     115                    m_state = AfterCharacterClassHyphen;
     116                    return;
     117                }
     118                // Otherwise just fall through - there is no cached character, so treat this as Empty.
     119
     120            case Empty:
    105121                m_character = ch;
    106                 m_state = cachedCharacter;
    107                 break;
    108 
    109             case cachedCharacter:
    110                 if (ch == '-')
    111                     m_state = cachedCharacterHyphen;
     122                m_state = CachedCharacter;
     123                return;
     124
     125            case CachedCharacter:
     126                if (hyphenIsRange && ch == '-')
     127                    m_state = CachedCharacterHyphen;
    112128                else {
    113129                    m_delegate.atomCharacterClassAtom(m_character);
    114130                    m_character = ch;
    115131                }
    116                 break;
    117 
    118             case cachedCharacterHyphen:
    119                 if (ch >= m_character)
    120                     m_delegate.atomCharacterClassRange(m_character, ch);
    121                 else
     132                return;
     133
     134            case CachedCharacterHyphen:
     135                if (ch < m_character) {
    122136                    m_err = CharacterClassOutOfOrder;
    123                 m_state = empty;
    124             }
    125         }
    126 
    127         /*
    128          * atomPatternCharacter():
    129          *
    130          * Adds a pattern character, called by parseEscape(), as such will not
    131          * interpret a hyphen as indicating a character range.
    132          */
    133         void atomPatternCharacter(UChar ch)
    134         {
    135             // Flush if a character is already pending to prevent the
    136             // hyphen from begin interpreted as indicating a range.
    137             if((ch == '-') && (m_state == cachedCharacter))
    138                 flush();
    139 
    140             atomPatternCharacterUnescaped(ch);
     137                    return;
     138                }
     139                m_delegate.atomCharacterClassRange(m_character, ch);
     140                m_state = Empty;
     141                return;
     142
     143            case AfterCharacterClassHyphen:
     144                // Error! We have something like /[\d-x]/.
     145                m_err = CharacterClassInvalidRange;
     146                return;
     147            }
    141148        }
    142149
     
    148155        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
    149156        {
    150             flush();
    151             m_delegate.atomCharacterClassBuiltIn(classID, invert);
     157            switch (m_state) {
     158            case CachedCharacter:
     159                // Flush the currently cached character, then fall through.
     160                m_delegate.atomCharacterClassAtom(m_character);
     161
     162            case Empty:
     163            case AfterCharacterClass:
     164                m_state = AfterCharacterClass;
     165                m_delegate.atomCharacterClassBuiltIn(classID, invert);
     166                return;
     167
     168            case CachedCharacterHyphen:
     169            case AfterCharacterClassHyphen:
     170                // Error! If we hit either of these cases, we have an
     171                // invalid range that looks something like /[x-\d]/
     172                // or /[\d-\d]/.
     173                m_err = CharacterClassInvalidRange;
     174                return;
     175            }
    152176        }
    153177
     
    159183        void end()
    160184        {
    161             flush();
     185            if (m_state == CachedCharacter)
     186                m_delegate.atomCharacterClassAtom(m_character);
     187            else if (m_state == CachedCharacterHyphen) {
     188                m_delegate.atomCharacterClassAtom(m_character);
     189                m_delegate.atomCharacterClassAtom('-');
     190            }
    162191            m_delegate.atomCharacterClassEnd();
    163192        }
     
    169198
    170199    private:
    171         void flush()
    172         {
    173             if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
    174                 m_delegate.atomCharacterClassAtom(m_character);
    175             if (m_state == cachedCharacterHyphen)
    176                 m_delegate.atomCharacterClassAtom('-');
    177             m_state = empty;
    178         }
    179    
    180200        Delegate& m_delegate;
    181201        ErrorCode& m_err;
    182202        enum CharacterClassConstructionState {
    183             empty,
    184             cachedCharacter,
    185             cachedCharacterHyphen,
     203            Empty,
     204            CachedCharacter,
     205            CachedCharacterHyphen,
     206            AfterCharacterClass,
     207            AfterCharacterClassHyphen,
    186208        } m_state;
    187209        UChar m_character;
     
    429451
    430452            default:
    431                 characterClassConstructor.atomPatternCharacterUnescaped(consume());
     453                characterClassConstructor.atomPatternCharacter(consume(), true);
    432454            }
    433455
     
    658680            "unrecognized character after (?",
    659681            "missing terminating ] for character class",
     682            "invalid range in character class",
    660683            "range out of order in character class",
    661684            "\\ at end of pattern"
Note: See TracChangeset for help on using the changeset viewer.