Changeset 36337 in webkit for trunk/JavaScriptCore/wrec/WREC.cpp


Ignore:
Timestamp:
Sep 11, 2008, 2:13:01 PM (17 years ago)
Author:
[email protected]
Message:

2008-09-11 Cameron Zwarich <[email protected]>

Reviewed by Maciej Stachowiak.

Bug 20788: Split CharacterClassConstructor into its own file
<https://bugs.webkit.org/show_bug.cgi?id=20788>

Split CharacterClassConstructor into its own file and clean up some
style issues.

  • JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.vcproj:
  • JavaScriptCore.xcodeproj/project.pbxproj:
  • wrec/CharacterClassConstructor.cpp: Added. (JSC::): (JSC::getCharacterClassNewline): (JSC::getCharacterClassDigits): (JSC::getCharacterClassSpaces): (JSC::getCharacterClassWordchar): (JSC::getCharacterClassNondigits): (JSC::getCharacterClassNonspaces): (JSC::getCharacterClassNonwordchar): (JSC::CharacterClassConstructor::addSorted): (JSC::CharacterClassConstructor::addSortedRange): (JSC::CharacterClassConstructor::put): (JSC::CharacterClassConstructor::flush): (JSC::CharacterClassConstructor::append):
  • wrec/CharacterClassConstructor.h: Added. (JSC::CharacterClassConstructor::CharacterClassConstructor): (JSC::CharacterClassConstructor::isUpsideDown): (JSC::CharacterClassConstructor::charClass):
  • wrec/WREC.cpp: (JSC::WRECParser::parseCharacterClass):
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/wrec/WREC.cpp

    r36327 r36337  
    2929#if ENABLE(WREC)
    3030
     31#include "CharacterClassConstructor.h"
    3132#include "ExecState.h"
    3233#include "Machine.h"
     
    3637
    3738namespace JSC {
    38 
    39 // ==== CharacterClass ====
    40 
    41 struct CharacterClassRange {
    42     UChar begin;
    43     UChar end;
    44 };
    45 
    46 struct CharacterClass {
    47     const UChar* matches;
    48     unsigned numMatches;
    49 
    50     const CharacterClassRange* ranges;
    51     unsigned numRanges;
    52 
    53     const UChar* matchesUnicode;
    54     unsigned numMatchesUnicode;
    55 
    56     const CharacterClassRange* rangesUnicode;
    57     unsigned numRangesUnicode;
    58 };
    59 
    60 static const UChar asciiNewlines[2] = { '\n', '\r' };
    61 static const UChar unicodeNewlines[2] = { 0x2028, 0x2029 };
    62 static CharacterClass& getCharacterClassNewline() {
    63     static CharacterClass charClass = {
    64         asciiNewlines, 2,
    65         0, 0,
    66         unicodeNewlines, 2,
    67         0, 0,
    68     };
    69    
    70     return charClass;
    71 }
    72 
    73 static const CharacterClassRange asciiDigitsRange[1] = { { '0', '9' } };
    74 static CharacterClass& getCharacterClassDigits() {
    75     static CharacterClass charClass = {
    76         0, 0,
    77         asciiDigitsRange, 1,
    78         0, 0,
    79         0, 0,
    80     };
    81    
    82     return charClass;
    83 }
    84 
    85 static const UChar asciiSpaces[1] = { ' ' };
    86 static const CharacterClassRange asciiSpacesRange[1] = { { '\t', '\r' } };
    87 static const UChar unicodeSpaces[8] = { 0x00a0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000 };
    88 static const CharacterClassRange unicodeSpacesRange[1] = { { 0x2000, 0x200a } };
    89 static CharacterClass& getCharacterClassSpaces() {
    90     static CharacterClass charClass = {
    91         asciiSpaces, 1,
    92         asciiSpacesRange, 1,
    93         unicodeSpaces, 8,
    94         unicodeSpacesRange, 1,
    95     };
    96    
    97     return charClass;
    98 }
    99 
    100 static const UChar asciiWordchar[1] = { '_' };
    101 static const CharacterClassRange asciiWordcharRange[3] = { { '0', '9' }, { 'A', 'Z' }, { 'a', 'z' } };
    102 static CharacterClass& getCharacterClassWordchar() {
    103     static CharacterClass charClass = {
    104         asciiWordchar, 1,
    105         asciiWordcharRange, 3,
    106         0, 0,
    107         0, 0,
    108     };
    109    
    110     return charClass;
    111 }
    112 
    113 static const CharacterClassRange asciiNondigitsRange[2] = { { 0, '0' - 1 }, { '9' + 1, 0x7f } };
    114 static const CharacterClassRange unicodeNondigitsRange[1] = { { 0x0080, 0xffff } };
    115 static CharacterClass& getCharacterClassNondigits() {
    116     static CharacterClass charClass = {
    117         0, 0,
    118         asciiNondigitsRange, 2,
    119         0, 0,
    120         unicodeNondigitsRange, 1,
    121     };
    122    
    123     return charClass;
    124 }
    125 
    126 static const CharacterClassRange asciiNonspacesRange[3] = { { 0, '\t' - 1 }, { '\r' + 1, ' ' - 1 }, { ' ' + 1, 0x7f } };
    127 static const CharacterClassRange unicodeNonspacesRange[9] = {
    128     { 0x0080, 0x009f },
    129     { 0x00a1, 0x167f },
    130     { 0x1681, 0x180d },
    131     { 0x180f, 0x1fff },
    132     { 0x200b, 0x2027 },
    133     { 0x202a, 0x202e },
    134     { 0x2030, 0x205e },
    135     { 0x2060, 0x2fff },
    136     { 0x3001, 0xffff }
    137 };
    138 static CharacterClass& getCharacterClassNonspaces() {
    139     static CharacterClass charClass = {
    140         0, 0,
    141         asciiNonspacesRange, 3,
    142         0, 0,
    143         unicodeNonspacesRange, 9,
    144     };
    145    
    146     return charClass;
    147 }
    148 
    149 static const UChar asciiNonwordchar[1] = { '`' };
    150 static const CharacterClassRange asciiNonwordcharRange[4] = { { 0, '0' - 1 }, { '9' + 1, 'A' - 1 }, { 'Z' + 1, '_' - 1 }, { 'z' + 1, 0x7f } };
    151 static const CharacterClassRange unicodeNonwordcharRange[1] = { { 0x0080, 0xffff } };
    152 static CharacterClass& getCharacterClassNonwordchar() {
    153     static CharacterClass charClass = {
    154         asciiNonwordchar, 1,
    155         asciiNonwordcharRange, 4,
    156         0, 0,
    157         unicodeNonwordcharRange, 1,
    158     };
    159    
    160     return charClass;
    161 }
    162 
    163 struct CharacterClassConstructor {
    164     Vector<UChar> m_matches;
    165     Vector<CharacterClassRange> m_ranges;
    166     Vector<UChar> m_matchesUnicode;
    167     Vector<CharacterClassRange> m_rangesUnicode;
    168 
    169     int m_ch_buffer;
    170     bool m_pending_dash;
    171     bool m_ignoreCase;
    172     bool m_upsideDown;
    173    
    174     CharacterClassConstructor(bool ignoreCase)
    175         : m_ch_buffer(-1)
    176         , m_pending_dash(false)
    177         , m_ignoreCase(ignoreCase)
    178         , m_upsideDown(false)
    179     {
    180     }
    181    
    182     void flush();
    183     void put(UChar ch);
    184     void append(CharacterClass& other);
    185 private:
    186     void addSorted(Vector<UChar>& matches, UChar ch);
    187     void addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi);
    188 };
    189 
    190 void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
    191 {
    192     unsigned pos = 0;
    193     unsigned range = matches.size();
    194 
    195     // binary chop, find position to insert char.
    196     while (range) {
    197         unsigned index = range >> 1;
    198 
    199         int val = matches[pos+index] - ch;
    200         if (!val)
    201             return;
    202         else if (val > 0)
    203             range = index;
    204         else {
    205             pos += (index+1);
    206             range -= (index+1);
    207         }
    208     }
    209    
    210     if (pos == matches.size())
    211         matches.append(ch);
    212     else
    213         matches.insert(pos, ch);
    214 }
    215 
    216 void CharacterClassConstructor::addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi)
    217 {
    218     unsigned end = ranges.size();
    219    
    220     // Simple linear scan - I doubt there are that many ranges anyway...
    221     // feel free to fix this with something faster (eg binary chop).
    222     for (unsigned i = 0; i < end; ++i) {
    223         // does the new range fall before the current position in the array
    224         if (hi < ranges[i].begin) {
    225             // optional optimization: concatenate appending ranges? - may not be worthwhile.
    226             if (hi == (ranges[i].begin - 1)) {
    227                 ranges[i].begin = lo;
    228                 return;
    229             }
    230             CharacterClassRange r = {lo, hi};
    231             ranges.insert(i, r);
    232             return;
    233         }
    234         // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
    235         // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
    236         // end of the last range they concatenate, which is just as good.
    237         if (lo <= (ranges[i].end + 1)) {
    238             // found an intersect! we'll replace this entry in the array.
    239             ranges[i].begin = std::min(ranges[i].begin, lo);
    240             ranges[i].end = std::max(ranges[i].end, hi);
    241 
    242             // now check if the new range can subsume any subsequent ranges.
    243             unsigned next = i+1;
    244             // each iteration of the loop we will either remove something from the list, or break the loop.
    245             while (next < ranges.size()) {
    246                 if (ranges[next].begin <= (ranges[i].end + 1)) {
    247                     // the next entry now overlaps / concatenates this one.
    248                     ranges[i].end = std::max(ranges[i].end, ranges[next].end);
    249                     ranges.remove(next);
    250                 } else
    251                     break;
    252             }
    253            
    254             return;
    255         }
    256     }
    257 
    258     // Range comes after all existing ranges.
    259     CharacterClassRange r = {lo, hi};
    260     ranges.append(r);
    261 }
    262 
    263 void CharacterClassConstructor::put(UChar ch)
    264 {
    265     if (m_ch_buffer != -1) {
    266         if (m_pending_dash) {
    267             UChar lo = m_ch_buffer;
    268             UChar hi = ch;
    269             m_ch_buffer = -1;
    270             m_pending_dash = false;
    271            
    272             if (lo > hi)
    273                 m_upsideDown = true;
    274            
    275             if (lo <= 0x7f) {
    276                 char asciiLo = lo;
    277                 char asciiHi = std::min(hi, (UChar)0x7f);
    278                 addSortedRange(m_ranges, lo, asciiHi);
    279                
    280                 if (m_ignoreCase) {
    281                     if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
    282                         addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
    283                     if ((asciiLo <= 'z') && (asciiHi >= 'a'))
    284                         addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
    285                 }
    286             }
    287             if (hi >= 0x80) {
    288                 UChar unicodeCurr = std::max(lo, (UChar)0x80);
    289                 addSortedRange(m_rangesUnicode, unicodeCurr, hi);
    290                
    291                 if (m_ignoreCase) {
    292                     // we're going to scan along, updating the start of the range
    293                     while (unicodeCurr <= hi) {
    294                         // Spin forwards over any characters that don't have two cases.
    295                         for (; kjs_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
    296                             // if this was the last character in the range, we're done.
    297                             if (unicodeCurr == hi)
    298                                 return;
    299                         }
    300                         // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
    301                         UChar rangeStart = unicodeCurr;
    302                         UChar otherCurr = kjs_pcre_ucp_othercase(unicodeCurr);
    303                        
    304                         // If unicodeCurr is not yet hi, check the next char in the range.  If it also has another case,
    305                         // and if it's other case value is one greater then the othercase value for the current last
    306                         // character included in the range, we can include next into the range.
    307                         while ((unicodeCurr < hi) && (kjs_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
    308                             // increment unicodeCurr; it points to the end of the range.
    309                             // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
    310                             ++unicodeCurr;
    311                             ++otherCurr;
    312                         }
    313                        
    314                         // otherChar is the last in the range of other case chars, calculate offset to get back to the start.
    315                         addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
    316                        
    317                         // unicodeCurr has been added, move on to the next char.
    318                         ++unicodeCurr;
    319                     }
    320                 }
    321             }
    322         } else if (ch == '-') {
    323             m_pending_dash = true;
    324         } else {
    325             flush();
    326             m_ch_buffer = ch;
    327         }
    328     } else
    329         m_ch_buffer = ch;
    330 }
    331 
    332 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
    333 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
    334 // array (either ascii or unicode).
    335 // If the pattern is case insensitive we add entries for both cases.
    336 void CharacterClassConstructor::flush()
    337 {
    338     if (m_ch_buffer != -1) {
    339         if (m_ch_buffer <= 0x7f) {
    340             if (m_ignoreCase && isASCIILower(m_ch_buffer))
    341                 addSorted(m_matches, toASCIIUpper(m_ch_buffer));
    342             addSorted(m_matches, m_ch_buffer);
    343             if (m_ignoreCase && isASCIIUpper(m_ch_buffer))
    344                 addSorted(m_matches, toASCIILower(m_ch_buffer));
    345         } else {
    346             addSorted(m_matchesUnicode, m_ch_buffer);
    347             if (m_ignoreCase) {
    348                 int other = kjs_pcre_ucp_othercase(m_ch_buffer);
    349                 if (other != -1)
    350                     addSorted(m_matchesUnicode, other);
    351             }
    352         }
    353         m_ch_buffer = -1;
    354     }
    355    
    356     if (m_pending_dash) {
    357         addSorted(m_matches, '-');
    358     }
    359 }
    360 
    361 void CharacterClassConstructor::append(CharacterClass& other)
    362 {
    363     // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
    364     // Need to check the spec, really, but think this matches PCRE behaviour.
    365     flush();
    366    
    367     if (other.numMatches) {
    368         for (size_t i = 0; i < other.numMatches; ++i)
    369             addSorted(m_matches, other.matches[i]);
    370     }
    371     if (other.numRanges) {
    372         for (size_t i = 0; i < other.numRanges; ++i)
    373             addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
    374     }
    375     if (other.numMatchesUnicode) {
    376         for (size_t i = 0; i < other.numMatchesUnicode; ++i)
    377             addSorted(m_matchesUnicode, other.matchesUnicode[i]);
    378     }
    379     if (other.numRangesUnicode) {
    380         for (size_t i = 0; i < other.numRangesUnicode; ++i)
    381             addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
    382     }
    383 }
    38439
    38540class GenerateAtomFunctor {
     
    14381093
    14391094    // lazily catch reversed ranges ([z-a])in character classes
    1440     if (charClassConstructor.m_upsideDown) {
     1095    if (charClassConstructor.isUpsideDown()) {
    14411096        m_err = Error_malformedCharacterClass;
    14421097        return false;
     
    14441099
    14451100    charClassConstructor.flush();
    1446 
    1447     CharacterClass charClass = {
    1448         charClassConstructor.m_matches.begin(), charClassConstructor.m_matches.size(),
    1449         charClassConstructor.m_ranges.begin(), charClassConstructor.m_ranges.size(),
    1450         charClassConstructor.m_matchesUnicode.begin(), charClassConstructor.m_matchesUnicode.size(),
    1451         charClassConstructor.m_rangesUnicode.begin(), charClassConstructor.m_rangesUnicode.size(),
    1452     };
     1101    CharacterClass charClass = charClassConstructor.charClass();
    14531102    return parseCharacterClassQuantifier(failures, charClass, invert);
    14541103}
Note: See TracChangeset for help on using the changeset viewer.