Changeset 36337 in webkit for trunk/JavaScriptCore/wrec/WREC.cpp
- Timestamp:
- Sep 11, 2008, 2:13:01 PM (17 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/wrec/WREC.cpp
r36327 r36337 29 29 #if ENABLE(WREC) 30 30 31 #include "CharacterClassConstructor.h" 31 32 #include "ExecState.h" 32 33 #include "Machine.h" … … 36 37 37 38 namespace JSC { 38 39 // ==== CharacterClass ====40 41 struct CharacterClassRange {42 UChar begin;43 UChar end;44 };45 46 struct CharacterClass {47 const UChar* matches;48 unsigned numMatches;49 50 const CharacterClassRange* ranges;51 unsigned numRanges;52 53 const UChar* matchesUnicode;54 unsigned numMatchesUnicode;55 56 const CharacterClassRange* rangesUnicode;57 unsigned numRangesUnicode;58 };59 60 static const UChar asciiNewlines[2] = { '\n', '\r' };61 static const UChar unicodeNewlines[2] = { 0x2028, 0x2029 };62 static CharacterClass& getCharacterClassNewline() {63 static CharacterClass charClass = {64 asciiNewlines, 2,65 0, 0,66 unicodeNewlines, 2,67 0, 0,68 };69 70 return charClass;71 }72 73 static const CharacterClassRange asciiDigitsRange[1] = { { '0', '9' } };74 static CharacterClass& getCharacterClassDigits() {75 static CharacterClass charClass = {76 0, 0,77 asciiDigitsRange, 1,78 0, 0,79 0, 0,80 };81 82 return charClass;83 }84 85 static const UChar asciiSpaces[1] = { ' ' };86 static const CharacterClassRange asciiSpacesRange[1] = { { '\t', '\r' } };87 static const UChar unicodeSpaces[8] = { 0x00a0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000 };88 static const CharacterClassRange unicodeSpacesRange[1] = { { 0x2000, 0x200a } };89 static CharacterClass& getCharacterClassSpaces() {90 static CharacterClass charClass = {91 asciiSpaces, 1,92 asciiSpacesRange, 1,93 unicodeSpaces, 8,94 unicodeSpacesRange, 1,95 };96 97 return charClass;98 }99 100 static const UChar asciiWordchar[1] = { '_' };101 static const CharacterClassRange asciiWordcharRange[3] = { { '0', '9' }, { 'A', 'Z' }, { 'a', 'z' } };102 static CharacterClass& getCharacterClassWordchar() {103 static CharacterClass charClass = {104 asciiWordchar, 1,105 asciiWordcharRange, 3,106 0, 0,107 0, 0,108 };109 110 return charClass;111 }112 113 static const CharacterClassRange asciiNondigitsRange[2] = { { 0, '0' - 1 }, { '9' + 1, 0x7f } };114 static const CharacterClassRange unicodeNondigitsRange[1] = { { 0x0080, 0xffff } };115 static CharacterClass& getCharacterClassNondigits() {116 static CharacterClass charClass = {117 0, 0,118 asciiNondigitsRange, 2,119 0, 0,120 unicodeNondigitsRange, 1,121 };122 123 return charClass;124 }125 126 static const CharacterClassRange asciiNonspacesRange[3] = { { 0, '\t' - 1 }, { '\r' + 1, ' ' - 1 }, { ' ' + 1, 0x7f } };127 static const CharacterClassRange unicodeNonspacesRange[9] = {128 { 0x0080, 0x009f },129 { 0x00a1, 0x167f },130 { 0x1681, 0x180d },131 { 0x180f, 0x1fff },132 { 0x200b, 0x2027 },133 { 0x202a, 0x202e },134 { 0x2030, 0x205e },135 { 0x2060, 0x2fff },136 { 0x3001, 0xffff }137 };138 static CharacterClass& getCharacterClassNonspaces() {139 static CharacterClass charClass = {140 0, 0,141 asciiNonspacesRange, 3,142 0, 0,143 unicodeNonspacesRange, 9,144 };145 146 return charClass;147 }148 149 static const UChar asciiNonwordchar[1] = { '`' };150 static const CharacterClassRange asciiNonwordcharRange[4] = { { 0, '0' - 1 }, { '9' + 1, 'A' - 1 }, { 'Z' + 1, '_' - 1 }, { 'z' + 1, 0x7f } };151 static const CharacterClassRange unicodeNonwordcharRange[1] = { { 0x0080, 0xffff } };152 static CharacterClass& getCharacterClassNonwordchar() {153 static CharacterClass charClass = {154 asciiNonwordchar, 1,155 asciiNonwordcharRange, 4,156 0, 0,157 unicodeNonwordcharRange, 1,158 };159 160 return charClass;161 }162 163 struct CharacterClassConstructor {164 Vector<UChar> m_matches;165 Vector<CharacterClassRange> m_ranges;166 Vector<UChar> m_matchesUnicode;167 Vector<CharacterClassRange> m_rangesUnicode;168 169 int m_ch_buffer;170 bool m_pending_dash;171 bool m_ignoreCase;172 bool m_upsideDown;173 174 CharacterClassConstructor(bool ignoreCase)175 : m_ch_buffer(-1)176 , m_pending_dash(false)177 , m_ignoreCase(ignoreCase)178 , m_upsideDown(false)179 {180 }181 182 void flush();183 void put(UChar ch);184 void append(CharacterClass& other);185 private:186 void addSorted(Vector<UChar>& matches, UChar ch);187 void addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi);188 };189 190 void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)191 {192 unsigned pos = 0;193 unsigned range = matches.size();194 195 // binary chop, find position to insert char.196 while (range) {197 unsigned index = range >> 1;198 199 int val = matches[pos+index] - ch;200 if (!val)201 return;202 else if (val > 0)203 range = index;204 else {205 pos += (index+1);206 range -= (index+1);207 }208 }209 210 if (pos == matches.size())211 matches.append(ch);212 else213 matches.insert(pos, ch);214 }215 216 void CharacterClassConstructor::addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi)217 {218 unsigned end = ranges.size();219 220 // Simple linear scan - I doubt there are that many ranges anyway...221 // feel free to fix this with something faster (eg binary chop).222 for (unsigned i = 0; i < end; ++i) {223 // does the new range fall before the current position in the array224 if (hi < ranges[i].begin) {225 // optional optimization: concatenate appending ranges? - may not be worthwhile.226 if (hi == (ranges[i].begin - 1)) {227 ranges[i].begin = lo;228 return;229 }230 CharacterClassRange r = {lo, hi};231 ranges.insert(i, r);232 return;233 }234 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining235 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the236 // end of the last range they concatenate, which is just as good.237 if (lo <= (ranges[i].end + 1)) {238 // found an intersect! we'll replace this entry in the array.239 ranges[i].begin = std::min(ranges[i].begin, lo);240 ranges[i].end = std::max(ranges[i].end, hi);241 242 // now check if the new range can subsume any subsequent ranges.243 unsigned next = i+1;244 // each iteration of the loop we will either remove something from the list, or break the loop.245 while (next < ranges.size()) {246 if (ranges[next].begin <= (ranges[i].end + 1)) {247 // the next entry now overlaps / concatenates this one.248 ranges[i].end = std::max(ranges[i].end, ranges[next].end);249 ranges.remove(next);250 } else251 break;252 }253 254 return;255 }256 }257 258 // Range comes after all existing ranges.259 CharacterClassRange r = {lo, hi};260 ranges.append(r);261 }262 263 void CharacterClassConstructor::put(UChar ch)264 {265 if (m_ch_buffer != -1) {266 if (m_pending_dash) {267 UChar lo = m_ch_buffer;268 UChar hi = ch;269 m_ch_buffer = -1;270 m_pending_dash = false;271 272 if (lo > hi)273 m_upsideDown = true;274 275 if (lo <= 0x7f) {276 char asciiLo = lo;277 char asciiHi = std::min(hi, (UChar)0x7f);278 addSortedRange(m_ranges, lo, asciiHi);279 280 if (m_ignoreCase) {281 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))282 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));283 if ((asciiLo <= 'z') && (asciiHi >= 'a'))284 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));285 }286 }287 if (hi >= 0x80) {288 UChar unicodeCurr = std::max(lo, (UChar)0x80);289 addSortedRange(m_rangesUnicode, unicodeCurr, hi);290 291 if (m_ignoreCase) {292 // we're going to scan along, updating the start of the range293 while (unicodeCurr <= hi) {294 // Spin forwards over any characters that don't have two cases.295 for (; kjs_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {296 // if this was the last character in the range, we're done.297 if (unicodeCurr == hi)298 return;299 }300 // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.301 UChar rangeStart = unicodeCurr;302 UChar otherCurr = kjs_pcre_ucp_othercase(unicodeCurr);303 304 // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case,305 // and if it's other case value is one greater then the othercase value for the current last306 // character included in the range, we can include next into the range.307 while ((unicodeCurr < hi) && (kjs_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {308 // increment unicodeCurr; it points to the end of the range.309 // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.310 ++unicodeCurr;311 ++otherCurr;312 }313 314 // otherChar is the last in the range of other case chars, calculate offset to get back to the start.315 addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);316 317 // unicodeCurr has been added, move on to the next char.318 ++unicodeCurr;319 }320 }321 }322 } else if (ch == '-') {323 m_pending_dash = true;324 } else {325 flush();326 m_ch_buffer = ch;327 }328 } else329 m_ch_buffer = ch;330 }331 332 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.333 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate334 // array (either ascii or unicode).335 // If the pattern is case insensitive we add entries for both cases.336 void CharacterClassConstructor::flush()337 {338 if (m_ch_buffer != -1) {339 if (m_ch_buffer <= 0x7f) {340 if (m_ignoreCase && isASCIILower(m_ch_buffer))341 addSorted(m_matches, toASCIIUpper(m_ch_buffer));342 addSorted(m_matches, m_ch_buffer);343 if (m_ignoreCase && isASCIIUpper(m_ch_buffer))344 addSorted(m_matches, toASCIILower(m_ch_buffer));345 } else {346 addSorted(m_matchesUnicode, m_ch_buffer);347 if (m_ignoreCase) {348 int other = kjs_pcre_ucp_othercase(m_ch_buffer);349 if (other != -1)350 addSorted(m_matchesUnicode, other);351 }352 }353 m_ch_buffer = -1;354 }355 356 if (m_pending_dash) {357 addSorted(m_matches, '-');358 }359 }360 361 void CharacterClassConstructor::append(CharacterClass& other)362 {363 // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).364 // Need to check the spec, really, but think this matches PCRE behaviour.365 flush();366 367 if (other.numMatches) {368 for (size_t i = 0; i < other.numMatches; ++i)369 addSorted(m_matches, other.matches[i]);370 }371 if (other.numRanges) {372 for (size_t i = 0; i < other.numRanges; ++i)373 addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);374 }375 if (other.numMatchesUnicode) {376 for (size_t i = 0; i < other.numMatchesUnicode; ++i)377 addSorted(m_matchesUnicode, other.matchesUnicode[i]);378 }379 if (other.numRangesUnicode) {380 for (size_t i = 0; i < other.numRangesUnicode; ++i)381 addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);382 }383 }384 39 385 40 class GenerateAtomFunctor { … … 1438 1093 1439 1094 // lazily catch reversed ranges ([z-a])in character classes 1440 if (charClassConstructor. m_upsideDown) {1095 if (charClassConstructor.isUpsideDown()) { 1441 1096 m_err = Error_malformedCharacterClass; 1442 1097 return false; … … 1444 1099 1445 1100 charClassConstructor.flush(); 1446 1447 CharacterClass charClass = { 1448 charClassConstructor.m_matches.begin(), charClassConstructor.m_matches.size(), 1449 charClassConstructor.m_ranges.begin(), charClassConstructor.m_ranges.size(), 1450 charClassConstructor.m_matchesUnicode.begin(), charClassConstructor.m_matchesUnicode.size(), 1451 charClassConstructor.m_rangesUnicode.begin(), charClassConstructor.m_rangesUnicode.size(), 1452 }; 1101 CharacterClass charClass = charClassConstructor.charClass(); 1453 1102 return parseCharacterClassQuantifier(failures, charClass, invert); 1454 1103 }
Note:
See TracChangeset
for help on using the changeset viewer.