Changeset 112143 in webkit for trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp
- Timestamp:
- Mar 26, 2012, 1:13:39 PM (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp
r106748 r112143 29 29 30 30 #include "Yarr.h" 31 #include "YarrCanonicalizeUCS2.h" 31 32 #include "YarrParser.h" 32 33 #include <wtf/Vector.h> … … 67 68 void putChar(UChar ch) 68 69 { 70 // Handle ascii cases. 69 71 if (ch <= 0x7f) { 70 72 if (m_isCaseInsensitive && isASCIIAlpha(ch)) { … … 73 75 } else 74 76 addSorted(m_matches, ch); 77 return; 78 } 79 80 // Simple case, not a case-insensitive match. 81 if (!m_isCaseInsensitive) { 82 addSorted(m_matchesUnicode, ch); 83 return; 84 } 85 86 // Add multiple matches, if necessary. 87 UCS2CanonicalizationRange* info = rangeInfoFor(ch); 88 if (info->type == CanonicalizeUnique) 89 addSorted(m_matchesUnicode, ch); 90 else 91 putUnicodeIgnoreCase(ch, info); 92 } 93 94 void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) 95 { 96 ASSERT(m_isCaseInsensitive); 97 ASSERT(ch > 0x7f); 98 ASSERT(ch >= info->begin && ch <= info->end); 99 ASSERT(info->type != CanonicalizeUnique); 100 if (info->type == CanonicalizeSet) { 101 for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 102 addSorted(m_matchesUnicode, ch); 75 103 } else { 76 UChar upper, lower; 77 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { 78 addSorted(m_matchesUnicode, upper); 79 addSorted(m_matchesUnicode, lower); 80 } else 81 addSorted(m_matchesUnicode, ch); 82 } 83 } 84 85 // returns true if this character has another case, and 'ch' is the upper case form. 86 static inline bool isUnicodeUpper(UChar ch) 87 { 88 return ch != Unicode::toLower(ch); 89 } 90 91 // returns true if this character has another case, and 'ch' is the lower case form. 92 static inline bool isUnicodeLower(UChar ch) 93 { 94 return ch != Unicode::toUpper(ch); 104 addSorted(m_matchesUnicode, ch); 105 addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); 106 } 95 107 } 96 108 … … 109 121 } 110 122 } 111 if (hi >= 0x80) { 112 uint32_t unicodeCurr = std::max(lo, (UChar)0x80); 113 addSortedRange(m_rangesUnicode, unicodeCurr, hi); 114 115 if (m_isCaseInsensitive) { 116 while (unicodeCurr <= hi) { 117 // If the upper bound of the range (hi) is 0xffff, the increments to 118 // unicodeCurr in this loop may take it to 0x10000. This is fine 119 // (if so we won't re-enter the loop, since the loop condition above 120 // will definitely fail) - but this does mean we cannot use a UChar 121 // to represent unicodeCurr, we must use a 32-bit value instead. 122 ASSERT(unicodeCurr <= 0xffff); 123 124 if (isUnicodeUpper(unicodeCurr)) { 125 UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); 126 UChar lowerCaseRangeEnd = lowerCaseRangeBegin; 127 while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) 128 lowerCaseRangeEnd++; 129 addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); 130 } else if (isUnicodeLower(unicodeCurr)) { 131 UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); 132 UChar upperCaseRangeEnd = upperCaseRangeBegin; 133 while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) 134 upperCaseRangeEnd++; 135 addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); 136 } else 137 ++unicodeCurr; 138 } 139 } 140 } 123 if (hi <= 0x7f) 124 return; 125 126 lo = std::max(lo, (UChar)0x80); 127 addSortedRange(m_rangesUnicode, lo, hi); 128 129 if (!m_isCaseInsensitive) 130 return; 131 132 UCS2CanonicalizationRange* info = rangeInfoFor(lo); 133 while (true) { 134 // Handle the range [lo .. end] 135 UChar end = std::min(info->end, hi); 136 137 switch (info->type) { 138 case CanonicalizeUnique: 139 // Nothing to do - no canonical equivalents. 140 break; 141 case CanonicalizeSet: { 142 UChar ch; 143 for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 144 addSorted(m_matchesUnicode, ch); 145 break; 146 } 147 case CanonicalizeRangeLo: 148 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); 149 break; 150 case CanonicalizeRangeHi: 151 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); 152 break; 153 case CanonicalizeAlternatingAligned: 154 // Use addSortedRange since there is likely an abutting range to combine with. 155 if (lo & 1) 156 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 157 if (!(end & 1)) 158 addSortedRange(m_rangesUnicode, end + 1, end + 1); 159 break; 160 case CanonicalizeAlternatingUnaligned: 161 // Use addSortedRange since there is likely an abutting range to combine with. 162 if (!(lo & 1)) 163 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 164 if (end & 1) 165 addSortedRange(m_rangesUnicode, end + 1, end + 1); 166 break; 167 } 168 169 if (hi == end) 170 return; 171 172 ++info; 173 lo = info->begin; 174 }; 175 141 176 } 142 177 … … 281 316 // We handle case-insensitive checking of unicode characters which do have both 282 317 // cases by handling them as if they were defined using a CharacterClass. 283 if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { 284 atomCharacterClassBegin(); 285 atomCharacterClassAtom(ch); 286 atomCharacterClassEnd(); 287 } else 318 if (!m_pattern.m_ignoreCase || isASCII(ch)) { 288 319 m_alternative->m_terms.append(PatternTerm(ch)); 320 return; 321 } 322 323 UCS2CanonicalizationRange* info = rangeInfoFor(ch); 324 if (info->type == CanonicalizeUnique) { 325 m_alternative->m_terms.append(PatternTerm(ch)); 326 return; 327 } 328 329 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); 330 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); 331 m_pattern.m_userCharacterClasses.append(newCharacterClass); 332 m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); 289 333 } 290 334
Note:
See TracChangeset
for help on using the changeset viewer.