Changeset 72180 in webkit for trunk/JavaScriptCore/yarr/RegexCompiler.cpp
- Timestamp:
- Nov 17, 2010, 1:42:41 AM (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/yarr/RegexCompiler.cpp
r67869 r72180 1 1 /* 2 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged 3 4 * 4 5 * Redistribution and use in source and binary forms, with or without … … 236 237 }; 237 238 239 struct BeginCharHelper { 240 BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false) 241 : m_beginChars(beginChars) 242 , m_isCaseInsensitive(isCaseInsensitive) 243 {} 244 245 void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount) 246 { 247 if (quantityType == QuantifierFixedCount && quantityCount > 1) { 248 // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ 249 beginChar.value |= beginChar.value << 16; 250 beginChar.mask |= beginChar.mask << 16; 251 addCharacter(beginChar); 252 } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) 253 // In case of characters with fixed quantifier we should check the next character as well. 254 linkHotTerms(beginChar, hotTerms); 255 else 256 // In case of greedy matching the next character checking is unnecessary therefore we just store 257 // the first character. 258 addCharacter(beginChar); 259 } 260 261 // Merge two following BeginChars in the vector to reduce the number of character checks. 262 void merge(unsigned size) 263 { 264 for (unsigned i = 0; i < size; i++) { 265 BeginChar* curr = &m_beginChars->at(i); 266 BeginChar* next = &m_beginChars->at(i + 1); 267 268 // If the current and the next size of value is different we should skip the merge process 269 // because the 16bit and 32bit values are unmergable. 270 if (curr->value <= 0xFFFF && next->value > 0xFFFF) 271 continue; 272 273 unsigned diff = curr->value ^ next->value; 274 275 curr->mask |= diff; 276 curr->value |= curr->mask; 277 278 m_beginChars->remove(i + 1); 279 size--; 280 } 281 } 282 283 private: 284 void addCharacter(BeginChar beginChar) 285 { 286 unsigned pos = 0; 287 unsigned range = m_beginChars->size(); 288 289 // binary chop, find position to insert char. 290 while (range) { 291 unsigned index = range >> 1; 292 293 int val = m_beginChars->at(pos+index).value - beginChar.value; 294 if (!val) 295 return; 296 if (val < 0) 297 range = index; 298 else { 299 pos += (index+1); 300 range -= (index+1); 301 } 302 } 303 304 if (pos == m_beginChars->size()) 305 m_beginChars->append(beginChar); 306 else 307 m_beginChars->insert(pos, beginChar); 308 } 309 310 // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. 311 void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms) 312 { 313 for (unsigned i = 0; i < hotTerms->size(); i++) { 314 PatternTerm hotTerm = hotTerms->at(i).term; 315 ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); 316 317 UChar characterNext = hotTerm.patternCharacter; 318 319 // Append a character to an existing BeginChar object. 320 if (characterNext <= 0x7f) { 321 unsigned mask = 0; 322 323 if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { 324 mask = 32; 325 characterNext = toASCIILower(characterNext); 326 } 327 328 addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); 329 } else { 330 UChar upper, lower; 331 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { 332 addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); 333 addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); 334 } else 335 addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); 336 } 337 } 338 } 339 340 Vector<BeginChar>* m_beginChars; 341 bool m_isCaseInsensitive; 342 }; 343 238 344 class RegexPatternConstructor { 239 345 public: … … 241 347 : m_pattern(pattern) 242 348 , m_characterClassConstructor(pattern.m_ignoreCase) 349 , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) 243 350 , m_invertParentheticalAssertion(false) 244 351 { … … 650 757 } 651 758 } 652 653 759 760 bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth) 761 { 762 if (term.quantityType == QuantifierFixedCount) { 763 beginTerms->append(TermChain(term)); 764 if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) 765 setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1); 766 } else if (termIndex != numTerms - 1) { 767 beginTerms->append(TermChain(term)); 768 return true; 769 } 770 771 return false; 772 } 773 774 // This function collects the terms which are potentially matching the first number of depth characters in the result. 775 // If this function returns false then it found at least one term which makes the beginning character 776 // look-up optimization inefficient. 777 bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth) 778 { 779 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 780 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 781 782 if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) 783 return false; 784 } 785 786 return true; 787 } 788 789 bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth) 790 { 791 bool checkNext = true; 792 unsigned numTerms = alternative->m_terms.size(); 793 794 while (checkNext && termIndex < numTerms) { 795 PatternTerm term = alternative->m_terms[termIndex]; 796 checkNext = false; 797 798 switch (term.type) { 799 case PatternTerm::TypeAssertionBOL: 800 case PatternTerm::TypeAssertionEOL: 801 case PatternTerm::TypeAssertionWordBoundary: 802 return false; 803 804 case PatternTerm::TypeBackReference: 805 case PatternTerm::TypeForwardReference: 806 return false; 807 808 case PatternTerm::TypePatternCharacter: 809 if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) { 810 termIndex++; 811 checkNext = true; 812 } 813 break; 814 815 case PatternTerm::TypeCharacterClass: 816 return false; 817 818 case PatternTerm::TypeParentheticalAssertion: 819 if (term.invertOrCapture) 820 return false; 821 822 case PatternTerm::TypeParenthesesSubpattern: 823 if (term.quantityType != QuantifierFixedCount) { 824 if (termIndex == numTerms - 1) 825 break; 826 827 termIndex++; 828 checkNext = true; 829 830 } 831 832 if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) 833 return false; 834 835 break; 836 } 837 } 838 839 return true; 840 } 841 842 void setupBeginChars() 843 { 844 Vector<TermChain> beginTerms; 845 bool containsFixedCharacter = false; 846 847 if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) 848 && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { 849 unsigned size = beginTerms.size(); 850 851 // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. 852 if (!size) 853 return; 854 855 m_pattern.m_containsBeginChars = true; 856 857 for (unsigned i = 0; i < size; i++) { 858 PatternTerm term = beginTerms[i].term; 859 860 // We have just collected PatternCharacter terms, other terms are not allowed. 861 ASSERT(term.type == PatternTerm::TypePatternCharacter); 862 863 if (term.quantityType == QuantifierFixedCount) 864 containsFixedCharacter = true; 865 866 UChar character = term.patternCharacter; 867 unsigned mask = 0; 868 869 if (character <= 0x7f) { 870 if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { 871 mask = 32; 872 character = toASCIILower(character); 873 } 874 875 m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 876 } else { 877 UChar upper, lower; 878 if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { 879 m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 880 m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 881 } else 882 m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 883 } 884 } 885 886 // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. 887 if (!containsFixedCharacter) { 888 m_pattern.m_containsBeginChars = false; 889 return; 890 } 891 892 size = m_pattern.m_beginChars.size(); 893 894 if (size > 2) 895 m_beginCharHelper.merge(size - 1); 896 else if (size <= 1) 897 m_pattern.m_containsBeginChars = false; 898 } 899 } 900 654 901 private: 655 902 RegexPattern& m_pattern; 656 903 PatternAlternative* m_alternative; 657 904 CharacterClassConstructor m_characterClassConstructor; 905 BeginCharHelper m_beginCharHelper; 658 906 bool m_invertCharacterClass; 659 907 bool m_invertParentheticalAssertion; … … 688 936 689 937 constructor.setupOffsets(); 938 constructor.setupBeginChars(); 690 939 691 940 return 0;
Note:
See TracChangeset
for help on using the changeset viewer.