Ignore:
Timestamp:
May 23, 2011, 5:09:19 PM (14 years ago)
Author:
[email protected]
Message:

https://bugs.webkit.org/show_bug.cgi?id=61306

Reviewed by Geoff Garen.

The begin characters optimization currently has issues (#61129),
and does not appear to still be a performance win. The prudent
next step seems to be to disable while we ascertain whether this
is still a useful performance optimization.

  • yarr/YarrInterpreter.cpp:

(JSC::Yarr::Interpreter::matchDisjunction):
(JSC::Yarr::Interpreter::interpret):

  • yarr/YarrInterpreter.h:

(JSC::Yarr::BytecodePattern::BytecodePattern):

  • yarr/YarrPattern.cpp:

(JSC::Yarr::YarrPatternConstructor::YarrPatternConstructor):
(JSC::Yarr::YarrPattern::compile):
(JSC::Yarr::YarrPattern::YarrPattern):

  • yarr/YarrPattern.h:

(JSC::Yarr::YarrPattern::reset):

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

    r86547 r87109  
    235235};
    236236
    237 struct BeginCharHelper {
    238     BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
    239         : m_beginChars(beginChars)
    240         , m_isCaseInsensitive(isCaseInsensitive)
    241     {}
    242 
    243     void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
    244     {
    245         if (quantityType == QuantifierFixedCount && quantityCount > 1) {
    246             // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
    247             beginChar.value |= beginChar.value << 16;
    248             beginChar.mask |= beginChar.mask << 16;
    249             addCharacter(beginChar);
    250         } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
    251             // In case of characters with fixed quantifier we should check the next character as well.
    252             linkHotTerms(beginChar, hotTerms);
    253         else
    254             // In case of greedy matching the next character checking is unnecessary therefore we just store
    255             // the first character.
    256             addCharacter(beginChar);
    257     }
    258 
    259     // Merge two following BeginChars in the vector to reduce the number of character checks.
    260     void merge(unsigned size)
    261     {
    262         for (unsigned i = 0; i < size; i++) {
    263             BeginChar* curr = &m_beginChars->at(i);
    264             BeginChar* next = &m_beginChars->at(i + 1);
    265 
    266             // If the current and the next size of value is different we should skip the merge process
    267             // because the 16bit and 32bit values are unmergable.
    268             if (curr->value <= 0xFFFF && next->value > 0xFFFF)
    269                 continue;
    270 
    271             unsigned diff = curr->value ^ next->value;
    272 
    273             curr->mask |= diff;
    274             curr->value |= curr->mask;
    275 
    276             m_beginChars->remove(i + 1);
    277             size--;
    278         }
    279     }
    280 
    281 private:
    282     void addCharacter(BeginChar beginChar)
    283     {
    284         unsigned pos = 0;
    285         unsigned range = m_beginChars->size();
    286 
    287         // binary chop, find position to insert char.
    288         while (range) {
    289             unsigned index = range >> 1;
    290 
    291             int val = m_beginChars->at(pos+index).value - beginChar.value;
    292             if (!val)
    293                 return;
    294             if (val < 0)
    295                 range = index;
    296             else {
    297                 pos += (index+1);
    298                 range -= (index+1);
    299             }
    300         }
    301 
    302         if (pos == m_beginChars->size())
    303             m_beginChars->append(beginChar);
    304         else
    305             m_beginChars->insert(pos, beginChar);
    306     }
    307 
    308     // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
    309     void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
    310     {
    311         for (unsigned i = 0; i < hotTerms->size(); i++) {
    312             PatternTerm hotTerm = hotTerms->at(i).term;
    313             ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
    314 
    315             UChar characterNext = hotTerm.patternCharacter;
    316 
    317             // Append a character to an existing BeginChar object.
    318             if (characterNext <= 0x7f) {
    319                 unsigned mask = 0;
    320 
    321                 if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
    322                     mask = 32;
    323                     characterNext = toASCIILower(characterNext);
    324                 }
    325 
    326                 addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
    327             } else {
    328                 UChar upper, lower;
    329                 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
    330                     addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
    331                     addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
    332                 } else
    333                     addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
    334             }
    335         }
    336     }
    337 
    338     Vector<BeginChar>* m_beginChars;
    339     bool m_isCaseInsensitive;
    340 };
    341 
    342237class YarrPatternConstructor {
    343238public:
     
    345240        : m_pattern(pattern)
    346241        , m_characterClassConstructor(pattern.m_ignoreCase)
    347         , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
    348242        , m_invertParentheticalAssertion(false)
    349243    {
     
    782676    }
    783677
    784     // This function collects the terms which are potentially matching the first number of depth characters in the result.
    785     // If this function returns false then it found at least one term which makes the beginning character
    786     // look-up optimization inefficient.
    787     bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
    788     {
    789         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
    790             PatternAlternative* alternative = disjunction->m_alternatives[alt];
    791 
    792             if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
    793                 return false;
    794         }
    795 
    796         return true;
    797     }
    798 
    799     bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
    800     {
    801         bool checkNext = true;
    802         unsigned numTerms = alternative->m_terms.size();
    803 
    804         while (checkNext && termIndex < numTerms) {
    805             PatternTerm term = alternative->m_terms[termIndex];
    806             checkNext = false;
    807 
    808             switch (term.type) {
    809             case PatternTerm::TypeAssertionBOL:
    810             case PatternTerm::TypeAssertionEOL:
    811             case PatternTerm::TypeAssertionWordBoundary:
    812                 return false;
    813 
    814             case PatternTerm::TypeBackReference:
    815             case PatternTerm::TypeForwardReference:
    816                 return false;
    817 
    818             case PatternTerm::TypePatternCharacter:
    819                 if (termIndex != numTerms - 1) {
    820                     beginTerms->append(TermChain(term));
    821                     termIndex++;
    822                     checkNext = true;
    823                 } else if (term.quantityType == QuantifierFixedCount) {
    824                     beginTerms->append(TermChain(term));
    825                     if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
    826                         if (!setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1))
    827                             return false;
    828                 }
    829 
    830                 break;
    831 
    832             case PatternTerm::TypeCharacterClass:
    833                 return false;
    834 
    835             case PatternTerm::TypeParentheticalAssertion:
    836                 if (term.invert())
    837                     return false;
    838 
    839             case PatternTerm::TypeParenthesesSubpattern:
    840                 if (term.quantityType != QuantifierFixedCount) {
    841                     if (termIndex == numTerms - 1)
    842                         break;
    843 
    844                     termIndex++;
    845                     checkNext = true;
    846                 }
    847 
    848                 if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
    849                     return false;
    850 
    851                 break;
    852             }
    853         }
    854 
    855         return true;
    856     }
    857 
    858     void setupBeginChars()
    859     {
    860         Vector<TermChain> beginTerms;
    861         bool containsFixedCharacter = false;
    862 
    863         if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
    864                 && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
    865             unsigned size = beginTerms.size();
    866 
    867             // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
    868             if (!size)
    869                 return;
    870 
    871             m_pattern.m_containsBeginChars = true;
    872 
    873             for (unsigned i = 0; i < size; i++) {
    874                 PatternTerm term = beginTerms[i].term;
    875 
    876                 // We have just collected PatternCharacter terms, other terms are not allowed.
    877                 ASSERT(term.type == PatternTerm::TypePatternCharacter);
    878 
    879                 if (term.quantityType == QuantifierFixedCount)
    880                     containsFixedCharacter = true;
    881 
    882                 UChar character = term.patternCharacter;
    883                 unsigned mask = 0;
    884 
    885                 if (character <= 0x7f) {
    886                     if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
    887                         mask = 32;
    888                         character = toASCIILower(character);
    889                     }
    890 
    891                     m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    892                 } else {
    893                     UChar upper, lower;
    894                     if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
    895                         m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    896                         m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    897                     } else
    898                         m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    899                 }
    900             }
    901 
    902             // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
    903             if (!containsFixedCharacter) {
    904                 m_pattern.m_containsBeginChars = false;
    905                 return;
    906             }
    907 
    908             size = m_pattern.m_beginChars.size();
    909 
    910             if (size > 2)
    911                 m_beginCharHelper.merge(size - 1);
    912             else if (size <= 1)
    913                 m_pattern.m_containsBeginChars = false;
    914         }
    915     }
    916 
    917678private:
    918679    YarrPattern& m_pattern;
    919680    PatternAlternative* m_alternative;
    920681    CharacterClassConstructor m_characterClassConstructor;
    921     BeginCharHelper m_beginCharHelper;
    922682    bool m_invertCharacterClass;
    923683    bool m_invertParentheticalAssertion;
     
    952712       
    953713    constructor.setupOffsets();
    954     constructor.setupBeginChars();
    955714
    956715    return 0;
     
    961720    , m_multiline(multiline)
    962721    , m_containsBackreferences(false)
    963     , m_containsBeginChars(false)
    964722    , m_containsBOL(false)
    965723    , m_numSubpatterns(0)
Note: See TracChangeset for help on using the changeset viewer.