Ignore:
Timestamp:
Nov 17, 2010, 1:42:41 AM (15 years ago)
Author:
[email protected]
Message:

2010-11-17 Peter Varga <[email protected]>

Reviewed by Gavin Barraclough.

Collect the beginning characters in a RegExp pattern for look-up
optimization
https://bugs.webkit.org/show_bug.cgi?id=45748

Extend the YARR's parser with an algorithm which collects the potential
beginning characters from a RegExp pattern for later look-up optimization.

  • yarr/RegexCompiler.cpp: (JSC::Yarr::BeginCharHelper::BeginCharHelper): (JSC::Yarr::BeginCharHelper::addBeginChar): (JSC::Yarr::BeginCharHelper::merge): (JSC::Yarr::BeginCharHelper::addCharacter): (JSC::Yarr::BeginCharHelper::linkHotTerms): (JSC::Yarr::RegexPatternConstructor::RegexPatternConstructor): (JSC::Yarr::RegexPatternConstructor::addBeginTerm): (JSC::Yarr::RegexPatternConstructor::setupDisjunctionBeginTerms): (JSC::Yarr::RegexPatternConstructor::setupAlternativeBeginTerms): (JSC::Yarr::RegexPatternConstructor::setupBeginChars): (JSC::Yarr::compileRegex):
  • yarr/RegexPattern.h: (JSC::Yarr::TermChain::TermChain): (JSC::Yarr::BeginChar::BeginChar): (JSC::Yarr::RegexPattern::RegexPattern): (JSC::Yarr::RegexPattern::reset):
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/yarr/RegexCompiler.cpp

    r67869 r72180  
    11/*
    22 * Copyright (C) 2009 Apple Inc. All rights reserved.
     3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged
    34 *
    45 * Redistribution and use in source and binary forms, with or without
     
    236237};
    237238
     239struct BeginCharHelper {
     240    BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
     241        : m_beginChars(beginChars)
     242        , m_isCaseInsensitive(isCaseInsensitive)
     243    {}
     244
     245    void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
     246    {
     247        if (quantityType == QuantifierFixedCount && quantityCount > 1) {
     248            // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
     249            beginChar.value |= beginChar.value << 16;
     250            beginChar.mask |= beginChar.mask << 16;
     251            addCharacter(beginChar);
     252        } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
     253            // In case of characters with fixed quantifier we should check the next character as well.
     254            linkHotTerms(beginChar, hotTerms);
     255        else
     256            // In case of greedy matching the next character checking is unnecessary therefore we just store
     257            // the first character.
     258            addCharacter(beginChar);
     259    }
     260
     261    // Merge two following BeginChars in the vector to reduce the number of character checks.
     262    void merge(unsigned size)
     263    {
     264        for (unsigned i = 0; i < size; i++) {
     265            BeginChar* curr = &m_beginChars->at(i);
     266            BeginChar* next = &m_beginChars->at(i + 1);
     267
     268            // If the current and the next size of value is different we should skip the merge process
     269            // because the 16bit and 32bit values are unmergable.
     270            if (curr->value <= 0xFFFF && next->value > 0xFFFF)
     271                continue;
     272
     273            unsigned diff = curr->value ^ next->value;
     274
     275            curr->mask |= diff;
     276            curr->value |= curr->mask;
     277
     278            m_beginChars->remove(i + 1);
     279            size--;
     280        }
     281    }
     282
     283private:
     284    void addCharacter(BeginChar beginChar)
     285    {
     286        unsigned pos = 0;
     287        unsigned range = m_beginChars->size();
     288
     289        // binary chop, find position to insert char.
     290        while (range) {
     291            unsigned index = range >> 1;
     292
     293            int val = m_beginChars->at(pos+index).value - beginChar.value;
     294            if (!val)
     295                return;
     296            if (val < 0)
     297                range = index;
     298            else {
     299                pos += (index+1);
     300                range -= (index+1);
     301            }
     302        }
     303
     304        if (pos == m_beginChars->size())
     305            m_beginChars->append(beginChar);
     306        else
     307            m_beginChars->insert(pos, beginChar);
     308    }
     309
     310    // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
     311    void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
     312    {
     313        for (unsigned i = 0; i < hotTerms->size(); i++) {
     314            PatternTerm hotTerm = hotTerms->at(i).term;
     315            ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
     316
     317            UChar characterNext = hotTerm.patternCharacter;
     318
     319            // Append a character to an existing BeginChar object.
     320            if (characterNext <= 0x7f) {
     321                unsigned mask = 0;
     322
     323                if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
     324                    mask = 32;
     325                    characterNext = toASCIILower(characterNext);
     326                }
     327
     328                addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
     329            } else {
     330                UChar upper, lower;
     331                if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
     332                    addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
     333                    addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
     334                } else
     335                    addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
     336            }
     337        }
     338    }
     339
     340    Vector<BeginChar>* m_beginChars;
     341    bool m_isCaseInsensitive;
     342};
     343
    238344class RegexPatternConstructor {
    239345public:
     
    241347        : m_pattern(pattern)
    242348        , m_characterClassConstructor(pattern.m_ignoreCase)
     349        , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
    243350        , m_invertParentheticalAssertion(false)
    244351    {
     
    650757        }
    651758    }
    652    
    653    
     759
     760    bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth)
     761    {
     762        if (term.quantityType == QuantifierFixedCount) {
     763            beginTerms->append(TermChain(term));
     764            if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
     765                setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1);
     766        } else if (termIndex != numTerms - 1) {
     767            beginTerms->append(TermChain(term));
     768            return true;
     769        }
     770
     771        return false;
     772    }
     773
     774    // This function collects the terms which are potentially matching the first number of depth characters in the result.
     775    // If this function returns false then it found at least one term which makes the beginning character
     776    // look-up optimization inefficient.
     777    bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
     778    {
     779        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
     780            PatternAlternative* alternative = disjunction->m_alternatives[alt];
     781
     782            if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
     783                return false;
     784        }
     785
     786        return true;
     787    }
     788
     789    bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
     790    {
     791        bool checkNext = true;
     792        unsigned numTerms = alternative->m_terms.size();
     793
     794        while (checkNext && termIndex < numTerms) {
     795            PatternTerm term = alternative->m_terms[termIndex];
     796            checkNext = false;
     797
     798            switch (term.type) {
     799            case PatternTerm::TypeAssertionBOL:
     800            case PatternTerm::TypeAssertionEOL:
     801            case PatternTerm::TypeAssertionWordBoundary:
     802                return false;
     803
     804            case PatternTerm::TypeBackReference:
     805            case PatternTerm::TypeForwardReference:
     806                return false;
     807
     808            case PatternTerm::TypePatternCharacter:
     809                if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) {
     810                    termIndex++;
     811                    checkNext = true;
     812                }
     813                break;
     814
     815            case PatternTerm::TypeCharacterClass:
     816                return false;
     817
     818            case PatternTerm::TypeParentheticalAssertion:
     819                if (term.invertOrCapture)
     820                    return false;
     821
     822            case PatternTerm::TypeParenthesesSubpattern:
     823                if (term.quantityType != QuantifierFixedCount) {
     824                    if (termIndex == numTerms - 1)
     825                        break;
     826
     827                    termIndex++;
     828                    checkNext = true;
     829
     830                }
     831
     832                if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
     833                    return false;
     834
     835                break;
     836            }
     837        }
     838
     839        return true;
     840    }
     841
     842    void setupBeginChars()
     843    {
     844        Vector<TermChain> beginTerms;
     845        bool containsFixedCharacter = false;
     846
     847        if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
     848                && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
     849            unsigned size = beginTerms.size();
     850
     851            // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
     852            if (!size)
     853                return;
     854
     855            m_pattern.m_containsBeginChars = true;
     856
     857            for (unsigned i = 0; i < size; i++) {
     858                PatternTerm term = beginTerms[i].term;
     859
     860                // We have just collected PatternCharacter terms, other terms are not allowed.
     861                ASSERT(term.type == PatternTerm::TypePatternCharacter);
     862
     863                if (term.quantityType == QuantifierFixedCount)
     864                    containsFixedCharacter = true;
     865
     866                UChar character = term.patternCharacter;
     867                unsigned mask = 0;
     868
     869                if (character <= 0x7f) {
     870                    if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
     871                        mask = 32;
     872                        character = toASCIILower(character);
     873                    }
     874
     875                    m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
     876                } else {
     877                    UChar upper, lower;
     878                    if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
     879                        m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
     880                        m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
     881                    } else
     882                        m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
     883                }
     884            }
     885
     886            // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
     887            if (!containsFixedCharacter) {
     888                m_pattern.m_containsBeginChars = false;
     889                return;
     890            }
     891
     892            size = m_pattern.m_beginChars.size();
     893
     894            if (size > 2)
     895                m_beginCharHelper.merge(size - 1);
     896            else if (size <= 1)
     897                m_pattern.m_containsBeginChars = false;
     898        }
     899    }
     900
    654901private:
    655902    RegexPattern& m_pattern;
    656903    PatternAlternative* m_alternative;
    657904    CharacterClassConstructor m_characterClassConstructor;
     905    BeginCharHelper m_beginCharHelper;
    658906    bool m_invertCharacterClass;
    659907    bool m_invertParentheticalAssertion;
     
    688936       
    689937    constructor.setupOffsets();
     938    constructor.setupBeginChars();
    690939
    691940    return 0;
Note: See TracChangeset for help on using the changeset viewer.