Ignore:
Timestamp:
Dec 8, 2017, 12:32:42 PM (7 years ago)
Author:
[email protected]
Message:

YARR: JIT RegExps with greedy parenthesized sub patterns
https://bugs.webkit.org/show_bug.cgi?id=180538

Reviewed by JF Bastien.

This patch adds JIT support for regular expressions containing greedy counted
parenthesis. An example expression that couldn't be JIT'ed before is /q(a|b)*q/.

Just like in the interpreter, expressions with nested parenthetical subpatterns
require saving the results of previous matches of the parentheses contents along
with any associated state. This saved state is needed in the case that we need
to backtrack. This state is called ParenContext within the code space allocated
for this ParenContext is managed using a simple block allocator within the JIT'ed
code. The raw space managed by this allocator is passed into the JIT'ed function.

Since this fixed sized space may be exceeded, this patch adds a fallback mechanism.
If the JIT'ed code exhausts all its ParenContext space, it returns a new error
JSRegExpJITCodeFailure. The caller will then bytecompile and interpret the
expression.

Due to increased register usage by the parenthesis handling code, the use of
registers by the JIT engine was restructured, with registers used for Unicode
pattern matching replaced with constants.

Reworked some of the context structures that are used across the interpreter
and JIT implementations to make them a little more uniform and to handle the
needs of JIT'ing the new parentheses forms.

To help with development and debugging of this code, compiled patterns dumping
code was enhanced. Also added the ability to also dump interpreter ByteCodes.

  • runtime/RegExp.cpp:

(JSC::byteCodeCompilePattern):
(JSC::RegExp::byteCodeCompileIfNecessary):
(JSC::RegExp::compile):
(JSC::RegExp::compileMatchOnly):

  • runtime/RegExp.h:
  • runtime/RegExpInlines.h:

(JSC::RegExp::matchInline):

  • testRegExp.cpp:

(parseRegExpLine):
(runFromFiles):

  • yarr/Yarr.h:
  • yarr/YarrInterpreter.cpp:

(JSC::Yarr::ByteCompiler::compile):
(JSC::Yarr::ByteCompiler::dumpDisjunction):

  • yarr/YarrJIT.cpp:

(JSC::Yarr::YarrGenerator::ParenContextSizes::ParenContextSizes):
(JSC::Yarr::YarrGenerator::ParenContextSizes::numSubpatterns):
(JSC::Yarr::YarrGenerator::ParenContextSizes::frameSlots):
(JSC::Yarr::YarrGenerator::ParenContext::sizeFor):
(JSC::Yarr::YarrGenerator::ParenContext::nextOffset):
(JSC::Yarr::YarrGenerator::ParenContext::beginOffset):
(JSC::Yarr::YarrGenerator::ParenContext::matchAmountOffset):
(JSC::Yarr::YarrGenerator::ParenContext::subpatternOffset):
(JSC::Yarr::YarrGenerator::ParenContext::savedFrameOffset):
(JSC::Yarr::YarrGenerator::initParenContextFreeList):
(JSC::Yarr::YarrGenerator::allocatePatternContext):
(JSC::Yarr::YarrGenerator::freePatternContext):
(JSC::Yarr::YarrGenerator::savePatternContext):
(JSC::Yarr::YarrGenerator::restorePatternContext):
(JSC::Yarr::YarrGenerator::tryReadUnicodeCharImpl):
(JSC::Yarr::YarrGenerator::storeToFrame):
(JSC::Yarr::YarrGenerator::generateJITFailReturn):
(JSC::Yarr::YarrGenerator::clearMatches):
(JSC::Yarr::YarrGenerator::generate):
(JSC::Yarr::YarrGenerator::backtrack):
(JSC::Yarr::YarrGenerator::opCompileParenthesesSubpattern):
(JSC::Yarr::YarrGenerator::generateEnter):
(JSC::Yarr::YarrGenerator::generateReturn):
(JSC::Yarr::YarrGenerator::YarrGenerator):
(JSC::Yarr::YarrGenerator::compile):

  • yarr/YarrJIT.h:

(JSC::Yarr::YarrCodeBlock::execute):

  • yarr/YarrPattern.cpp:

(JSC::Yarr::indentForNestingLevel):
(JSC::Yarr::dumpUChar32):
(JSC::Yarr::dumpCharacterClass):
(JSC::Yarr::PatternTerm::dump):
(JSC::Yarr::YarrPattern::dumpPattern):

  • yarr/YarrPattern.h:

(JSC::Yarr::PatternTerm::containsAnyCaptures):
(JSC::Yarr::BackTrackInfoParenthesesOnce::returnAddressIndex):
(JSC::Yarr::BackTrackInfoParentheses::beginIndex):
(JSC::Yarr::BackTrackInfoParentheses::returnAddressIndex):
(JSC::Yarr::BackTrackInfoParentheses::matchAmountIndex):
(JSC::Yarr::BackTrackInfoParentheses::patternContextHeadIndex):
(JSC::Yarr::BackTrackInfoAlternative::offsetIndex): Deleted.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp

    r225683 r225695  
    2828#include "YarrInterpreter.h"
    2929
     30#include "Options.h"
    3031#include "SuperSampler.h"
    3132#include "Yarr.h"
     
    16701671        regexEnd();
    16711672
     1673#ifndef NDEBUG
     1674        if (Options::dumpCompiledRegExpPatterns())
     1675            dumpDisjunction(m_bodyDisjunction.get());
     1676#endif
     1677
    16721678        return std::make_unique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock);
    16731679    }
     
    18291835        return beginTerm;
    18301836    }
    1831 
    1832 #ifndef NDEBUG
    1833     void dumpDisjunction(ByteDisjunction* disjunction)
    1834     {
    1835         dataLogF("ByteDisjunction(%p):\n\t", disjunction);
    1836         for (unsigned i = 0; i < disjunction->terms.size(); ++i)
    1837             dataLogF("{ %d } ", disjunction->terms[i].type);
    1838         dataLogF("\n");
    1839     }
    1840 #endif
    18411837
    18421838    void closeAlternative(int beginTerm)
     
    21122108        }
    21132109    }
     2110#ifndef NDEBUG
     2111    void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0)
     2112    {
     2113        PrintStream& out = WTF::dataFile();
     2114
     2115        unsigned termIndexNest = 0;
     2116
     2117        if (!nesting) {
     2118            out.printf("ByteDisjunction(%p):\n", disjunction);
     2119            nesting = 1;
     2120        } else {
     2121            termIndexNest = nesting - 1;
     2122            nesting = 2;
     2123        }
     2124
     2125        auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) {
     2126            for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++)
     2127                out.print("  ");
     2128            out.printf("%4lu", index);
     2129            for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++)
     2130                out.print("  ");
     2131        };
     2132
     2133        auto dumpQuantity = [&](ByteTerm& term) {
     2134            if (term.atom.quantityType == QuantifierFixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1)
     2135                return;
     2136
     2137            out.print(" {", term.atom.quantityMinCount);
     2138            if (term.atom.quantityMinCount != term.atom.quantityMaxCount) {
     2139                if (term.atom.quantityMaxCount == UINT_MAX)
     2140                    out.print(",inf");
     2141                else
     2142                    out.print(",", term.atom.quantityMaxCount);
     2143            }
     2144            out.print("}");
     2145            if (term.atom.quantityType == QuantifierGreedy)
     2146                out.print(" greedy");
     2147            else if (term.atom.quantityType == QuantifierNonGreedy)
     2148                out.print(" non-greedy");
     2149        };
     2150
     2151        auto dumpCaptured = [&](ByteTerm& term) {
     2152            if (term.capture())
     2153                out.print(" captured (#", term.atom.subpatternId, ")");
     2154        };
     2155
     2156        auto dumpInverted = [&](ByteTerm& term) {
     2157            if (term.invert())
     2158                out.print(" inverted");
     2159        };
     2160
     2161        auto dumpInputPosition = [&](ByteTerm& term) {
     2162            out.printf(" inputPosition %u", term.inputPosition);
     2163        };
     2164
     2165        auto dumpCharacter = [&](ByteTerm& term) {
     2166            out.print(" ");
     2167            dumpUChar32(out, term.atom.patternCharacter);
     2168        };
     2169
     2170        auto dumpCharClass = [&](ByteTerm& term) {
     2171            out.print(" ");
     2172            dumpCharacterClass(out, &m_pattern, term.atom.characterClass);
     2173        };
     2174
     2175        for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) {
     2176            ByteTerm term = disjunction->terms[idx];
     2177
     2178            bool outputNewline = true;
     2179
     2180            switch (term.type) {
     2181            case ByteTerm::TypeBodyAlternativeBegin:
     2182                outputTermIndexAndNest(idx, nesting++);
     2183                out.print("BodyAlternativeBegin");
     2184                if (term.alternative.onceThrough)
     2185                    out.print(" onceThrough");
     2186                break;
     2187            case ByteTerm::TypeBodyAlternativeDisjunction:
     2188                outputTermIndexAndNest(idx, nesting - 1);
     2189                out.print("BodyAlternativeDisjunction");
     2190                break;
     2191            case ByteTerm::TypeBodyAlternativeEnd:
     2192                outputTermIndexAndNest(idx, --nesting);
     2193                out.print("BodyAlternativeEnd");
     2194                break;
     2195            case ByteTerm::TypeAlternativeBegin:
     2196                outputTermIndexAndNest(idx, nesting++);
     2197                out.print("AlternativeBegin");
     2198                break;
     2199            case ByteTerm::TypeAlternativeDisjunction:
     2200                outputTermIndexAndNest(idx, nesting - 1);
     2201                out.print("AlternativeDisjunction");
     2202                break;
     2203            case ByteTerm::TypeAlternativeEnd:
     2204                outputTermIndexAndNest(idx, --nesting);
     2205                out.print("AlternativeEnd");
     2206                break;
     2207            case ByteTerm::TypeSubpatternBegin:
     2208                outputTermIndexAndNest(idx, nesting++);
     2209                out.print("SubpatternBegin");
     2210                break;
     2211            case ByteTerm::TypeSubpatternEnd:
     2212                outputTermIndexAndNest(idx, --nesting);
     2213                out.print("SubpatternEnd");
     2214                break;
     2215            case ByteTerm::TypeAssertionBOL:
     2216                outputTermIndexAndNest(idx, nesting);
     2217                out.print("AssertionBOL");
     2218                break;
     2219            case ByteTerm::TypeAssertionEOL:
     2220                outputTermIndexAndNest(idx, nesting);
     2221                out.print("AssertionEOL");
     2222                break;
     2223            case ByteTerm::TypeAssertionWordBoundary:
     2224                outputTermIndexAndNest(idx, nesting);
     2225                out.print("AssertionWordBoundary");
     2226                break;
     2227            case ByteTerm::TypePatternCharacterOnce:
     2228                outputTermIndexAndNest(idx, nesting);
     2229                out.print("PatternCharacterOnce");
     2230                dumpInverted(term);
     2231                dumpInputPosition(term);
     2232                dumpCharacter(term);
     2233                dumpQuantity(term);
     2234                break;
     2235            case ByteTerm::TypePatternCharacterFixed:
     2236                outputTermIndexAndNest(idx, nesting);
     2237                out.print("PatternCharacterFixed");
     2238                dumpInverted(term);
     2239                dumpInputPosition(term);
     2240                dumpCharacter(term);
     2241                out.print(" {", term.atom.quantityMinCount, "}");
     2242                break;
     2243            case ByteTerm::TypePatternCharacterGreedy:
     2244                outputTermIndexAndNest(idx, nesting);
     2245                out.print("PatternCharacterGreedy");
     2246                dumpInverted(term);
     2247                dumpInputPosition(term);
     2248                dumpCharacter(term);
     2249                dumpQuantity(term);
     2250                break;
     2251            case ByteTerm::TypePatternCharacterNonGreedy:
     2252                outputTermIndexAndNest(idx, nesting);
     2253                out.print("PatternCharacterNonGreedy");
     2254                dumpInverted(term);
     2255                dumpInputPosition(term);
     2256                dumpCharacter(term);
     2257                dumpQuantity(term);
     2258                break;
     2259            case ByteTerm::TypePatternCasedCharacterOnce:
     2260                outputTermIndexAndNest(idx, nesting);
     2261                out.print("PatternCasedCharacterOnce");
     2262                break;
     2263            case ByteTerm::TypePatternCasedCharacterFixed:
     2264                outputTermIndexAndNest(idx, nesting);
     2265                out.print("PatternCasedCharacterFixed");
     2266                break;
     2267            case ByteTerm::TypePatternCasedCharacterGreedy:
     2268                outputTermIndexAndNest(idx, nesting);
     2269                out.print("PatternCasedCharacterGreedy");
     2270                break;
     2271            case ByteTerm::TypePatternCasedCharacterNonGreedy:
     2272                outputTermIndexAndNest(idx, nesting);
     2273                out.print("PatternCasedCharacterNonGreedy");
     2274                break;
     2275            case ByteTerm::TypeCharacterClass:
     2276                outputTermIndexAndNest(idx, nesting);
     2277                out.print("CharacterClass");
     2278                dumpInverted(term);
     2279                dumpInputPosition(term);
     2280                dumpCharClass(term);
     2281                dumpQuantity(term);
     2282                break;
     2283            case ByteTerm::TypeBackReference:
     2284                outputTermIndexAndNest(idx, nesting);
     2285                out.print("BackReference #", term.atom.subpatternId);
     2286                dumpQuantity(term);
     2287                break;
     2288            case ByteTerm::TypeParenthesesSubpattern:
     2289                outputTermIndexAndNest(idx, nesting);
     2290                out.print("ParenthesesSubpattern");
     2291                dumpCaptured(term);
     2292                dumpInverted(term);
     2293                dumpInputPosition(term);
     2294                dumpQuantity(term);
     2295                out.print("\n");
     2296                outputNewline = false;
     2297                dumpDisjunction(term.atom.parenthesesDisjunction, nesting);
     2298                break;
     2299            case ByteTerm::TypeParenthesesSubpatternOnceBegin:
     2300                outputTermIndexAndNest(idx, nesting++);
     2301                out.print("ParenthesesSubpatternOnceBegin");
     2302                dumpCaptured(term);
     2303                dumpInverted(term);
     2304                dumpInputPosition(term);
     2305                break;
     2306            case ByteTerm::TypeParenthesesSubpatternOnceEnd:
     2307                outputTermIndexAndNest(idx, --nesting);
     2308                out.print("ParenthesesSubpatternOnceEnd");
     2309                break;
     2310            case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
     2311                outputTermIndexAndNest(idx, nesting++);
     2312                out.print("ParenthesesSubpatternTerminalBegin");
     2313                dumpInverted(term);
     2314                dumpInputPosition(term);
     2315                break;
     2316            case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
     2317                outputTermIndexAndNest(idx, --nesting);
     2318                out.print("ParenthesesSubpatternTerminalEnd");
     2319                break;
     2320            case ByteTerm::TypeParentheticalAssertionBegin:
     2321                outputTermIndexAndNest(idx, nesting++);
     2322                out.print("ParentheticalAssertionBegin");
     2323                dumpInverted(term);
     2324                dumpInputPosition(term);
     2325                break;
     2326            case ByteTerm::TypeParentheticalAssertionEnd:
     2327                outputTermIndexAndNest(idx, --nesting);
     2328                out.print("ParentheticalAssertionEnd");
     2329                break;
     2330            case ByteTerm::TypeCheckInput:
     2331                outputTermIndexAndNest(idx, nesting);
     2332                out.print("CheckInput ", term.checkInputCount);
     2333                break;
     2334            case ByteTerm::TypeUncheckInput:
     2335                outputTermIndexAndNest(idx, nesting);
     2336                out.print("UncheckInput ", term.checkInputCount);
     2337                break;
     2338            case ByteTerm::TypeDotStarEnclosure:
     2339                outputTermIndexAndNest(idx, nesting);
     2340                out.print("DotStarEnclosure");
     2341                break;
     2342            }
     2343            if (outputNewline)
     2344                out.print("\n");
     2345        }
     2346    }
     2347#endif
    21142348
    21152349private:
     
    21532387COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
    21542388COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
    2155 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
     2389COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
    21562390
    21572391
Note: See TracChangeset for help on using the changeset viewer.