Ignore:
Timestamp:
Dec 8, 2017, 12:32:42 PM (7 years ago)
Author:
[email protected]
Message:

YARR: JIT RegExps with greedy parenthesized sub patterns
https://bugs.webkit.org/show_bug.cgi?id=180538

Reviewed by JF Bastien.

This patch adds JIT support for regular expressions containing greedy counted
parenthesis. An example expression that couldn't be JIT'ed before is /q(a|b)*q/.

Just like in the interpreter, expressions with nested parenthetical subpatterns
require saving the results of previous matches of the parentheses contents along
with any associated state. This saved state is needed in the case that we need
to backtrack. This state is called ParenContext within the code space allocated
for this ParenContext is managed using a simple block allocator within the JIT'ed
code. The raw space managed by this allocator is passed into the JIT'ed function.

Since this fixed sized space may be exceeded, this patch adds a fallback mechanism.
If the JIT'ed code exhausts all its ParenContext space, it returns a new error
JSRegExpJITCodeFailure. The caller will then bytecompile and interpret the
expression.

Due to increased register usage by the parenthesis handling code, the use of
registers by the JIT engine was restructured, with registers used for Unicode
pattern matching replaced with constants.

Reworked some of the context structures that are used across the interpreter
and JIT implementations to make them a little more uniform and to handle the
needs of JIT'ing the new parentheses forms.

To help with development and debugging of this code, compiled patterns dumping
code was enhanced. Also added the ability to also dump interpreter ByteCodes.

  • runtime/RegExp.cpp:

(JSC::byteCodeCompilePattern):
(JSC::RegExp::byteCodeCompileIfNecessary):
(JSC::RegExp::compile):
(JSC::RegExp::compileMatchOnly):

  • runtime/RegExp.h:
  • runtime/RegExpInlines.h:

(JSC::RegExp::matchInline):

  • testRegExp.cpp:

(parseRegExpLine):
(runFromFiles):

  • yarr/Yarr.h:
  • yarr/YarrInterpreter.cpp:

(JSC::Yarr::ByteCompiler::compile):
(JSC::Yarr::ByteCompiler::dumpDisjunction):

  • yarr/YarrJIT.cpp:

(JSC::Yarr::YarrGenerator::ParenContextSizes::ParenContextSizes):
(JSC::Yarr::YarrGenerator::ParenContextSizes::numSubpatterns):
(JSC::Yarr::YarrGenerator::ParenContextSizes::frameSlots):
(JSC::Yarr::YarrGenerator::ParenContext::sizeFor):
(JSC::Yarr::YarrGenerator::ParenContext::nextOffset):
(JSC::Yarr::YarrGenerator::ParenContext::beginOffset):
(JSC::Yarr::YarrGenerator::ParenContext::matchAmountOffset):
(JSC::Yarr::YarrGenerator::ParenContext::subpatternOffset):
(JSC::Yarr::YarrGenerator::ParenContext::savedFrameOffset):
(JSC::Yarr::YarrGenerator::initParenContextFreeList):
(JSC::Yarr::YarrGenerator::allocatePatternContext):
(JSC::Yarr::YarrGenerator::freePatternContext):
(JSC::Yarr::YarrGenerator::savePatternContext):
(JSC::Yarr::YarrGenerator::restorePatternContext):
(JSC::Yarr::YarrGenerator::tryReadUnicodeCharImpl):
(JSC::Yarr::YarrGenerator::storeToFrame):
(JSC::Yarr::YarrGenerator::generateJITFailReturn):
(JSC::Yarr::YarrGenerator::clearMatches):
(JSC::Yarr::YarrGenerator::generate):
(JSC::Yarr::YarrGenerator::backtrack):
(JSC::Yarr::YarrGenerator::opCompileParenthesesSubpattern):
(JSC::Yarr::YarrGenerator::generateEnter):
(JSC::Yarr::YarrGenerator::generateReturn):
(JSC::Yarr::YarrGenerator::YarrGenerator):
(JSC::Yarr::YarrGenerator::compile):

  • yarr/YarrJIT.h:

(JSC::Yarr::YarrCodeBlock::execute):

  • yarr/YarrPattern.cpp:

(JSC::Yarr::indentForNestingLevel):
(JSC::Yarr::dumpUChar32):
(JSC::Yarr::dumpCharacterClass):
(JSC::Yarr::PatternTerm::dump):
(JSC::Yarr::YarrPattern::dumpPattern):

  • yarr/YarrPattern.h:

(JSC::Yarr::PatternTerm::containsAnyCaptures):
(JSC::Yarr::BackTrackInfoParenthesesOnce::returnAddressIndex):
(JSC::Yarr::BackTrackInfoParentheses::beginIndex):
(JSC::Yarr::BackTrackInfoParentheses::returnAddressIndex):
(JSC::Yarr::BackTrackInfoParentheses::matchAmountIndex):
(JSC::Yarr::BackTrackInfoParentheses::patternContextHeadIndex):
(JSC::Yarr::BackTrackInfoAlternative::offsetIndex): Deleted.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

    r225683 r225695  
    829829                term.frameLocation = currentCallFrameSize;
    830830                if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
    831                     if (term.quantityType != QuantifierFixedCount)
    832                         currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
     831                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
    833832                    error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
    834833                    if (error)
     
    846845                } else {
    847846                    term.inputPosition = currentInputPosition.unsafeGet();
    848                     unsigned ignoredCallFrameSize;
    849                     error = setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet(), ignoredCallFrameSize);
     847                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
     848                    error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
    850849                    if (error)
    851850                        return error;
    852                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
    853851                }
    854852                // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
     
    11861184}
    11871185
    1188 static void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
     1186void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
    11891187{
    11901188    out.print("    ");
     
    11931191}
    11941192
    1195 static void dumpUChar32(PrintStream& out, UChar32 c)
     1193void dumpUChar32(PrintStream& out, UChar32 c)
    11961194{
    11971195    if (c >= ' '&& c <= 0xff)
     
    11991197    else
    12001198        out.printf("0x%04x", c);
     1199}
     1200
     1201void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
     1202{
     1203    if (characterClass == pattern->anyCharacterClass())
     1204        out.print("<any character>");
     1205    else if (characterClass == pattern->newlineCharacterClass())
     1206        out.print("<newline>");
     1207    else if (characterClass == pattern->digitsCharacterClass())
     1208        out.print("<digits>");
     1209    else if (characterClass == pattern->spacesCharacterClass())
     1210        out.print("<whitespace>");
     1211    else if (characterClass == pattern->wordcharCharacterClass())
     1212        out.print("<word>");
     1213    else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
     1214        out.print("<unicode ignore case>");
     1215    else if (characterClass == pattern->nondigitsCharacterClass())
     1216        out.print("<non-digits>");
     1217    else if (characterClass == pattern->nonspacesCharacterClass())
     1218        out.print("<non-whitespace>");
     1219    else if (characterClass == pattern->nonwordcharCharacterClass())
     1220        out.print("<non-word>");
     1221    else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
     1222        out.print("<unicode non-ignore case>");
     1223    else {
     1224        bool needMatchesRangesSeperator = false;
     1225
     1226        auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
     1227            size_t matchesSize = matches.size();
     1228            if (matchesSize) {
     1229                if (needMatchesRangesSeperator)
     1230                    out.print(",");
     1231                needMatchesRangesSeperator = true;
     1232
     1233                out.print(prefix, ":(");
     1234                for (size_t i = 0; i < matchesSize; ++i) {
     1235                    if (i)
     1236                        out.print(",");
     1237                    dumpUChar32(out, matches[i]);
     1238                }
     1239                out.print(")");
     1240            }
     1241        };
     1242
     1243        auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
     1244            size_t rangeSize = ranges.size();
     1245            if (rangeSize) {
     1246                if (needMatchesRangesSeperator)
     1247                    out.print(",");
     1248                needMatchesRangesSeperator = true;
     1249
     1250                out.print(prefix, " ranges:(");
     1251                for (size_t i = 0; i < rangeSize; ++i) {
     1252                    if (i)
     1253                        out.print(",");
     1254                    CharacterRange range = ranges[i];
     1255                    out.print("(");
     1256                    dumpUChar32(out, range.begin);
     1257                    out.print("..");
     1258                    dumpUChar32(out, range.end);
     1259                    out.print(")");
     1260                }
     1261                out.print(")");
     1262            }
     1263        };
     1264
     1265        out.print("[");
     1266        dumpMatches("ASCII", characterClass->m_matches);
     1267        dumpRanges("ASCII", characterClass->m_ranges);
     1268        dumpMatches("Unicode", characterClass->m_matchesUnicode);
     1269        dumpRanges("Unicode", characterClass->m_rangesUnicode);
     1270        out.print("]");
     1271    }
    12011272}
    12021273
     
    12401311    indentForNestingLevel(out, nestingDepth);
    12411312
    1242     if (invert() && (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion))
    1243         out.print("not ");
     1313    if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
     1314        if (invert())
     1315            out.print("not ");
     1316    }
    12441317
    12451318    switch (type) {
     
    12551328    case TypePatternCharacter:
    12561329        out.printf("character ");
     1330        out.printf("inputPosition %u ", inputPosition);
    12571331        if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
    12581332            dumpUChar32(out, toASCIIUpper(patternCharacter));
     
    13761450            out.print(",terminal");
    13771451
    1378         if (quantityMaxCount != 1 || parentheses.isCopy || quantityType != QuantifierFixedCount)
    1379             out.println(",frame location ", frameLocation);
    1380         else
    1381             out.println();
     1452        out.println(",frame location ", frameLocation);
    13821453
    13831454        if (parentheses.disjunction->m_alternatives.size() > 1) {
    13841455            indentForNestingLevel(out, nestingDepth + 1);
    13851456            unsigned alternativeFrameLocation = frameLocation;
    1386             if (quantityType != QuantifierFixedCount)
     1457            if (quantityMaxCount == 1 && !parentheses.isCopy)
    13871458                alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
     1459            else if (parentheses.isTerminal)
     1460                alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
     1461            else
     1462                alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
    13881463            out.println("alternative list,frame location ", alternativeFrameLocation);
    13891464        }
     
    14621537    }
    14631538    out.print(":\n");
     1539    if (m_body->m_callFrameSize)
     1540        out.print("    callframe size: ", m_body->m_callFrameSize, "\n");
    14641541    m_body->dump(out, this);
    14651542}
Note: See TracChangeset for help on using the changeset viewer.