Ignore:
Timestamp:
Nov 4, 2007, 1:28:22 AM (18 years ago)
Author:
Darin Adler
Message:

Reviewed by Maciej.

SunSpider says it's 2.6% faster overall, 32.5% in the regular expression tests.

  • pcre/pcre_internal.h: Added OP_ASCII_CHAR and OP_ASCII_LETTER_NC.
  • pcre/pcre_compile.c: (find_fixedlength): Added cases for OP_ASCII_CHAR and OP_ASCII_LETTER_NC. Also added OP_NOT since there was no reason it should not be in here. (could_be_empty_branch): Ditto. (compile_branch): Streamlined all the single-character cases; there was a bit of duplicate code. Added cases for OP_ASCII_CHAR and OP_ASCII_LETTER_NC as needed. But in particular, compile to those opcodes when the single character match is ASCII. (find_firstassertedchar): Added cases for OP_ASCII_CHAR and OP_ASCII_LETTER_NC.
  • pcre/pcre_exec.c: (match): Removed the "min", "minimize", and "op" fields from the matchframe, after I discovered that none of them needed to be saved and restored across recursive match calls. Also eliminated the ignored result field from the matchframe, since I discovered that rrc ("recursive result code") was already the exact same thing. Moved the handling of opcodes higher than OP_BRA into the default statement of the switch instead of doing them before the switch. This removes a branch from each iteration of the opcode interpreter, just as removal of "op" removed at least one store from each iteration. Last, but not least, add the OP_ASCII_CHAR and OP_ASCII_LETTER_NC functions. Neither can ever match a surrogate pair and the letter case can be handled efficiently.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/pcre/pcre_internal.h

    r27420 r27422  
    366366
    367367enum {
    368   OP_END,            /* 0 End of pattern */
     368  OP_END,                   /* End of pattern */
    369369
    370370  /* Values corresponding to backslashed metacharacters */
    371371
    372   OP_NOT_WORD_BOUNDARY,  /*  3 \B */
    373   OP_WORD_BOUNDARY,      /*  4 \b */
    374   OP_NOT_DIGIT,          /*  5 \D */
    375   OP_DIGIT,              /*  6 \d */
    376   OP_NOT_WHITESPACE,     /*  7 \S */
    377   OP_WHITESPACE,         /*  8 \s */
    378   OP_NOT_WORDCHAR,       /*  9 \W */
    379   OP_WORDCHAR,           /* 10 \w */
    380 
    381   OP_ANY,            /* 11 Match any character */
    382 
    383   OP_CIRC,           /* 19 Start of line - varies with multiline switch */
    384   OP_DOLL,           /* 20 End of line - varies with multiline switch */
    385   OP_CHAR,           /* 21 Match one character, casefully */
    386   OP_CHARNC,         /* 22 Match one character, caselessly */
    387   OP_NOT,            /* 23 Match anything but the following char */
    388 
    389   OP_STAR,           /* 24 The maximizing and minimizing versions of */
    390   OP_MINSTAR,        /* 25 all these opcodes must come in pairs, with */
    391   OP_PLUS,           /* 26 the minimizing one second. */
    392   OP_MINPLUS,        /* 27 This first set applies to single characters */
    393   OP_QUERY,          /* 28 */
    394   OP_MINQUERY,       /* 29 */
    395   OP_UPTO,           /* 30 From 0 to n matches */
    396   OP_MINUPTO,        /* 31 */
    397   OP_EXACT,          /* 32 Exactly n matches */
    398 
    399   OP_NOTSTAR,        /* 33 The maximizing and minimizing versions of */
    400   OP_NOTMINSTAR,     /* 34 all these opcodes must come in pairs, with */
    401   OP_NOTPLUS,        /* 35 the minimizing one second. */
    402   OP_NOTMINPLUS,     /* 36 This set applies to "not" single characters */
    403   OP_NOTQUERY,       /* 37 */
    404   OP_NOTMINQUERY,    /* 38 */
    405   OP_NOTUPTO,        /* 39 From 0 to n matches */
    406   OP_NOTMINUPTO,     /* 40 */
    407   OP_NOTEXACT,       /* 41 Exactly n matches */
    408 
    409   OP_TYPESTAR,       /* 42 The maximizing and minimizing versions of */
    410   OP_TYPEMINSTAR,    /* 43 all these opcodes must come in pairs, with */
    411   OP_TYPEPLUS,       /* 44 the minimizing one second. These codes must */
    412   OP_TYPEMINPLUS,    /* 45 be in exactly the same order as those above. */
    413   OP_TYPEQUERY,      /* 46 This set applies to character types such as \d */
    414   OP_TYPEMINQUERY,   /* 47 */
    415   OP_TYPEUPTO,       /* 48 From 0 to n matches */
    416   OP_TYPEMINUPTO,    /* 49 */
    417   OP_TYPEEXACT,      /* 50 Exactly n matches */
    418 
    419   OP_CRSTAR,         /* 51 The maximizing and minimizing versions of */
    420   OP_CRMINSTAR,      /* 52 all these opcodes must come in pairs, with */
    421   OP_CRPLUS,         /* 53 the minimizing one second. These codes must */
    422   OP_CRMINPLUS,      /* 54 be in exactly the same order as those above. */
    423   OP_CRQUERY,        /* 55 These are for character classes and back refs */
    424   OP_CRMINQUERY,     /* 56 */
    425   OP_CRRANGE,        /* 57 These are different to the three sets above. */
    426   OP_CRMINRANGE,     /* 58 */
    427 
    428   OP_CLASS,          /* 59 Match a character class, chars < 256 only */
    429   OP_NCLASS,         /* 60 Same, but the bitmap was created from a negative
    430                            class - the difference is relevant only when a UTF-8
    431                            character > 255 is encountered. */
    432 
    433   OP_XCLASS,         /* 61 Extended class for handling UTF-8 chars within the
    434                            class. This does both positive and negative. */
    435 
    436   OP_REF,            /* 62 Match a back reference */
    437 
    438   OP_ALT,            /* 65 Start of alternation */
    439   OP_KET,            /* 66 End of group that doesn't have an unbounded repeat */
    440   OP_KETRMAX,        /* 67 These two must remain together and in this */
    441   OP_KETRMIN,        /* 68 order. They are for groups the repeat for ever. */
     372  OP_NOT_WORD_BOUNDARY,     /* \B */
     373  OP_WORD_BOUNDARY,         /* \b */
     374  OP_NOT_DIGIT,             /* \D */
     375  OP_DIGIT,                 /* \d */
     376  OP_NOT_WHITESPACE,        /* \S */
     377  OP_WHITESPACE,            /* \s */
     378  OP_NOT_WORDCHAR,          /* \W */
     379  OP_WORDCHAR,              /* \w */
     380
     381  OP_ANY,                   /* . -- Match any character */
     382
     383  OP_CIRC,                  /* ^ */
     384  OP_DOLL,                  /* $ */
     385  OP_CHAR,                  /* Match one character, casefully */
     386  OP_CHARNC,                /* Match one character, caselessly */
     387  OP_ASCII_CHAR,            /* Match one ASCII (0-127) character. */
     388  OP_ASCII_LETTER_NC,       /* Match one ASCII letter, caselessly. */
     389  OP_NOT,                   /* Match anything but the following char */
     390
     391  OP_STAR,                  /* The maximizing and minimizing versions of */
     392  OP_MINSTAR,               /* all these opcodes must come in pairs, with */
     393  OP_PLUS,                  /* the minimizing one second. */
     394  OP_MINPLUS,               /* This first set applies to single characters */
     395  OP_QUERY,
     396  OP_MINQUERY,
     397  OP_UPTO,                  /* From 0 to n matches */
     398  OP_MINUPTO,
     399  OP_EXACT,                 /* Exactly n matches */
     400
     401  OP_NOTSTAR,               /* This set applies to "not" single characters */
     402  OP_NOTMINSTAR,
     403  OP_NOTPLUS,
     404  OP_NOTMINPLUS,
     405  OP_NOTQUERY,
     406  OP_NOTMINQUERY,
     407  OP_NOTUPTO,
     408  OP_NOTMINUPTO,
     409  OP_NOTEXACT,
     410
     411  OP_TYPESTAR,              /* This set applies to character types such as \d */
     412  OP_TYPEMINSTAR,
     413  OP_TYPEPLUS,
     414  OP_TYPEMINPLUS,
     415  OP_TYPEQUERY,
     416  OP_TYPEMINQUERY,
     417  OP_TYPEUPTO,
     418  OP_TYPEMINUPTO,
     419  OP_TYPEEXACT,
     420
     421  OP_CRSTAR,                /* These are for character classes and back refs */
     422  OP_CRMINSTAR,
     423  OP_CRPLUS,
     424  OP_CRMINPLUS,
     425  OP_CRQUERY,
     426  OP_CRMINQUERY,
     427  OP_CRRANGE,               /* These are different to the three sets above. */
     428  OP_CRMINRANGE,
     429
     430  OP_CLASS,                 /* Match a character class, chars < 256 only */
     431  OP_NCLASS,                /* Same, but the bitmap was created from a negative
     432                               class - the difference is relevant when a UTF-8
     433                               character > 255 is encountered. */
     434
     435  OP_XCLASS,                /* Extended class for handling UTF-8 chars within the
     436                               class. This does both positive and negative. */
     437
     438  OP_REF,                   /* Match a back reference */
     439
     440  OP_ALT,                   /* Start of alternation */
     441  OP_KET,                   /* End of group that doesn't have an unbounded repeat */
     442  OP_KETRMAX,               /* These two must remain together and in this */
     443  OP_KETRMIN,               /* order. They are for groups that repeat forever. */
    442444
    443445  /* The assertions must come before ONCE and COND */
    444446
    445   OP_ASSERT,         /* 69 Positive lookahead */
    446   OP_ASSERT_NOT,     /* 70 Negative lookahead */
     447  OP_ASSERT,                /* Positive lookahead */
     448  OP_ASSERT_NOT,            /* Negative lookahead */
    447449
    448450  /* ONCE and COND must come after the assertions, with ONCE first, as there's
    449451  a test for >= ONCE for a subpattern that isn't an assertion. */
    450452
    451   OP_ONCE,           /* 74 Once matched, don't back up into the subpattern */
    452 
    453   OP_BRAZERO,        /* 77 These two must remain together and in this */
    454   OP_BRAMINZERO,     /* 78 order. */
    455 
    456   OP_BRANUMBER,      /* 79 Used for extracting brackets whose number is greater
    457                            than can fit into an opcode. */
    458 
    459   OP_BRA             /* 80 This and greater values are used for brackets that
    460                            extract substrings up to EXTRACT_BASIC_MAX. After
    461                            that, use is made of OP_BRANUMBER. */
     453  OP_ONCE,                  /* Once matched, don't back up into the subpattern */
     454
     455  OP_BRAZERO,               /* These two must remain together and in this */
     456  OP_BRAMINZERO,            /* order. */
     457
     458  OP_BRANUMBER,             /* Used for extracting brackets whose number is greater
     459                               than can fit into an opcode. */
     460
     461  OP_BRA                    /* This and greater values are used for brackets that
     462                               extract substrings up to EXTRACT_BASIC_MAX. After
     463                               that, use is made of OP_BRANUMBER. */
    462464};
    463465
     
    489491  1,                             /* Any                                    */ \
    490492  1, 1,                          /* ^, $                                   */ \
    491   2,                             /* Char  - the minimum length             */ \
    492   2,                             /* Charnc  - the minimum length           */ \
     493  2, 2,                          /* Char, Charnc - minimum lengths         */ \
     494  2, 2,                          /* ASCII char or non-cased                */ \
    493495  2,                             /* not                                    */ \
    494496  /* Positive single-char repeats                            ** These are  */ \
Note: See TracChangeset for help on using the changeset viewer.