Changeset 27686 in webkit for trunk/JavaScriptCore/pcre/pcre_compile.cpp
- Timestamp: Nov 11, 2007, 10:56:13 AM (18 years ago)
- File: 1 moved
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/pcre/pcre_compile.cpp
r27681 r27686 1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 1 /* This is JavaScriptCore's variant of the PCRE library. While this library 2 started out as a copy of PCRE, many of the features of PCRE have been 3 removed. This library now supports only the regular expression features 4 required by the JavaScript language specification, and has only the functions 5 needed by JavaScriptCore and the rest of WebKit. 6 7 Originally written by Philip Hazel 9 8 Copyright (c) 1997-2006 University of Cambridge 10 Copyright (c) 2004, 2005 Apple Computer, Inc.9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved. 11 10 12 11 ----------------------------------------------------------------------------- … … 39 38 */ 40 39 41 42 /* This module contains the external function pcre_compile(), along with 40 /* This module contains the external function jsRegExpExecute(), along with 43 41 supporting internal functions that are not used by other modules. */ 44 42 45 46 43 #include "pcre_internal.h" 47 44 48 49 /* WARNING: These macros evaluate their parameters more than once. */ 45 #include <wtf/ASCIICType.h> 46 #include <wtf/FastMalloc.h> 47 48 using namespace WTF; 49 50 /* WARNING: This macro evaluates its parameters more than once. */ 50 51 #define DIGITAB(x) ((x) < 128 ? digitab[(x)] : 0) 51 52 53 /* When DEBUG is defined, we need the pcre_printint() function, which is also54 used by pcretest. DEBUG is not defined when building a production library. */55 56 #ifdef DEBUG57 #include "pcre_printint.src"58 #endif59 60 61 52 62 53 /************************************************* … … 77 68 is invalid. 
*/ 78 69 79 static const short intescapes[] = {70 static const short escapes[] = { 80 71 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 81 72 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ … … 90 81 }; 91 82 83 /* Error code numbers. They are given names so that they can more easily be 84 tracked. */ 85 86 typedef enum { 87 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, 88 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17 89 } ErrorCode; 90 91 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that 92 the definition is next to the definition of the opcodes in pcre_internal.h. */ 93 94 static const uschar OP_lengths[] = { OP_LENGTHS }; 92 95 93 96 /* The texts of compile-time error messages. These are "char *" because they 94 97 are passed to the outside world. */ 95 98 96 static const char * const error_texts[] = { 97 "no error", 98 "\\ at end of pattern", 99 "\\c at end of pattern", 100 "unrecognized character follows \\", 101 "numbers out of order in {} quantifier", 102 /* 5 */ 103 "number too big in {} quantifier", 104 "missing terminating ] for character class", 105 "invalid escape sequence in character class", 106 "range out of order in character class", 107 "nothing to repeat", 108 /* 10 */ 109 "operand of unlimited repeat could match the empty string", 110 "internal error: unexpected repeat", 111 "unrecognized character after (?", 112 "POSIX named classes are supported only within a class", 113 "missing )", 114 /* 15 */ 115 "reference to non-existent subpattern", 116 "erroffset passed as NULL", 117 "unknown option bit(s) set", 118 "missing ) after comment", 119 "parentheses nested too deeply", 120 /* 20 */ 121 "regular expression too large", 122 "failed to get memory", 123 "unmatched parentheses", 124 "internal error: code overflow", 125 "unrecognized character after (?<", 126 /* 25 */ 127 "lookbehind assertion is not fixed length", 128 "malformed number after (?(", 129 "conditional group contains more than two branches", 130 
"assertion expected after (?(", 131 "(?R or (?digits must be followed by )", 132 /* 30 */ 133 "unknown POSIX class name", 134 "POSIX collating elements are not supported", 135 "this version of PCRE is not compiled with PCRE_UTF8 support", 136 "spare error", 137 "character value in \\x{...} sequence is too large", 138 /* 35 */ 139 "invalid condition (?(0)", 140 "\\C not allowed in lookbehind assertion", 141 "PCRE does not support \\L, \\l, \\N, \\U, or \\u", 142 "number after (?C is > 255", 143 "closing ) for (?C expected", 144 /* 40 */ 145 "recursive call could loop indefinitely", 146 "unrecognized character after (?P", 147 "syntax error after (?P", 148 "two named groups have the same name", 149 "invalid UTF-16 string", 150 /* 45 */ 151 "support for \\P, \\p, and \\X has not been compiled", 152 "malformed \\P or \\p sequence", 153 "unknown property name after \\P or \\p" 154 }; 155 156 157 /* Table to identify digits and hex digits. This is used when compiling 99 static const char* error_text(ErrorCode code) 100 { 101 static const char error_texts[] = 102 /* 1 */ 103 "\\ at end of pattern\0" 104 "\\c at end of pattern\0" 105 "character value in \\x{...} sequence is too large\0" 106 "numbers out of order in {} quantifier\0" 107 /* 5 */ 108 "number too big in {} quantifier\0" 109 "missing terminating ] for character class\0" 110 "internal error: code overflow\0" 111 "range out of order in character class\0" 112 "nothing to repeat\0" 113 /* 10 */ 114 "unmatched parentheses\0" 115 "internal error: unexpected repeat\0" 116 "unrecognized character after (?\0" 117 "failed to get memory\0" 118 "missing )\0" 119 /* 15 */ 120 "reference to non-existent subpattern\0" 121 "regular expression too large\0" 122 "parentheses nested too deeply" 123 ; 124 125 int i = code; 126 const char* text = error_texts; 127 while (i > 1) 128 i -= !*text++; 129 return text; 130 } 131 132 /* Table to hex digits. This is used when compiling 158 133 patterns. 
Note that the tables in chartables are dependent on the locale, and 159 134 may mark arbitrary characters as digits - but the PCRE compiling code expects … … 164 139 efficiently. 165 140 166 For convenience, we use the same bit definitions as in chartables: 167 168 0x04 decimal digit 141 For convenience, we use the same bit definition as in chartables: 142 169 143 0x08 hexadecimal digit 170 144 171 Then we can use ctype_ digit and ctype_xdigit in the code. */145 Then we can use ctype_xdigit in the code. */ 172 146 173 147 static const unsigned char digitab[] = … … 206 180 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ 207 181 208 209 182 /* Definition to allow mutual recursion */ 210 183 211 184 static BOOL 212 compile_regex(int, int *, uschar **, const pcre_uchar **, const pcre_uchar *, int*, int,185 compile_regex(int, int *, uschar **, const pcre_uchar **, const pcre_uchar *, ErrorCode*, int, 213 186 int *, int *, compile_data *); 214 215 216 187 217 188 /************************************************* … … 238 209 239 210 static int 240 check_escape(const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, int *errorcodeptr, int bracount,211 check_escape(const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int bracount, 241 212 BOOL isclass) 242 213 { … … 253 224 c = *ptr; 254 225 255 if (0) { } /* Matches with else below; to make merging easier. */256 257 226 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in 258 227 a table. A non-zero result is something that can be returned immediately. 259 228 Otherwise further processing may be required. 
*/ 260 229 261 elseif (c < '0' || c > 'z') {} /* Not alphameric */230 if (c < '0' || c > 'z') {} /* Not alphameric */ 262 231 else if ((i = escapes[c - '0']) != 0) c = i; 263 232 … … 291 260 oldptr = ptr; 292 261 c -= '0'; 293 while (ptr + 1 < patternEnd && (DIGITAB(ptr[1]) & ctype_digit) != 0)262 while (ptr + 1 < patternEnd && isASCIIDigit(ptr[1])) 294 263 c = c * 10 + *(++ptr) - '0'; 295 264 if (c < 10 || c <= bracount) … … 345 314 if (pt < patternEnd && *pt == '}') 346 315 { 347 if (c < 0 || count > 8) *errorcodeptr = ERR3 4;348 else if (c >= 0xD800 && c <= 0xDFFF) *errorcodeptr = ERR3 4; // half of surrogate pair349 else if (c >= 0xFDD0 && c <= 0xFDEF) *errorcodeptr = ERR3 4; // ?350 else if (c == 0xFFFE) *errorcodeptr = ERR3 4; // not a character351 else if (c == 0xFFFF) *errorcodeptr = ERR3 4; // not a character352 else if (c > 0x10FFFF) *errorcodeptr = ERR3 4; // out of Unicode character range316 if (c < 0 || count > 8) *errorcodeptr = ERR3; 317 else if (c >= 0xD800 && c <= 0xDFFF) *errorcodeptr = ERR3; // half of surrogate pair 318 else if (c >= 0xFDD0 && c <= 0xFDEF) *errorcodeptr = ERR3; // ? 
319 else if (c == 0xFFFE) *errorcodeptr = ERR3; // not a character 320 else if (c == 0xFFFF) *errorcodeptr = ERR3; // not a character 321 else if (c > 0x10FFFF) *errorcodeptr = ERR3; // out of Unicode character range 353 322 ptr = pt; 354 323 break; … … 441 410 is_counted_repeat(const pcre_uchar *p, const pcre_uchar *patternEnd) 442 411 { 443 if (p >= patternEnd || (DIGITAB(*p) & ctype_digit) == 0)412 if (p >= patternEnd || !isASCIIDigit(*p)) 444 413 return FALSE; 445 414 p++; 446 while (p < patternEnd && (DIGITAB(*p) & ctype_digit) != 0)415 while (p < patternEnd && isASCIIDigit(*p)) 447 416 p++; 448 417 if (p < patternEnd && *p == '}') … … 454 423 return TRUE; 455 424 456 if (p >= patternEnd || (DIGITAB(*p) & ctype_digit) == 0)425 if (p >= patternEnd || !isASCIIDigit(*p)) 457 426 return FALSE; 458 427 p++; 459 while (p < patternEnd && (DIGITAB(*p) & ctype_digit) != 0)428 while (p < patternEnd && isASCIIDigit(*p)) 460 429 p++; 461 430 … … 485 454 486 455 static const pcre_uchar * 487 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)456 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, ErrorCode* errorcodeptr) 488 457 { 489 458 int min = 0; … … 493 462 an integer overflow. 
*/ 494 463 495 while ( (DIGITAB(*p) & ctype_digit) != 0) min = min * 10 + *p++ - '0';464 while (isASCIIDigit(*p)) min = min * 10 + *p++ - '0'; 496 465 if (min < 0 || min > 65535) 497 466 { … … 508 477 { 509 478 max = 0; 510 while ((DIGITAB(*p) & ctype_digit) != 0) max = max * 10 + *p++ - '0';479 while (isASCIIDigit(*p)) max = max * 10 + *p++ - '0'; 511 480 if (max < 0 || max > 65535) 512 481 { … … 559 528 if (!skipassert) return code; 560 529 do code += GET(code, 1); while (*code == OP_ALT); 561 code += _pcre_OP_lengths[*code];530 code += OP_lengths[*code]; 562 531 break; 563 532 … … 568 537 569 538 case OP_BRANUMBER: 570 code += _pcre_OP_lengths[*code];539 code += OP_lengths[*code]; 571 540 break; 572 541 … … 655 624 case OP_NOT_WORD_BOUNDARY: 656 625 case OP_WORD_BOUNDARY: 657 cc += _pcre_OP_lengths[*cc];626 cc += OP_lengths[*cc]; 658 627 break; 659 628 … … 766 735 for (code = first_significant_code(code + 1 + LINK_SIZE, TRUE); 767 736 code < endcode; 768 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))737 code = first_significant_code(code + OP_lengths[c], TRUE)) 769 738 { 770 739 const uschar *ccode; … … 956 925 957 926 Arguments: 958 options ptr pointer tothe option bits927 options the option bits 959 928 brackets points to number of extracting brackets used 960 929 codeptr points to the pointer to the current code point … … 970 939 971 940 static BOOL 972 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,973 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, int *errorcodeptr, int *firstbyteptr,941 compile_branch(int options, int *brackets, uschar **codeptr, 942 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr, 974 943 int *reqbyteptr, compile_data *cd) 975 944 { … … 980 949 int zeroreqbyte, zerofirstbyte; 981 950 int req_caseopt, reqvary, tempreqvary; 982 int options = *optionsptr;983 951 int after_manual_callout = 0; 984 952 register int c; … … 1025 993 int 
class_charcount; 1026 994 int class_lastchar; 1027 int newoptions;1028 995 int skipbytes; 1029 996 int subreqbyte; … … 1955 1922 1956 1923 case '(': 1957 newoptions = options;1958 1924 skipbytes = 0; 1959 1925 … … 2012 1978 2013 1979 if (!compile_regex( 2014 newoptions, /* The complete new option state */1980 options, 2015 1981 brackets, /* Extracting bracket count */ 2016 1982 &tempcode, /* Where to put code (updated) */ … … 2254 2220 static BOOL 2255 2221 compile_regex(int options, int *brackets, uschar **codeptr, 2256 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, int *errorcodeptr, int skipbytes,2222 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int skipbytes, 2257 2223 int *firstbyteptr, int *reqbyteptr, compile_data *cd) 2258 2224 { … … 2277 2243 /* Now compile the branch */ 2278 2244 2279 if (!compile_branch( &options, brackets, &code, &ptr, patternEnd, errorcodeptr,2245 if (!compile_branch(options, brackets, &code, &ptr, patternEnd, errorcodeptr, 2280 2246 &branchfirstbyte, &branchreqbyte, cd)) 2281 2247 { … … 2415 2381 2416 2382 static BOOL 2417 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,2383 is_anchored(register const uschar *code, int options, unsigned int bracket_map, 2418 2384 unsigned int backref_map) 2419 2385 { … … 2443 2409 /* Check for explicit anchoring */ 2444 2410 2445 else if ((( *options & PCRE_MULTILINE) != 0 || op != OP_CIRC))2411 else if (((options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) 2446 2412 return FALSE; 2447 2413 code += GET(code, 1); … … 2541 2507 2542 2508 static int 2543 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)2509 find_firstassertedchar(const uschar *code, int options, BOOL inassert) 2544 2510 { 2545 2511 register int c = -1; … … 2578 2544 { 2579 2545 c = scode[1]; 2580 if (( *options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;2546 if ((options & PCRE_CASELESS) != 0) c |= REQ_CASELESS; 2581 2547 } 2582 2548 else if 
(c != scode[1]) return -1; … … 2615 2581 2616 2582 pcre * 2617 jsRegExpCompile(const pcre_char* pattern, int patternLength, int options, unsigned* numSubpatterns, const char** errorptr) 2583 jsRegExpCompile(const pcre_char* pattern, int patternLength, 2584 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline, 2585 unsigned* numSubpatterns, const char** errorptr) 2618 2586 { 2619 2587 real_pcre *re; … … 2627 2595 int max_name_size = 0; 2628 2596 int lastitemlength = 0; 2629 int errorcode =0;2597 ErrorCode errorcode = ERR0; 2630 2598 BOOL class_utf8; 2631 2599 BOOL capturing; … … 2909 2877 } 2910 2878 2911 if ((d > 255 || ( (options & PCRE_CASELESS) != 0&& d > 127)))2879 if ((d > 255 || (ignoreCase && d > 127))) 2912 2880 { 2913 2881 uschar buffer[6]; … … 2924 2892 another byte in the UTF-8 representation. */ 2925 2893 2926 if ( (options & PCRE_CASELESS) != 0)2894 if (ignoreCase) 2927 2895 { 2928 2896 int occ, ocd; … … 2965 2933 else 2966 2934 { 2967 if ((c > 255 || ( (options & PCRE_CASELESS) != 0&& c > 127)))2935 if ((c > 255 || (ignoreCase && c > 127))) 2968 2936 { 2969 2937 uschar buffer[6]; … … 2974 2942 length += LINK_SIZE + 2; 2975 2943 } 2976 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) * 2977 (1 + _pcre_ord2utf8(c, buffer)); 2944 length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer)); 2978 2945 } 2979 2946 } … … 3071 3038 if (brastackptr >= sizeof(brastack)/sizeof(int)) 3072 3039 { 3073 errorcode = ERR1 9;3040 errorcode = ERR17; 3074 3041 goto PCRE_ERROR_RETURN; 3075 3042 } … … 3181 3148 if (length > MAX_PATTERN_SIZE) 3182 3149 { 3183 errorcode = ERR 20;3184 goto PCRE_E ARLY_ERROR_RETURN;3150 errorcode = ERR16; 3151 goto PCRE_ERROR_RETURN; 3185 3152 } 3186 3153 3187 /* Compute the size of data block needed and get it, either from malloc or 3188 externally provided function. */ 3154 /* Compute the size of data block needed and get it. 
*/ 3189 3155 3190 3156 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); 3191 re = (real_pcre *)(pcre_malloc)(size);3157 re = reinterpret_cast<real_pcre*>(new char[size]); 3192 3158 3193 3159 if (re == NULL) 3194 3160 { 3195 errorcode = ERR 21;3196 goto PCRE_E ARLY_ERROR_RETURN;3161 errorcode = ERR13; 3162 goto PCRE_ERROR_RETURN; 3197 3163 } 3198 3164 … … 3203 3169 3204 3170 re->size = (pcre_uint32)size; 3205 re->options = options;3171 re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0); 3206 3172 3207 3173 /* The starting points of the name/number translation table and of the code are … … 3221 3187 *code = OP_BRA; 3222 3188 bracount = 0; 3223 (void)compile_regex( options, &bracount, &code, &ptr,3189 (void)compile_regex(re->options, &bracount, &code, &ptr, 3224 3190 patternEnd, 3225 3191 &errorcode, 0, &firstbyte, &reqbyte, &compile_block); … … 3229 3195 /* If not reached end of pattern on success, there's an excess bracket. */ 3230 3196 3231 if (errorcode == 0 && ptr < patternEnd) errorcode = ERR 22;3197 if (errorcode == 0 && ptr < patternEnd) errorcode = ERR10; 3232 3198 3233 3199 /* Fill in the terminating state and check for disastrous overflow, but … … 3237 3203 3238 3204 #ifndef DEBUG 3239 if (code - codestart > length) errorcode = ERR 23;3205 if (code - codestart > length) errorcode = ERR7; 3240 3206 #endif 3241 3207 … … 3247 3213 /* Failed to compile, or error while post-processing */ 3248 3214 3249 if (errorcode != 0)3215 if (errorcode != ERR0) 3250 3216 { 3251 (pcre_free)(re);3217 delete [] reinterpret_cast<char*>(re); 3252 3218 PCRE_ERROR_RETURN: 3253 PCRE_EARLY_ERROR_RETURN: 3254 *errorptr = error_texts[errorcode]; 3219 *errorptr = error_text(errorcode); 3255 3220 return NULL; 3256 3221 } … … 3267 3232 3268 3233 { 3269 int temp_options = options; 3270 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map)) 3234 if (is_anchored(codestart, re->options, 0, compile_block.backref_map)) 3271 
3235 re->options |= PCRE_ANCHORED; 3272 3236 else 3273 3237 { 3274 3238 if (firstbyte < 0) 3275 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);3239 firstbyte = find_firstassertedchar(codestart, re->options, FALSE); 3276 3240 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ 3277 3241 { … … 3347 3311 { 3348 3312 (pcre_free)(re); 3349 *errorptr = error_text s[ERR23];3313 *errorptr = error_text(ERR7); 3350 3314 return NULL; 3351 3315 } 3316 3352 3317 #endif 3353 3318 … … 3359 3324 void jsRegExpFree(JSRegExp* re) 3360 3325 { 3361 pcre_free(re);3326 delete [] reinterpret_cast<char*>(re); 3362 3327 } 3363 3364 /* End of pcre_compile.c */
Note: See TracChangeset for help on using the changeset viewer.