Changeset 28793 in webkit for trunk/JavaScriptCore/pcre/pcre_compile.cpp
- Timestamp:
- Dec 16, 2007, 8:19:25 PM (17 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/pcre/pcre_compile.cpp
r28785 r28793 51 51 using namespace WTF; 52 52 53 /* Negative values for the firstchar and reqchar variables */ 54 55 #define REQ_UNSET (-2) 56 #define REQ_NONE (-1) 57 53 58 /************************************************* 54 59 * Code parameters and static tables * … … 89 94 }; 90 95 91 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that92 the definition is next to the definition of the opcodes in pcre_internal.h. */93 94 static const uschar OP_lengths[] = { OP_LENGTHS };95 96 96 /* The texts of compile-time error messages. These are "char *" because they 97 97 are passed to the outside world. */ 98 98 99 static const char* error_text(ErrorCode code)99 static const char* errorText(ErrorCode code) 100 100 { 101 static const char error_texts[] =101 static const char errorTexts[] = 102 102 /* 1 */ 103 103 "\\ at end of pattern\0" … … 124 124 125 125 int i = code; 126 const char* text = error_texts;126 const char* text = errorTexts; 127 127 while (i > 1) 128 128 i -= !*text++; … … 142 142 needOuterBracket = false; 143 143 } 144 const uschar* start_code; /* The start of the compiled code */144 const unsigned char* start_code; /* The start of the compiled code */ 145 145 const UChar* start_pattern; /* The start of the pattern */ 146 146 int top_backref; /* Maximum back reference */ … … 152 152 /* Definitions to allow mutual recursion */ 153 153 154 static bool compileBracket(int, int*, uschar**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);155 static bool bracketIsAnchored(const uschar* code);156 static bool bracketNeedsLineStart(const uschar* code, unsigned captureMap, unsigned backrefMap);157 static int bracketFindFirstAssertedCharacter(const uschar* code, bool inassert);154 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&); 155 static bool bracketIsAnchored(const unsigned char* code); 156 static bool 
bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap); 157 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert); 158 158 159 159 /************************************************* … … 179 179 */ 180 180 181 static int check _escape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)181 static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass) 182 182 { 183 183 const UChar* ptr = *ptrptr + 1; … … 209 209 } else { 210 210 switch (c) { 211 case '1': 212 case '2': 213 case '3': 214 case '4': 215 case '5': 216 case '6': 217 case '7': 218 case '8': 219 case '9': 220 /* Escape sequences starting with a non-zero digit are backreferences, 221 unless there are insufficient brackets, in which case they are octal 222 escape sequences. Those sequences end on the first non-octal character 223 or when we overflow 0-255, whichever comes first. */ 224 225 if (!isclass) { 226 const UChar* oldptr = ptr; 227 c -= '0'; 228 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount) 229 c = c * 10 + *(++ptr) - '0'; 230 if (c <= bracount) { 231 c = -(ESC_REF + c); 211 case '1': 212 case '2': 213 case '3': 214 case '4': 215 case '5': 216 case '6': 217 case '7': 218 case '8': 219 case '9': 220 /* Escape sequences starting with a non-zero digit are backreferences, 221 unless there are insufficient brackets, in which case they are octal 222 escape sequences. Those sequences end on the first non-octal character 223 or when we overflow 0-255, whichever comes first. 
*/ 224 225 if (!isclass) { 226 const UChar* oldptr = ptr; 227 c -= '0'; 228 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount) 229 c = c * 10 + *(++ptr) - '0'; 230 if (c <= bracount) { 231 c = -(ESC_REF + c); 232 break; 233 } 234 ptr = oldptr; /* Put the pointer back and fall through */ 235 } 236 237 /* Handle an octal number following \. If the first digit is 8 or 9, 238 this is not octal. */ 239 240 if ((c = *ptr) >= '8') 232 241 break; 233 } 234 ptr = oldptr; /* Put the pointer back and fall through */ 235 } 236 237 /* Handle an octal number following \. If the first digit is 8 or 9, 238 this is not octal. */ 239 240 if ((c = *ptr) >= '8') 241 break; 242 242 243 243 /* \0 always starts an octal number, but we may drop through to here with a 244 244 larger first octal digit. */ 245 246 case '0': { 247 c -= '0'; 248 int i; 249 for (i = 1; i <= 2; ++i) { 250 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7') 251 break; 252 int cc = c * 8 + ptr[i] - '0'; 253 if (cc > 255) 254 break; 255 c = cc; 245 246 case '0': { 247 c -= '0'; 248 int i; 249 for (i = 1; i <= 2; ++i) { 250 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7') 251 break; 252 int cc = c * 8 + ptr[i] - '0'; 253 if (cc > 255) 254 break; 255 c = cc; 256 } 257 ptr += i - 1; 258 break; 256 259 } 257 ptr += i - 1; 258 break; 259 } 260 case 'x': { 261 c = 0; 262 int i; 263 for (i = 1; i <= 2; ++i) { 264 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) { 265 c = 'x'; 266 i = 1; 267 break; 268 } 269 int cc = ptr[i]; 270 if (cc >= 'a') 271 cc -= 32; /* Convert to upper case */ 272 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); 260 261 case 'x': { 262 c = 0; 263 int i; 264 for (i = 1; i <= 2; ++i) { 265 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) { 266 c = 'x'; 267 i = 1; 268 break; 269 } 270 int cc = ptr[i]; 271 if (cc >= 'a') 272 cc -= 32; /* Convert to upper case */ 273 c = c * 16 + cc - ((cc < 'A') ? 
'0' : ('A' - 10)); 274 } 275 ptr += i - 1; 276 break; 273 277 } 274 ptr += i - 1; 275 break; 276 } 277 case 'u': { 278 c = 0; 279 int i; 280 for (i = 1; i <= 4; ++i) { 281 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) { 282 c = 'u'; 283 i = 1; 284 break; 285 } 286 int cc = ptr[i]; 287 if (cc >= 'a') 288 cc -= 32; /* Convert to upper case */ 289 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); 278 279 case 'u': { 280 c = 0; 281 int i; 282 for (i = 1; i <= 4; ++i) { 283 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) { 284 c = 'u'; 285 i = 1; 286 break; 287 } 288 int cc = ptr[i]; 289 if (cc >= 'a') 290 cc -= 32; /* Convert to upper case */ 291 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); 292 } 293 ptr += i - 1; 294 break; 290 295 } 291 ptr += i - 1; 292 break; 293 294 /* Other special escapes not starting with a digit are straightforward */ 295 } 296 case 'c': 297 if (++ptr == patternEnd) { 298 *errorcodeptr = ERR2; 299 return 0; 296 297 case 'c': 298 if (++ptr == patternEnd) { 299 *errorcodeptr = ERR2; 300 return 0; 301 } 302 c = *ptr; 303 304 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding 305 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */ 306 c = toASCIIUpper(c) ^ 0x40; 307 break; 300 308 } 301 c = *ptr;302 303 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding304 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. 
*/305 306 if (c >= 'a' && c <= 'z')307 c -= 32;308 c ^= 0x40;309 break;310 }311 309 } 312 310 … … 314 312 return c; 315 313 } 316 317 318 314 319 315 /************************************************* … … 332 328 */ 333 329 334 static bool is_counted_repeat(const UChar* p, const UChar* patternEnd)330 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd) 335 331 { 336 332 if (p >= patternEnd || !isASCIIDigit(*p)) … … 356 352 } 357 353 358 359 354 /************************************************* 360 355 * Read repeat counts * … … 362 357 363 358 /* Read an item of the form {n,m} and return the values. This is called only 364 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,359 after isCountedRepeat() has confirmed that a repeat-count quantifier exists, 365 360 so the syntax is guaranteed to be correct, but we need to check the values. 366 361 … … 376 371 */ 377 372 378 static const UChar* read_repeat_counts(const UChar* p, int* minp, int* maxp, ErrorCode* errorcodeptr)373 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorcodeptr) 379 374 { 380 375 int min = 0; … … 420 415 } 421 416 422 423 417 /************************************************* 424 418 * Find first significant op code * … … 427 421 /* This is called by several functions that scan a compiled expression looking 428 422 for a fixed first character, or an anchoring op code etc. It skips over things 429 that do not influence this. For some calls, a change of option is important. 430 For some calls, it makes sense to skip negative forward and all backward 431 assertions, and also the \b assertion; for others it does not. 423 that do not influence this. 
432 424 433 425 Arguments: 434 426 code pointer to the start of the group 435 skipassert true if certain assertions are to be skipped436 437 427 Returns: pointer to the first significant opcode 438 428 */ 439 429 440 static const uschar* firstSignificantOpCode(const uschar* code)430 static const unsigned char* firstSignificantOpcode(const unsigned char* code) 441 431 { 442 432 while (*code == OP_BRANUMBER) 443 code += OP_lengths[*code];433 code += 3; 444 434 return code; 445 435 } 446 436 447 static const uschar* firstSignificantOpCodeSkippingAssertions(const uschar* code)437 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code) 448 438 { 449 439 while (true) { 450 440 switch (*code) { 451 case OP_ASSERT_NOT:452 do {453 code += getOpcodeValueAtOffset(code, 1);454 } while (*code == OP_ALT);455 code += OP_lengths[*code];456 break;457 case OP_WORD_BOUNDARY:458 case OP_NOT_WORD_BOUNDARY:459 case OP_BRANUMBER:460 code += OP_lengths[*code];461 break;462 default:463 return code;441 case OP_ASSERT_NOT: 442 advanceToEndOfBracket(code); 443 code += 1 + LINK_SIZE; 444 break; 445 case OP_WORD_BOUNDARY: 446 case OP_NOT_WORD_BOUNDARY: 447 ++code; 448 break; 449 case OP_BRANUMBER: 450 code += 3; 451 break; 452 default: 453 return code; 464 454 } 465 455 } 466 ASSERT_NOT_REACHED();467 456 } 468 469 470 /*************************************************471 * Find the fixed length of a pattern *472 *************************************************/473 474 /* Scan a pattern and compute the fixed length of subject that will match it,475 if the length is fixed. 
This is needed for dealing with backward assertions.476 In UTF8 mode, the result is in characters rather than bytes.477 478 Arguments:479 code points to the start of the pattern (the bracket)480 options the compiling options481 482 Returns: the fixed length, or -1 if there is no fixed length,483 or -2 if \C was encountered484 */485 486 static int find_fixedlength(uschar* code, int options)487 {488 int length = -1;489 490 int branchlength = 0;491 uschar* cc = code + 1 + LINK_SIZE;492 493 /* Scan along the opcodes for this branch. If we get to the end of the494 branch, check the length against that of the other branches. */495 496 while (true) {497 int d;498 int op = *cc;499 if (op >= OP_BRA)500 op = OP_BRA;501 502 switch (op) {503 case OP_BRA:504 case OP_ONCE:505 d = find_fixedlength(cc, options);506 if (d < 0)507 return d;508 branchlength += d;509 do {510 cc += getOpcodeValueAtOffset(cc, 1);511 } while (*cc == OP_ALT);512 cc += 1 + LINK_SIZE;513 break;514 515 /* Reached end of a branch; if it's a ket it is the end of a nested516 call. If it's ALT it is an alternation in a nested call. If it is517 END it's the end of the outer call. All can be handled by the same code. 
*/518 519 case OP_ALT:520 case OP_KET:521 case OP_KETRMAX:522 case OP_KETRMIN:523 case OP_END:524 if (length < 0)525 length = branchlength;526 else if (length != branchlength)527 return -1;528 if (*cc != OP_ALT)529 return length;530 cc += 1 + LINK_SIZE;531 branchlength = 0;532 break;533 534 /* Skip over assertive subpatterns */535 536 case OP_ASSERT:537 case OP_ASSERT_NOT:538 do {539 cc += getOpcodeValueAtOffset(cc, 1);540 } while (*cc == OP_ALT);541 /* Fall through */542 543 /* Skip over things that don't match chars */544 545 case OP_BRANUMBER:546 case OP_CIRC:547 case OP_DOLL:548 case OP_NOT_WORD_BOUNDARY:549 case OP_WORD_BOUNDARY:550 cc += OP_lengths[*cc];551 break;552 553 /* Handle literal characters */554 555 case OP_CHAR:556 case OP_CHAR_IGNORING_CASE:557 case OP_NOT:558 branchlength++;559 cc += 2;560 while ((*cc & 0xc0) == 0x80)561 cc++;562 break;563 564 case OP_ASCII_CHAR:565 case OP_ASCII_LETTER_IGNORING_CASE:566 branchlength++;567 cc += 2;568 break;569 570 /* Handle exact repetitions. The count is already in characters, but we571 need to skip over a multibyte character in UTF8 mode. 
*/572 573 case OP_EXACT:574 branchlength += get2ByteOpcodeValueAtOffset(cc,1);575 cc += 4;576 while((*cc & 0x80) == 0x80)577 cc++;578 break;579 580 case OP_TYPEEXACT:581 branchlength += get2ByteOpcodeValueAtOffset(cc,1);582 cc += 4;583 break;584 585 /* Handle single-char matchers */586 587 case OP_NOT_DIGIT:588 case OP_DIGIT:589 case OP_NOT_WHITESPACE:590 case OP_WHITESPACE:591 case OP_NOT_WORDCHAR:592 case OP_WORDCHAR:593 case OP_NOT_NEWLINE:594 branchlength++;595 cc++;596 break;597 598 /* Check a class for variable quantification */599 600 case OP_XCLASS:601 cc += getOpcodeValueAtOffset(cc, 1) - 33;602 /* Fall through */603 604 case OP_CLASS:605 case OP_NCLASS:606 cc += 33;607 608 switch (*cc) {609 case OP_CRSTAR:610 case OP_CRMINSTAR:611 case OP_CRQUERY:612 case OP_CRMINQUERY:613 return -1;614 615 case OP_CRRANGE:616 case OP_CRMINRANGE:617 if (get2ByteOpcodeValueAtOffset(cc, 1) != get2ByteOpcodeValueAtOffset(cc, 3))618 return -1;619 branchlength += get2ByteOpcodeValueAtOffset(cc, 1);620 cc += 5;621 break;622 623 default:624 branchlength++;625 }626 break;627 628 /* Anything else is variable length */629 630 default:631 return -1;632 }633 }634 ASSERT_NOT_REACHED();635 }636 637 638 /*************************************************639 * Complete a callout item *640 *************************************************/641 642 /* A callout item contains the length of the next item in the pattern, which643 we can't fill in till after we have reached the relevant point. 
This is used644 for both automatic and manual callouts.645 646 Arguments:647 previous_callout points to previous callout item648 ptr current pattern pointer649 cd pointers to tables etc650 */651 652 static void complete_callout(uschar* previous_callout, const UChar* ptr, const CompileData& cd)653 {654 int length = ptr - cd.start_pattern - getOpcodeValueAtOffset(previous_callout, 2);655 putOpcodeValueAtOffset(previous_callout, 2 + LINK_SIZE, length);656 }657 658 659 457 660 458 /************************************************* … … 676 474 */ 677 475 678 static bool get _othercase_range(int* cptr, int d, int* ocptr, int* odptr)476 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr) 679 477 { 680 478 int c, othercase = 0; 681 479 682 480 for (c = *cptr; c <= d; c++) { 683 if ((othercase = _pcre_ucp_othercase(c)) >= 0)481 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) 684 482 break; 685 483 } … … 692 490 693 491 for (++c; c <= d; c++) { 694 if ( _pcre_ucp_othercase(c) != next)492 if (kjs_pcre_ucp_othercase(c) != next) 695 493 break; 696 494 next++; … … 717 515 */ 718 516 719 // FIXME: This should be removed as soon as all UTF8 uses are removed from PCRE 720 int _pcre_ord2utf8(int cvalue, uschar *buffer) 517 static int encodeUTF8(int cvalue, unsigned char *buffer) 721 518 { 722 519 int i; 723 for (i = 0; i < _pcre_utf8_table1_size; i++)724 if (cvalue <= _pcre_utf8_table1[i])520 for (i = 0; i < kjs_pcre_utf8_table1_size; i++) 521 if (cvalue <= kjs_pcre_utf8_table1[i]) 725 522 break; 726 523 buffer += i; … … 729 526 cvalue >>= 6; 730 527 } 731 *buffer = _pcre_utf8_table2[i] | cvalue;528 *buffer = kjs_pcre_utf8_table2[i] | cvalue; 732 529 return i + 1; 733 530 } … … 759 556 760 557 static bool 761 compileBranch(int options, int* brackets, u schar** codeptr,558 compileBranch(int options, int* brackets, unsigned char** codeptr, 762 559 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr, 763 560 int* reqbyteptr, 
CompileData& cd) … … 767 564 int bravalue = 0; 768 565 int reqvary, tempreqvary; 769 int after_manual_callout = 0;770 566 int c; 771 u schar* code = *codeptr;772 u schar* tempcode;567 unsigned char* code = *codeptr; 568 unsigned char* tempcode; 773 569 bool groupsetfirstbyte = false; 774 570 const UChar* ptr = *ptrptr; 775 571 const UChar* tempptr; 776 uschar* previous = NULL; 777 uschar* previous_callout = NULL; 778 uschar classbits[32]; 572 unsigned char* previous = NULL; 573 unsigned char classbits[32]; 779 574 780 575 bool class_utf8; 781 u schar* class_utf8data;782 u schar utf8_char[6];576 unsigned char* class_utf8data; 577 unsigned char utf8_char[6]; 783 578 784 579 /* Initialize no first byte, no required byte. REQ_UNSET means "no char … … 815 610 int subfirstbyte; 816 611 int mclength; 817 u schar mcbuffer[8];612 unsigned char mcbuffer[8]; 818 613 819 614 /* Next byte in the pattern */ … … 824 619 a quantifier. */ 825 620 826 bool is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && is_counted_repeat(ptr + 1, patternEnd)); 827 828 if (!is_quantifier && previous_callout && after_manual_callout-- <= 0) { 829 complete_callout(previous_callout, ptr, cd); 830 previous_callout = NULL; 831 } 621 bool is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd)); 832 622 833 623 switch (c) { … … 922 712 bit map. */ 923 713 924 memset(classbits, 0, 32 * sizeof(u schar));714 memset(classbits, 0, 32 * sizeof(unsigned char)); 925 715 926 716 /* Process characters until ] is reached. 
The first pass … … 939 729 940 730 if (c == '\\') { 941 c = check _escape(&ptr, patternEnd, errorcodeptr, *brackets, true);731 c = checkEscape(&ptr, patternEnd, errorcodeptr, *brackets, true); 942 732 if (c < 0) { 943 733 class_charcount += 2; /* Greater than 1 is what matters */ … … 1006 796 if (d == '\\') { 1007 797 const UChar* oldptr = ptr; 1008 d = check _escape(&ptr, patternEnd, errorcodeptr, *brackets, true);798 d = checkEscape(&ptr, patternEnd, errorcodeptr, *brackets, true); 1009 799 1010 800 /* \X is literal X; any other special means the '-' was literal */ … … 1037 827 int cc = c; 1038 828 int origd = d; 1039 while (get _othercase_range(&cc, origd, &occ, &ocd)) {829 while (getOthercaseRange(&cc, origd, &occ, &ocd)) { 1040 830 if (occ >= c && ocd <= d) 1041 831 continue; /* Skip embedded ranges */ … … 1056 846 else { 1057 847 *class_utf8data++ = XCL_RANGE; 1058 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);848 class_utf8data += encodeUTF8(occ, class_utf8data); 1059 849 } 1060 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);850 class_utf8data += encodeUTF8(ocd, class_utf8data); 1061 851 } 1062 852 } … … 1066 856 1067 857 *class_utf8data++ = XCL_RANGE; 1068 class_utf8data += _pcre_ord2utf8(c, class_utf8data);1069 class_utf8data += _pcre_ord2utf8(d, class_utf8data);858 class_utf8data += encodeUTF8(c, class_utf8data); 859 class_utf8data += encodeUTF8(d, class_utf8data); 1070 860 1071 861 /* With UCP support, we are done. 
Without UCP support, there is no … … 1104 894 class_utf8 = true; 1105 895 *class_utf8data++ = XCL_SINGLE; 1106 class_utf8data += _pcre_ord2utf8(c, class_utf8data);896 class_utf8data += encodeUTF8(c, class_utf8data); 1107 897 1108 898 if (options & IgnoreCaseOption) { 1109 899 int othercase; 1110 if ((othercase = _pcre_ucp_othercase(c)) >= 0) {900 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) { 1111 901 *class_utf8data++ = XCL_SINGLE; 1112 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);902 class_utf8data += encodeUTF8(othercase, class_utf8data); 1113 903 } 1114 904 } … … 1198 988 /* Now fill in the complete length of the item */ 1199 989 1200 put OpcodeValueAtOffset(previous,1, code - previous);990 putLinkValue(previous + 1, code - previous); 1201 991 break; /* End of class handling */ 1202 992 } … … 1223 1013 if (!is_quantifier) 1224 1014 goto NORMAL_CHAR; 1225 ptr = read _repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);1015 ptr = readRepeatCounts(ptr + 1, &repeat_min, &repeat_max, errorcodeptr); 1226 1016 if (*errorcodeptr) 1227 1017 goto FAILED; … … 1261 1051 /* Save start of previous item, in case we have to move it up to make space 1262 1052 for an inserted OP_ONCE for the additional '+' extension. */ 1053 /* FIXME: Probably don't need this because we don't use OP_ONCE. 
*/ 1263 1054 1264 1055 tempcode = previous; … … 1289 1080 1290 1081 if (code[-1] & 0x80) { 1291 u schar *lastchar = code - 1;1082 unsigned char *lastchar = code - 1; 1292 1083 while((*lastchar & 0xc0) == 0x80) 1293 1084 lastchar--; … … 1335 1126 int prop_value = -1; 1336 1127 1337 u schar* oldcode = code;1128 unsigned char* oldcode = code; 1338 1129 code = previous; /* Usually overwrite previous item */ 1339 1130 … … 1358 1149 else { 1359 1150 *code++ = OP_UPTO + repeat_type; 1360 put2Byte OpcodeValueAtOffsetAndAdvance(code, 0, repeat_max);1151 put2ByteValueAndAdvance(code, repeat_max); 1361 1152 } 1362 1153 } … … 1375 1166 goto END_REPEAT; 1376 1167 *code++ = OP_UPTO + repeat_type; 1377 put2Byte OpcodeValueAtOffsetAndAdvance(code, 0, repeat_max - 1);1168 put2ByteValueAndAdvance(code, repeat_max - 1); 1378 1169 } 1379 1170 } … … 1384 1175 else { 1385 1176 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ 1386 put2Byte OpcodeValueAtOffsetAndAdvance(code, 0, repeat_min);1177 put2ByteValueAndAdvance(code, repeat_min); 1387 1178 1388 1179 /* If the maximum is unlimited, insert an OP_STAR. Before doing so, … … 1421 1212 repeat_max -= repeat_min; 1422 1213 *code++ = OP_UPTO + repeat_type; 1423 put2Byte OpcodeValueAtOffsetAndAdvance(code, 0, repeat_max);1214 put2ByteValueAndAdvance(code, repeat_max); 1424 1215 } 1425 1216 } … … 1463 1254 else { 1464 1255 *code++ = OP_CRRANGE + repeat_type; 1465 put2Byte OpcodeValueAtOffsetAndAdvance(code, 0, repeat_min);1256 put2ByteValueAndAdvance(code, repeat_min); 1466 1257 if (repeat_max == -1) 1467 1258 repeat_max = 0; /* 2-byte encoding for max */ 1468 put2Byte OpcodeValueAtOffsetAndAdvance(code, 0, repeat_max);1259 put2ByteValueAndAdvance(code, repeat_max); 1469 1260 } 1470 1261 } … … 1473 1264 cases. 
*/ 1474 1265 1475 else if (*previous >= OP_BRA || *previous == OP_ONCE) {1266 else if (*previous >= OP_BRA) { 1476 1267 int ketoffset = 0; 1477 1268 int len = code - previous; 1478 u schar* bralink = NULL;1269 unsigned char* bralink = NULL; 1479 1270 1480 1271 /* If the maximum repeat count is unlimited, find the end of the bracket … … 1485 1276 1486 1277 if (repeat_max == -1) { 1487 uschar* ket = previous; 1488 do { 1489 ket += getOpcodeValueAtOffset(ket, 1); 1490 } while (*ket != OP_KET); 1278 const unsigned char* ket = previous; 1279 advanceToEndOfBracket(ket); 1491 1280 ketoffset = code - ket; 1492 1281 } … … 1540 1329 int offset = (!bralink) ? 0 : previous - bralink; 1541 1330 bralink = previous; 1542 put OpcodeValueAtOffsetAndAdvance(previous, 0, offset);1331 putLinkValueAllowZeroAndAdvance(previous, offset); 1543 1332 } 1544 1333 … … 1581 1370 int offset = (!bralink) ? 0 : code - bralink; 1582 1371 bralink = code; 1583 put OpcodeValueAtOffsetAndAdvance(code, 0, offset);1372 putLinkValueAllowZeroAndAdvance(code, offset); 1584 1373 } 1585 1374 … … 1593 1382 while (bralink) { 1594 1383 int offset = code - bralink + 1; 1595 u schar* bra = code - offset;1596 int oldlinkoffset = get OpcodeValueAtOffset(bra,1);1597 bralink = oldlinkoffset ? bralink - oldlinkoffset : 0;1384 unsigned char* bra = code - offset; 1385 int oldlinkoffset = getLinkValueAllowZero(bra + 1); 1386 bralink = (!oldlinkoffset) ? 
0 : bralink - oldlinkoffset; 1598 1387 *code++ = OP_KET; 1599 put OpcodeValueAtOffsetAndAdvance(code, 0, offset);1600 put OpcodeValueAtOffset(bra,1, offset);1388 putLinkValueAndAdvance(code, offset); 1389 putLinkValue(bra + 1, offset); 1601 1390 } 1602 1391 } … … 1639 1428 if (*(++ptr) == '?') { 1640 1429 switch (*(++ptr)) { 1641 case ':': /* Non-extracting bracket */1642 bravalue = OP_BRA;1643 ptr++;1644 break;1645 1646 case '=': /* Positive lookahead */1647 bravalue = OP_ASSERT;1648 ptr++;1649 break;1650 1651 case '!': /* Negative lookahead */1652 bravalue = OP_ASSERT_NOT;1653 ptr++;1654 break;1655 1430 case ':': /* Non-extracting bracket */ 1431 bravalue = OP_BRA; 1432 ptr++; 1433 break; 1434 1435 case '=': /* Positive lookahead */ 1436 bravalue = OP_ASSERT; 1437 ptr++; 1438 break; 1439 1440 case '!': /* Negative lookahead */ 1441 bravalue = OP_ASSERT_NOT; 1442 ptr++; 1443 break; 1444 1656 1445 /* Character after (? not specially recognized */ 1657 1658 default: /* Option setting */1659 *errorcodeptr = ERR12;1660 goto FAILED;1661 }1446 1447 default: 1448 *errorcodeptr = ERR12; 1449 goto FAILED; 1450 } 1662 1451 } 1663 1452 … … 1670 1459 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; 1671 1460 code[1 + LINK_SIZE] = OP_BRANUMBER; 1672 put2Byte OpcodeValueAtOffset(code, 2+LINK_SIZE, *brackets);1461 put2ByteValue(code + 2 + LINK_SIZE, *brackets); 1673 1462 skipbytes = 3; 1674 1463 } … … 1682 1471 new setting for the ims options if they have changed. */ 1683 1472 1684 previous = (bravalue >= OP_ ONCE) ? code : 0;1473 previous = (bravalue >= OP_BRAZERO) ? 
code : 0; 1685 1474 *code = bravalue; 1686 1475 tempcode = code; … … 1715 1504 groupsetfirstbyte = false; 1716 1505 1717 if (bravalue >= OP_BRA || bravalue == OP_ONCE) {1506 if (bravalue >= OP_BRA) { 1718 1507 /* If we have not yet set a firstbyte in this branch, take it from the 1719 1508 subpattern, remembering that it was set here so that a repeat of more … … 1775 1564 case '\\': 1776 1565 tempptr = ptr; 1777 c = check _escape(&ptr, patternEnd, errorcodeptr, *brackets, false);1566 c = checkEscape(&ptr, patternEnd, errorcodeptr, *brackets, false); 1778 1567 1779 1568 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values … … 1802 1591 previous = code; 1803 1592 *code++ = OP_REF; 1804 put2Byte OpcodeValueAtOffsetAndAdvance(code, 0, number);1593 put2ByteValueAndAdvance(code, number); 1805 1594 } 1806 1595 … … 1838 1627 } 1839 1628 } else { 1840 mclength = _pcre_ord2utf8(c, mcbuffer);1629 mclength = encodeUTF8(c, mcbuffer); 1841 1630 1842 1631 *code++ = (options & IgnoreCaseOption) ? 
OP_CHAR_IGNORING_CASE : OP_CHAR; … … 1888 1677 return false; 1889 1678 } 1890 1891 1892 1893 1679 1894 1680 /************************************************* … … 1919 1705 1920 1706 static bool 1921 compileBracket(int options, int* brackets, u schar** codeptr,1707 compileBracket(int options, int* brackets, unsigned char** codeptr, 1922 1708 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes, 1923 1709 int* firstbyteptr, int* reqbyteptr, CompileData& cd) 1924 1710 { 1925 1711 const UChar* ptr = *ptrptr; 1926 u schar* code = *codeptr;1927 u schar* last_branch = code;1928 u schar* start_bracket = code;1712 unsigned char* code = *codeptr; 1713 unsigned char* last_branch = code; 1714 unsigned char* start_bracket = code; 1929 1715 int firstbyte = REQ_UNSET; 1930 1716 int reqbyte = REQ_UNSET; … … 1932 1718 /* Offset is set zero to mark that this bracket is still open */ 1933 1719 1934 put OpcodeValueAtOffset(code,1, 0);1720 putLinkValueAllowZero(code + 1, 0); 1935 1721 code += 1 + LINK_SIZE + skipbytes; 1936 1722 … … 1998 1784 int length = code - last_branch; 1999 1785 do { 2000 int prev_length = get OpcodeValueAtOffset(last_branch,1);2001 put OpcodeValueAtOffset(last_branch,1, length);1786 int prev_length = getLinkValueAllowZero(last_branch + 1); 1787 putLinkValue(last_branch + 1, length); 2002 1788 length = prev_length; 2003 1789 last_branch -= length; … … 2007 1793 2008 1794 *code = OP_KET; 2009 put OpcodeValueAtOffset(code,1, code - start_bracket);1795 putLinkValue(code + 1, code - start_bracket); 2010 1796 code += 1 + LINK_SIZE; 2011 1797 … … 2025 1811 2026 1812 *code = OP_ALT; 2027 put OpcodeValueAtOffset(code,1, code - last_branch);1813 putLinkValue(code + 1, code - last_branch); 2028 1814 last_branch = code; 2029 1815 code += 1 + LINK_SIZE; … … 2032 1818 ASSERT_NOT_REACHED(); 2033 1819 } 2034 2035 1820 2036 1821 /************************************************* … … 2051 1836 */ 2052 1837 2053 static bool branchIsAnchored(const 
uschar* code)1838 static bool branchIsAnchored(const unsigned char* code) 2054 1839 { 2055 const uschar* scode = firstSignificantOpCode(code);1840 const unsigned char* scode = firstSignificantOpcode(code); 2056 1841 int op = *scode; 2057 1842 2058 1843 /* Brackets */ 2059 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)1844 if (op >= OP_BRA || op == OP_ASSERT) 2060 1845 return bracketIsAnchored(scode); 2061 1846 … … 2064 1849 } 2065 1850 2066 static bool bracketIsAnchored(const uschar* code)1851 static bool bracketIsAnchored(const unsigned char* code) 2067 1852 { 2068 1853 do { 2069 1854 if (!branchIsAnchored(code + 1 + LINK_SIZE)) 2070 1855 return false; 2071 code += getOpcodeValueAtOffset(code, 1);1856 code += getLinkValue(code + 1); 2072 1857 } while (*code == OP_ALT); /* Loop for each alternative */ 2073 1858 return true; … … 2096 1881 */ 2097 1882 2098 static bool branchNeedsLineStart(const uschar* code, unsigned captureMap, unsigned backrefMap)1883 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap) 2099 1884 { 2100 const uschar* scode = firstSignificantOpCode(code);1885 const unsigned char* scode = firstSignificantOpcode(code); 2101 1886 int op = *scode; 2102 1887 … … 2105 1890 int captureNum = op - OP_BRA; 2106 1891 if (captureNum > EXTRACT_BASIC_MAX) 2107 captureNum = get2ByteOpcodeValueAtOffset(scode, 2 + LINK_SIZE);1892 captureNum = get2ByteValue(scode + 2 + LINK_SIZE); 2108 1893 int bracketMask = (captureNum < 32) ? 
(1 << captureNum) : 1; 2109 1894 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap); … … 2111 1896 2112 1897 /* Other brackets */ 2113 if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE)1898 if (op == OP_BRA || op == OP_ASSERT) 2114 1899 return bracketNeedsLineStart(scode, captureMap, backrefMap); 2115 1900 … … 2124 1909 } 2125 1910 2126 static bool bracketNeedsLineStart(const uschar* code, unsigned captureMap, unsigned backrefMap)1911 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap) 2127 1912 { 2128 1913 do { 2129 1914 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap)) 2130 1915 return false; 2131 code += getOpcodeValueAtOffset(code, 1);1916 code += getLinkValue(code + 1); 2132 1917 } while (*code == OP_ALT); /* Loop for each alternative */ 2133 1918 return true; … … 2154 1939 */ 2155 1940 2156 static int branchFindFirstAssertedCharacter(const uschar* code, bool inassert)1941 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert) 2157 1942 { 2158 const uschar* scode = firstSignificantOpCodeSkippingAssertions(code);1943 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code); 2159 1944 int op = *scode; 2160 1945 … … 2168 1953 case OP_BRA: 2169 1954 case OP_ASSERT: 2170 case OP_ONCE:2171 1955 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT); 2172 1956 … … 2187 1971 } 2188 1972 2189 static int bracketFindFirstAssertedCharacter(const uschar* code, bool inassert)1973 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert) 2190 1974 { 2191 1975 int c = -1; … … 2198 1982 else if (c != d) 2199 1983 return -1; 2200 code += getOpcodeValueAtOffset(code, 1);1984 code += getLinkValue(code + 1); 2201 1985 } while (*code == OP_ALT); 2202 1986 return c; … … 2218 2002 unsigned brastackptr = 0; 2219 2003 int brastack[BRASTACK_SIZE]; 2220 uschar bralenstack[BRASTACK_SIZE];2004 
unsigned char bralenstack[BRASTACK_SIZE]; 2221 2005 int bracount = 0; 2222 2006 … … 2233 2017 2234 2018 case '\\': 2235 c = check _escape(&ptr, patternEnd, &errorcode, bracount, false);2019 c = checkEscape(&ptr, patternEnd, &errorcode, bracount, false); 2236 2020 if (errorcode != 0) 2237 2021 return -1; … … 2244 2028 if (c > 127) { 2245 2029 int i; 2246 for (i = 0; i < _pcre_utf8_table1_size; i++)2247 if (c <= _pcre_utf8_table1[i]) break;2030 for (i = 0; i < kjs_pcre_utf8_table1_size; i++) 2031 if (c <= kjs_pcre_utf8_table1[i]) break; 2248 2032 length += i; 2249 2033 lastitemlength += i; … … 2267 2051 cd.top_backref = refnum; 2268 2052 length += 2; /* For single back reference */ 2269 if (safelyCheckNextChar(ptr, patternEnd, '{') && is _counted_repeat(ptr + 2, patternEnd)) {2270 ptr = read _repeat_counts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);2053 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) { 2054 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode); 2271 2055 if (errorcode) 2272 2056 return -1; … … 2299 2083 2300 2084 case '{': 2301 if (!is _counted_repeat(ptr+1, patternEnd))2085 if (!isCountedRepeat(ptr + 1, patternEnd)) 2302 2086 goto NORMAL_CHAR; 2303 ptr = read _repeat_counts(ptr+1, &minRepeats, &maxRepeats, &errorcode);2087 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode); 2304 2088 if (errorcode != 0) 2305 2089 return -1; … … 2366 2150 2367 2151 if (*ptr == '\\') { 2368 c = check _escape(&ptr, patternEnd, &errorcode, bracount, true);2152 c = checkEscape(&ptr, patternEnd, &errorcode, bracount, true); 2369 2153 if (errorcode != 0) 2370 2154 return -1; … … 2401 2185 if (safelyCheckNextChar(ptr, patternEnd, '\\')) { 2402 2186 ptr++; 2403 d = check _escape(&ptr, patternEnd, &errorcode, bracount, true);2187 d = checkEscape(&ptr, patternEnd, &errorcode, bracount, true); 2404 2188 if (errorcode != 0) 2405 2189 return -1; … … 2422 2206 2423 2207 if ((d > 255 || (ignoreCase && d > 
127))) { 2424 u schar buffer[6];2208 unsigned char buffer[6]; 2425 2209 if (!class_utf8) /* Allow for XCLASS overhead */ 2426 2210 { … … 2439 2223 int cc = c; 2440 2224 int origd = d; 2441 while (get _othercase_range(&cc, origd, &occ, &ocd)) {2225 while (getOthercaseRange(&cc, origd, &occ, &ocd)) { 2442 2226 if (occ >= c && ocd <= d) 2443 2227 continue; /* Skip embedded */ … … 2456 2240 /* An extra item is needed */ 2457 2241 2458 length += 1 + _pcre_ord2utf8(occ, buffer) +2459 ((occ == ocd) ? 0 : _pcre_ord2utf8(ocd, buffer));2242 length += 1 + encodeUTF8(occ, buffer) + 2243 ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer)); 2460 2244 } 2461 2245 } … … 2463 2247 /* The length of the (possibly extended) range */ 2464 2248 2465 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);2249 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer); 2466 2250 } 2467 2251 … … 2475 2259 else { 2476 2260 if ((c > 255 || (ignoreCase && c > 127))) { 2477 u schar buffer[6];2261 unsigned char buffer[6]; 2478 2262 class_optcount = 10; /* Ensure > 1 */ 2479 2263 if (!class_utf8) /* Allow for XCLASS overhead */ … … 2482 2266 length += LINK_SIZE + 2; 2483 2267 } 2484 length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));2268 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer)); 2485 2269 } 2486 2270 } … … 2508 2292 we also need extra for wrapping the whole thing in a sub-pattern. */ 2509 2293 2510 if (safelyCheckNextChar(ptr, patternEnd, '{') && is _counted_repeat(ptr+2, patternEnd)) {2511 ptr = read _repeat_counts(ptr+2, &minRepeats, &maxRepeats, &errorcode);2294 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) { 2295 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode); 2512 2296 if (errorcode != 0) 2513 2297 return -1; … … 2538 2322 if (safelyCheckNextChar(ptr, patternEnd, '?')) { 2539 2323 switch (c = (ptr + 2 < patternEnd ? 
ptr[2] : 0)) { 2540 2541 2542 2543 2324 /* Non-referencing groups and lookaheads just move the pointer on, and 2325 then behave like a non-special bracket, except that they don't increment 2326 the count of extracting brackets. Ditto for the "once only" bracket, 2327 which is in Perl from version 5.005. */ 2544 2328 2545 2329 case ':': … … 2549 2333 break; 2550 2334 2551 2552 2553 2554 2335 /* Else loop checking valid options until ) is met. Anything else is an 2336 error. If we are without any brackets, i.e. at top level, the settings 2337 act as if specified in the options, so massage the options immediately. 2338 This is for backward compatibility with Perl 5.004. */ 2555 2339 2556 2340 default: … … 2605 2389 duplength = 0; 2606 2390 2607 /* Leave ptr at the final char; for read _repeat_counts this happens2391 /* Leave ptr at the final char; for readRepeatCounts this happens 2608 2392 automatically; for the others we need an increment. */ 2609 2393 2610 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && is _counted_repeat(ptr+2, patternEnd)) {2611 ptr = read _repeat_counts(ptr+2, &minRepeats, &maxRepeats, &errorcode);2394 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) { 2395 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode); 2612 2396 if (errorcode) 2613 2397 return -1; … … 2672 2456 if (c > 127) { 2673 2457 int i; 2674 for (i = 0; i < _pcre_utf8_table1_size; i++)2675 if (c <= _pcre_utf8_table1[i])2458 for (i = 0; i < kjs_pcre_utf8_table1_size; i++) 2459 if (c <= kjs_pcre_utf8_table1[i]) 2676 2460 break; 2677 2461 length += i; … … 2709 2493 */ 2710 2494 2711 static JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)2495 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorptr) 2712 2496 { 2713 *errorptr = error _text(errorcode);2497 *errorptr = errorText(errorcode); 2714 2498 return 0; 2715 2499 } … … 2746 2530 passed around in the compile data block. 
*/ 2747 2531 2748 const u schar* codeStart = (const uschar*)(re + 1);2532 const unsigned char* codeStart = (const unsigned char*)(re + 1); 2749 2533 cd.start_code = codeStart; 2750 2534 cd.start_pattern = (const UChar*)pattern; … … 2756 2540 const UChar* ptr = (const UChar*)pattern; 2757 2541 const UChar* patternEnd = pattern + patternLength; 2758 u schar* code = (uschar*)codeStart;2542 unsigned char* code = (unsigned char*)codeStart; 2759 2543 int firstbyte, reqbyte; 2760 2544 int bracketCount = 0;
Note:
See TracChangeset
for help on using the changeset viewer.