Changeset 27828 in webkit for trunk/JavaScriptCore/pcre/pcre_exec.cpp
- Timestamp:
- Nov 15, 2007, 4:14:33 PM (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/pcre/pcre_exec.cpp
r27805 r27828 333 333 */ 334 334 335 static int match(USPTR eptr, const uschar *ecode, int offset_top, match_data *md)335 static int match(USPTR eptr, const uschar* ecode, int offset_top, match_data* md) 336 336 { 337 register int is_match = false;338 register int i;339 register int c;340 341 unsigned rdepth = 0;342 343 BOOL cur_is_word;344 BOOL prev_is_word;345 BOOL is_group_start = true;346 int min;347 BOOL minimize = false; /* Initialization not really needed, but some compilers think so. */348 349 /* The value 16 here is large enough that most regular expressions don't require350 any calls to pcre_stack_malloc, yet the amount of stack used for the array is351 modest enough that we don't run out of stack. */352 matchframe stackframes[16];353 matchframe *stackframesend = stackframes + sizeof(stackframes) / sizeof(stackframes[0]);354 355 matchframe *frame = stackframes;356 matchframe *newframe;357 358 /* The opcode jump table. */337 register int is_match = false; 338 register int i; 339 register int c; 340 341 unsigned rdepth = 0; 342 343 BOOL cur_is_word; 344 BOOL prev_is_word; 345 BOOL is_group_start = true; 346 int min; 347 BOOL minimize = false; /* Initialization not really needed, but some compilers think so. */ 348 349 /* The value 16 here is large enough that most regular expressions don't require 350 any calls to pcre_stack_malloc, yet the amount of stack used for the array is 351 modest enough that we don't run out of stack. */ 352 matchframe stackframes[16]; 353 matchframe *stackframesend = stackframes + sizeof(stackframes) / sizeof(stackframes[0]); 354 355 matchframe *frame = stackframes; 356 matchframe *newframe; 357 358 /* The opcode jump table. */ 359 359 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 360 360 #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode, 361 static void* opcode_jump_table[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) };361 static void* opcode_jump_table[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) }; 362 362 #undef EMIT_JUMP_TABLE_ENTRY 363 363 #endif 364 365 /* One-time setup of the opcode jump table. */364 365 /* One-time setup of the opcode jump table. */ 366 366 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 367 i = 255;368 while (!opcode_jump_table[i])369 opcode_jump_table[i--] = &&CAPTURING_BRACKET;367 i = 255; 368 while (!opcode_jump_table[i]) 369 opcode_jump_table[i--] = &&CAPTURING_BRACKET; 370 370 #endif 371 371 372 372 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 373 frame->where = &&RETURN;373 frame->where = &&RETURN; 374 374 #else 375 frame->where = 0;375 frame->where = 0; 376 376 #endif 377 378 frame->eptr = eptr;379 frame->ecode = ecode;380 frame->offset_top = offset_top;381 frame->eptrb = NULL;382 383 /* This is where control jumps back to to effect "recursion" */384 377 378 frame->eptr = eptr; 379 frame->ecode = ecode; 380 frame->offset_top = offset_top; 381 frame->eptrb = NULL; 382 383 /* This is where control jumps back to to effect "recursion" */ 384 385 385 RECURSE: 386 387 /* OK, now we can get on with the real code of the function. Recursive calls 388 are specified by the macro RMATCH and RRETURN is used to return. When 389 NO_RECURSE is *not* defined, these just turn into a recursive call to match() 390 and a "return", respectively (possibly with some debugging if DEBUG is 391 defined). However, RMATCH isn't like a function call because it's quite a 392 complicated macro. It has to be used in one particular way. This shouldn't, 393 however, impact performance when true recursion is being used. */ 394 395 /* First check that we haven't called match() too many times, or that we 396 haven't exceeded the recursive call limit. */ 397 398 if (md->match_call_count++ >= MATCH_LIMIT) RRETURN_ERROR(JSRegExpErrorMatchLimit); 399 if (rdepth >= MATCH_LIMIT_RECURSION) RRETURN_ERROR(JSRegExpErrorRecursionLimit); 400 401 /* At the start of a bracketed group, add the current subject pointer to the 402 stack of such pointers, to be re-instated at the end of the group when we hit 403 the closing ket. When match() is called in other circumstances, we don't add to 404 this stack. */ 405 406 if (is_group_start) 407 { 408 frame->newptrb.epb_prev = frame->eptrb; 409 frame->newptrb.epb_saved_eptr = frame->eptr; 410 frame->eptrb = &frame->newptrb; 411 } 412 413 /* Now start processing the operations. */ 414 386 387 /* OK, now we can get on with the real code of the function. Recursive calls 388 are specified by the macro RMATCH and RRETURN is used to return. When 389 NO_RECURSE is *not* defined, these just turn into a recursive call to match() 390 and a "return", respectively (possibly with some debugging if DEBUG is 391 defined). However, RMATCH isn't like a function call because it's quite a 392 complicated macro. It has to be used in one particular way. This shouldn't, 393 however, impact performance when true recursion is being used. */ 394 395 /* First check that we haven't called match() too many times, or that we 396 haven't exceeded the recursive call limit. */ 397 398 if (md->match_call_count++ >= MATCH_LIMIT) 399 RRETURN_ERROR(JSRegExpErrorMatchLimit); 400 if (rdepth >= MATCH_LIMIT_RECURSION) 401 RRETURN_ERROR(JSRegExpErrorRecursionLimit); 402 403 /* At the start of a bracketed group, add the current subject pointer to the 404 stack of such pointers, to be re-instated at the end of the group when we hit 405 the closing ket. When match() is called in other circumstances, we don't add to 406 this stack. */ 407 408 if (is_group_start) { 409 frame->newptrb.epb_prev = frame->eptrb; 410 frame->newptrb.epb_saved_eptr = frame->eptr; 411 frame->eptrb = &frame->newptrb; 412 } 413 414 /* Now start processing the operations. */ 415 415 416 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 416 for (;;) 417 #endif 418 { 419 420 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 421 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode 422 #define NEXT_OPCODE goto *opcode_jump_table[*frame->ecode] 423 #else 424 #define BEGIN_OPCODE(opcode) case OP_##opcode 425 #define NEXT_OPCODE continue 426 #endif 427 428 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 429 NEXT_OPCODE; 430 #else 431 switch (*frame->ecode) 417 while (true) 432 418 #endif 433 419 { 434 /* Non-capturing bracket: optimized */ 435 436 BEGIN_OPCODE(BRA): 437 NON_CAPTURING_BRACKET: 438 DPRINTF(("start bracket 0\n")); 439 do 440 { 441 RMATCH(2, frame->ecode + 1 + LINK_SIZE, frame->eptrb, match_isgroup); 442 if (is_match) RRETURN; 443 frame->ecode += GET(frame->ecode, 1); 444 } 445 while (*frame->ecode == OP_ALT); 446 DPRINTF(("bracket 0 failed\n")); 447 RRETURN; 448 449 /* Skip over large extraction number data if encountered. */ 450 451 BEGIN_OPCODE(BRANUMBER): 452 frame->ecode += 3; 453 NEXT_OPCODE; 454 455 /* End of the pattern. */ 456 457 BEGIN_OPCODE(END): 458 md->end_match_ptr = frame->eptr; /* Record where we ended */ 459 md->end_offset_top = frame->offset_top; /* and how many extracts were taken */ 460 is_match = true; 461 RRETURN; 462 463 /* Assertion brackets. Check the alternative branches in turn - the 464 matching won't pass the KET for an assertion. If any one branch matches, 465 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 466 start of each branch to move the current point backwards, so the code at 467 this level is identical to the lookahead case. */ 468 469 BEGIN_OPCODE(ASSERT): 470 do 471 { 472 RMATCH(6, frame->ecode + 1 + LINK_SIZE, NULL, match_isgroup); 473 if (is_match) break; 474 frame->ecode += GET(frame->ecode, 1); 475 } 476 while (*frame->ecode == OP_ALT); 477 if (*frame->ecode == OP_KET) RRETURN_NO_MATCH; 478 479 /* Continue from after the assertion, updating the offsets high water 480 mark, since extracts may have been taken during the assertion. */ 481 482 do frame->ecode += GET(frame->ecode,1); while (*frame->ecode == OP_ALT); 483 frame->ecode += 1 + LINK_SIZE; 484 frame->offset_top = md->end_offset_top; 485 NEXT_OPCODE; 486 487 /* Negative assertion: all branches must fail to match */ 488 489 BEGIN_OPCODE(ASSERT_NOT): 490 do 491 { 492 RMATCH(7, frame->ecode + 1 + LINK_SIZE, NULL, match_isgroup); 493 if (is_match) RRETURN_NO_MATCH; 494 frame->ecode += GET(frame->ecode,1); 495 } 496 while (*frame->ecode == OP_ALT); 497 498 frame->ecode += 1 + LINK_SIZE; 499 NEXT_OPCODE; 500 501 /* "Once" brackets are like assertion brackets except that after a match, 502 the point in the subject string is not moved back. Thus there can never be 503 a move back into the brackets. Friedl calls these "atomic" subpatterns. 504 Check the alternative branches in turn - the matching won't pass the KET 505 for this kind of subpattern. If any one branch matches, we carry on as at 506 the end of a normal bracket, leaving the subject pointer. */ 507 508 BEGIN_OPCODE(ONCE): 509 frame->prev = frame->ecode; 510 frame->saved_eptr = frame->eptr; 511 512 do 420 421 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 422 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode 423 #define NEXT_OPCODE goto *opcode_jump_table[*frame->ecode] 424 #else 425 #define BEGIN_OPCODE(opcode) case OP_##opcode 426 #define NEXT_OPCODE continue 427 #endif 428 429 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 430 NEXT_OPCODE; 431 #else 432 switch (*frame->ecode) 433 #endif 513 434 { 514 RMATCH(9, frame->ecode + 1 + LINK_SIZE, frame->eptrb, match_isgroup); 515 if (is_match) break; 516 frame->ecode += GET(frame->ecode,1); 435 /* Non-capturing bracket: optimized */ 436 437 BEGIN_OPCODE(BRA): 438 NON_CAPTURING_BRACKET: 439 DPRINTF(("start bracket 0\n")); 440 do { 441 RMATCH(2, frame->ecode + 1 + LINK_SIZE, frame->eptrb, match_isgroup); 442 if (is_match) 443 RRETURN; 444 frame->ecode += GET(frame->ecode, 1); 445 } while (*frame->ecode == OP_ALT); 446 DPRINTF(("bracket 0 failed\n")); 447 RRETURN; 448 449 /* Skip over large extraction number data if encountered. */ 450 451 BEGIN_OPCODE(BRANUMBER): 452 frame->ecode += 3; 453 NEXT_OPCODE; 454 455 /* End of the pattern. */ 456 457 BEGIN_OPCODE(END): 458 md->end_match_ptr = frame->eptr; /* Record where we ended */ 459 md->end_offset_top = frame->offset_top; /* and how many extracts were taken */ 460 is_match = true; 461 RRETURN; 462 463 /* Assertion brackets. Check the alternative branches in turn - the 464 matching won't pass the KET for an assertion. If any one branch matches, 465 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 466 start of each branch to move the current point backwards, so the code at 467 this level is identical to the lookahead case. */ 468 469 BEGIN_OPCODE(ASSERT): 470 do { 471 RMATCH(6, frame->ecode + 1 + LINK_SIZE, NULL, match_isgroup); 472 if (is_match) break; 473 frame->ecode += GET(frame->ecode, 1); 474 } while (*frame->ecode == OP_ALT); 475 if (*frame->ecode == OP_KET) 476 RRETURN_NO_MATCH; 477 478 /* Continue from after the assertion, updating the offsets high water 479 mark, since extracts may have been taken during the assertion. */ 480 481 do frame->ecode += GET(frame->ecode,1); while (*frame->ecode == OP_ALT); 482 frame->ecode += 1 + LINK_SIZE; 483 frame->offset_top = md->end_offset_top; 484 NEXT_OPCODE; 485 486 /* Negative assertion: all branches must fail to match */ 487 488 BEGIN_OPCODE(ASSERT_NOT): 489 do { 490 RMATCH(7, frame->ecode + 1 + LINK_SIZE, NULL, match_isgroup); 491 if (is_match) 492 RRETURN_NO_MATCH; 493 frame->ecode += GET(frame->ecode,1); 494 } while (*frame->ecode == OP_ALT); 495 496 frame->ecode += 1 + LINK_SIZE; 497 NEXT_OPCODE; 498 499 /* "Once" brackets are like assertion brackets except that after a match, 500 the point in the subject string is not moved back. Thus there can never be 501 a move back into the brackets. Friedl calls these "atomic" subpatterns. 502 Check the alternative branches in turn - the matching won't pass the KET 503 for this kind of subpattern. If any one branch matches, we carry on as at 504 the end of a normal bracket, leaving the subject pointer. */ 505 506 BEGIN_OPCODE(ONCE): 507 frame->prev = frame->ecode; 508 frame->saved_eptr = frame->eptr; 509 510 do { 511 RMATCH(9, frame->ecode + 1 + LINK_SIZE, frame->eptrb, match_isgroup); 512 if (is_match) 513 break; 514 frame->ecode += GET(frame->ecode,1); 515 } while (*frame->ecode == OP_ALT); 516 517 /* If hit the end of the group (which could be repeated), fail */ 518 519 if (*frame->ecode != OP_ONCE && *frame->ecode != OP_ALT) 520 RRETURN; 521 522 /* Continue as from after the assertion, updating the offsets high water 523 mark, since extracts may have been taken. */ 524 525 do frame->ecode += GET(frame->ecode,1); while (*frame->ecode == OP_ALT); 526 527 frame->offset_top = md->end_offset_top; 528 frame->eptr = md->end_match_ptr; 529 530 /* For a non-repeating ket, just continue at this level. This also 531 happens for a repeating ket if no characters were matched in the group. 532 This is the forcible breaking of infinite loops as implemented in Perl 533 5.005. If there is an options reset, it will get obeyed in the normal 534 course of events. */ 535 536 if (*frame->ecode == OP_KET || frame->eptr == frame->saved_eptr) { 537 frame->ecode += 1+LINK_SIZE; 538 NEXT_OPCODE; 539 } 540 541 /* The repeating kets try the rest of the pattern or restart from the 542 preceding bracket, in the appropriate order. We need to reset any options 543 that changed within the bracket before re-running it, so check the next 544 opcode. */ 545 546 if (*frame->ecode == OP_KETRMIN) { 547 RMATCH(10, frame->ecode + 1 + LINK_SIZE, frame->eptrb, 0); 548 if (is_match) 549 RRETURN; 550 RMATCH(11, frame->prev, frame->eptrb, match_isgroup); 551 if (is_match) 552 RRETURN; 553 } else { /* OP_KETRMAX */ 554 RMATCH(12, frame->prev, frame->eptrb, match_isgroup); 555 if (is_match) 556 RRETURN; 557 RMATCH(13, frame->ecode + 1+LINK_SIZE, frame->eptrb, 0); 558 if (is_match) 559 RRETURN; 560 } 561 RRETURN; 562 563 /* An alternation is the end of a branch; scan along to find the end of the 564 bracketed group and go to there. */ 565 566 BEGIN_OPCODE(ALT): 567 do frame->ecode += GET(frame->ecode,1); while (*frame->ecode == OP_ALT); 568 NEXT_OPCODE; 569 570 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating 571 that it may occur zero times. It may repeat infinitely, or not at all - 572 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper 573 repeat limits are compiled as a number of copies, with the optional ones 574 preceded by BRAZERO or BRAMINZERO. */ 575 576 BEGIN_OPCODE(BRAZERO): 577 { 578 frame->next = frame->ecode+1; 579 RMATCH(14, frame->next, frame->eptrb, match_isgroup); 580 if (is_match) 581 RRETURN; 582 do frame->next += GET(frame->next,1); while (*frame->next == OP_ALT); 583 frame->ecode = frame->next + 1+LINK_SIZE; 584 } 585 NEXT_OPCODE; 586 587 BEGIN_OPCODE(BRAMINZERO): 588 { 589 frame->next = frame->ecode+1; 590 do frame->next += GET(frame->next,1); while (*frame->next == OP_ALT); 591 RMATCH(15, frame->next + 1+LINK_SIZE, frame->eptrb, match_isgroup); 592 if (is_match) 593 RRETURN; 594 frame->ecode++; 595 } 596 NEXT_OPCODE; 597 598 /* End of a group, repeated or non-repeating. If we are at the end of 599 an assertion "group", stop matching and return MATCH_MATCH, but record the 600 current high water mark for use by positive assertions. Do this also 601 for the "once" (not-backup up) groups. */ 602 603 BEGIN_OPCODE(KET): 604 BEGIN_OPCODE(KETRMIN): 605 BEGIN_OPCODE(KETRMAX): 606 frame->prev = frame->ecode - GET(frame->ecode, 1); 607 frame->saved_eptr = frame->eptrb->epb_saved_eptr; 608 609 /* Back up the stack of bracket start pointers. */ 610 611 frame->eptrb = frame->eptrb->epb_prev; 612 613 if (*frame->prev == OP_ASSERT || *frame->prev == OP_ASSERT_NOT || *frame->prev == OP_ONCE) { 614 md->end_match_ptr = frame->eptr; /* For ONCE */ 615 md->end_offset_top = frame->offset_top; 616 is_match = true; 617 RRETURN; 618 } 619 620 /* In all other cases except a conditional group we have to check the 621 group number back at the start and if necessary complete handling an 622 extraction by setting the offsets and bumping the high water mark. */ 623 624 frame->number = *frame->prev - OP_BRA; 625 626 /* For extended extraction brackets (large number), we have to fish out 627 the number from a dummy opcode at the start. */ 628 629 if (frame->number > EXTRACT_BASIC_MAX) 630 frame->number = GET2(frame->prev, 2+LINK_SIZE); 631 frame->offset = frame->number << 1; 632 633 #ifdef DEBUG 634 printf("end bracket %d", frame->number); 635 printf("\n"); 636 #endif 637 638 /* Test for a numbered group. This includes groups called as a result 639 of recursion. Note that whole-pattern recursion is coded as a recurse 640 into group 0, so it won't be picked up here. Instead, we catch it when 641 the OP_END is reached. */ 642 643 if (frame->number > 0) { 644 if (frame->offset >= md->offset_max) 645 md->offset_overflow = true; 646 else { 647 md->offset_vector[frame->offset] = 648 md->offset_vector[md->offset_end - frame->number]; 649 md->offset_vector[frame->offset+1] = frame->eptr - md->start_subject; 650 if (frame->offset_top <= frame->offset) 651 frame->offset_top = frame->offset + 2; 652 } 653 } 654 655 /* For a non-repeating ket, just continue at this level. This also 656 happens for a repeating ket if no characters were matched in the group. 657 This is the forcible breaking of infinite loops as implemented in Perl 658 5.005. If there is an options reset, it will get obeyed in the normal 659 course of events. */ 660 661 if (*frame->ecode == OP_KET || frame->eptr == frame->saved_eptr) { 662 frame->ecode += 1 + LINK_SIZE; 663 NEXT_OPCODE; 664 } 665 666 /* The repeating kets try the rest of the pattern or restart from the 667 preceding bracket, in the appropriate order. */ 668 669 if (*frame->ecode == OP_KETRMIN) { 670 RMATCH(16, frame->ecode + 1+LINK_SIZE, frame->eptrb, 0); 671 if (is_match) 672 RRETURN; 673 RMATCH(17, frame->prev, frame->eptrb, match_isgroup); 674 if (is_match) 675 RRETURN; 676 } else { /* OP_KETRMAX */ 677 RMATCH(18, frame->prev, frame->eptrb, match_isgroup); 678 if (is_match) 679 RRETURN; 680 RMATCH(19, frame->ecode + 1+LINK_SIZE, frame->eptrb, 0); 681 if (is_match) 682 RRETURN; 683 } 684 RRETURN; 685 686 /* Start of subject, or after internal newline if multiline. */ 687 688 BEGIN_OPCODE(CIRC): 689 if (frame->eptr != md->start_subject && (!md->multiline || !isNewline(frame->eptr[-1]))) 690 RRETURN_NO_MATCH; 691 frame->ecode++; 692 NEXT_OPCODE; 693 694 /* End of subject, or before internal newline if multiline. */ 695 696 BEGIN_OPCODE(DOLL): 697 if (frame->eptr < md->end_subject && (!md->multiline || !isNewline(*frame->eptr))) 698 RRETURN_NO_MATCH; 699 frame->ecode++; 700 NEXT_OPCODE; 701 702 /* Word boundary assertions */ 703 704 BEGIN_OPCODE(NOT_WORD_BOUNDARY): 705 BEGIN_OPCODE(WORD_BOUNDARY): 706 /* Find out if the previous and current characters are "word" characters. 707 It takes a bit more work in UTF-8 mode. Characters > 128 are assumed to 708 be "non-word" characters. */ 709 710 if (frame->eptr == md->start_subject) 711 prev_is_word = false; 712 else { 713 const pcre_uchar *lastptr = frame->eptr - 1; 714 while(ISMIDCHAR(*lastptr)) 715 lastptr--; 716 GETCHAR(c, lastptr); 717 prev_is_word = c < 128 && (md->ctypes[c] & ctype_word) != 0; 718 } 719 if (frame->eptr >= md->end_subject) 720 cur_is_word = false; 721 else { 722 GETCHAR(c, frame->eptr); 723 cur_is_word = c < 128 && (md->ctypes[c] & ctype_word) != 0; 724 } 725 726 /* Now see if the situation is what we want */ 727 728 if ((*frame->ecode++ == OP_WORD_BOUNDARY) ? cur_is_word == prev_is_word : cur_is_word != prev_is_word) 729 RRETURN_NO_MATCH; 730 NEXT_OPCODE; 731 732 /* Match a single character type; inline for speed */ 733 734 BEGIN_OPCODE(ANY): 735 if (frame->eptr < md->end_subject && isNewline(*frame->eptr)) 736 RRETURN_NO_MATCH; 737 if (frame->eptr++ >= md->end_subject) 738 RRETURN_NO_MATCH; 739 while (frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)) 740 frame->eptr++; 741 frame->ecode++; 742 NEXT_OPCODE; 743 744 BEGIN_OPCODE(NOT_DIGIT): 745 if (frame->eptr >= md->end_subject) 746 RRETURN_NO_MATCH; 747 GETCHARINCTEST(c, frame->eptr); 748 if (isASCIIDigit(c)) 749 RRETURN_NO_MATCH; 750 frame->ecode++; 751 NEXT_OPCODE; 752 753 BEGIN_OPCODE(DIGIT): 754 if (frame->eptr >= md->end_subject) 755 RRETURN_NO_MATCH; 756 GETCHARINCTEST(c, frame->eptr); 757 if (!isASCIIDigit(c)) 758 RRETURN_NO_MATCH; 759 frame->ecode++; 760 NEXT_OPCODE; 761 762 BEGIN_OPCODE(NOT_WHITESPACE): 763 if (frame->eptr >= md->end_subject) 764 RRETURN_NO_MATCH; 765 GETCHARINCTEST(c, frame->eptr); 766 if (c < 128 && (md->ctypes[c] & ctype_space)) 767 RRETURN_NO_MATCH; 768 frame->ecode++; 769 NEXT_OPCODE; 770 771 BEGIN_OPCODE(WHITESPACE): 772 if (frame->eptr >= md->end_subject) 773 RRETURN_NO_MATCH; 774 GETCHARINCTEST(c, frame->eptr); 775 if (c >= 128 || !(md->ctypes[c] & ctype_space)) 776 RRETURN_NO_MATCH; 777 frame->ecode++; 778 NEXT_OPCODE; 779 780 BEGIN_OPCODE(NOT_WORDCHAR): 781 if (frame->eptr >= md->end_subject) 782 RRETURN_NO_MATCH; 783 GETCHARINCTEST(c, frame->eptr); 784 if (c < 128 && (md->ctypes[c] & ctype_word)) 785 RRETURN_NO_MATCH; 786 frame->ecode++; 787 NEXT_OPCODE; 788 789 BEGIN_OPCODE(WORDCHAR): 790 if (frame->eptr >= md->end_subject) 791 RRETURN_NO_MATCH; 792 GETCHARINCTEST(c, frame->eptr); 793 if (c >= 128 || !(md->ctypes[c] & ctype_word)) 794 RRETURN_NO_MATCH; 795 frame->ecode++; 796 NEXT_OPCODE; 797 798 /* Match a back reference, possibly repeatedly. Look past the end of the 799 item to see if there is repeat information following. The code is similar 800 to that for character classes, but repeated for efficiency. Then obey 801 similar code to character type repeats - written out again for speed. 802 However, if the referenced string is the empty string, always treat 803 it as matched, any number of times (otherwise there could be infinite 804 loops). */ 805 806 BEGIN_OPCODE(REF): 807 frame->offset = GET2(frame->ecode, 1) << 1; /* Doubled ref number */ 808 frame->ecode += 3; /* Advance past item */ 809 810 /* If the reference is unset, set the length to be longer than the amount 811 of subject left; this ensures that every attempt at a match fails. We 812 can't just fail here, because of the possibility of quantifiers with zero 813 minima. */ 814 815 if (frame->offset >= frame->offset_top || md->offset_vector[frame->offset] < 0) 816 frame->length = 0; 817 else 818 frame->length = md->offset_vector[frame->offset+1] - md->offset_vector[frame->offset]; 819 820 /* Set up for repetition, or handle the non-repeated case */ 821 822 switch (*frame->ecode) { 823 case OP_CRSTAR: 824 case OP_CRMINSTAR: 825 case OP_CRPLUS: 826 case OP_CRMINPLUS: 827 case OP_CRQUERY: 828 case OP_CRMINQUERY: 829 c = *frame->ecode++ - OP_CRSTAR; 830 minimize = (c & 1) != 0; 831 min = rep_min[c]; /* Pick up values from tables; */ 832 frame->max = rep_max[c]; /* zero for max => infinity */ 833 if (frame->max == 0) 834 frame->max = INT_MAX; 835 break; 836 837 case OP_CRRANGE: 838 case OP_CRMINRANGE: 839 minimize = (*frame->ecode == OP_CRMINRANGE); 840 min = GET2(frame->ecode, 1); 841 frame->max = GET2(frame->ecode, 3); 842 if (frame->max == 0) 843 frame->max = INT_MAX; 844 frame->ecode += 5; 845 break; 846 847 default: /* No repeat follows */ 848 if (!match_ref(frame->offset, frame->eptr, frame->length, md)) 849 RRETURN_NO_MATCH; 850 frame->eptr += frame->length; 851 NEXT_OPCODE; 852 } 853 854 /* If the length of the reference is zero, just continue with the 855 main loop. */ 856 857 if (frame->length == 0) 858 NEXT_OPCODE; 859 860 /* First, ensure the minimum number of matches are present. */ 861 862 for (i = 1; i <= min; i++) { 863 if (!match_ref(frame->offset, frame->eptr, frame->length, md)) 864 RRETURN_NO_MATCH; 865 frame->eptr += frame->length; 866 } 867 868 /* If min = max, continue at the same level without recursion. 869 They are not both allowed to be zero. */ 870 871 if (min == frame->max) 872 NEXT_OPCODE; 873 874 /* If minimizing, keep trying and advancing the pointer */ 875 876 if (minimize) { 877 for (frame->fi = min;; frame->fi++) { 878 RMATCH(20, frame->ecode, frame->eptrb, 0); 879 if (is_match) 880 RRETURN; 881 if (frame->fi >= frame->max || !match_ref(frame->offset, frame->eptr, frame->length, md)) 882 RRETURN; 883 frame->eptr += frame->length; 884 } 885 ASSERT_NOT_REACHED(); 886 } 887 888 /* If maximizing, find the longest string and work backwards */ 889 890 else { 891 frame->pp = frame->eptr; 892 for (i = min; i < frame->max; i++) { 893 if (!match_ref(frame->offset, frame->eptr, frame->length, md)) 894 break; 895 frame->eptr += frame->length; 896 } 897 while (frame->eptr >= frame->pp) { 898 RMATCH(21, frame->ecode, frame->eptrb, 0); 899 if (is_match) 900 RRETURN; 901 frame->eptr -= frame->length; 902 } 903 RRETURN_NO_MATCH; 904 } 905 ASSERT_NOT_REACHED(); 906 907 /* Match a bit-mapped character class, possibly repeatedly. This op code is 908 used when all the characters in the class have values in the range 0-255, 909 and either the matching is caseful, or the characters are in the range 910 0-127 when UTF-8 processing is enabled. The only difference between 911 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 912 encountered. 913 914 First, look past the end of the item to see if there is repeat information 915 following. Then obey similar code to character type repeats - written out 916 again for speed. */ 917 918 BEGIN_OPCODE(NCLASS): 919 BEGIN_OPCODE(CLASS): 920 frame->data = frame->ecode + 1; /* Save for matching */ 921 frame->ecode += 33; /* Advance past the item */ 922 923 switch (*frame->ecode) { 924 case OP_CRSTAR: 925 case OP_CRMINSTAR: 926 case OP_CRPLUS: 927 case OP_CRMINPLUS: 928 case OP_CRQUERY: 929 case OP_CRMINQUERY: 930 c = *frame->ecode++ - OP_CRSTAR; 931 minimize = (c & 1) != 0; 932 min = rep_min[c]; /* Pick up values from tables; */ 933 frame->max = rep_max[c]; /* zero for max => infinity */ 934 if (frame->max == 0) 935 frame->max = INT_MAX; 936 break; 937 938 case OP_CRRANGE: 939 case OP_CRMINRANGE: 940 minimize = (*frame->ecode == OP_CRMINRANGE); 941 min = GET2(frame->ecode, 1); 942 frame->max = GET2(frame->ecode, 3); 943 if (frame->max == 0) 944 frame->max = INT_MAX; 945 frame->ecode += 5; 946 break; 947 948 default: /* No repeat follows */ 949 min = frame->max = 1; 950 break; 951 } 952 953 /* First, ensure the minimum number of matches are present. */ 954 955 for (i = 1; i <= min; i++) { 956 if (frame->eptr >= md->end_subject) 957 RRETURN_NO_MATCH; 958 GETCHARINC(c, frame->eptr); 959 if (c > 255) { 960 if (frame->data[-1] == OP_CLASS) 961 RRETURN_NO_MATCH; 962 } else { 963 if ((frame->data[c/8] & (1 << (c&7))) == 0) 964 RRETURN_NO_MATCH; 965 } 966 } 967 968 /* If max == min we can continue with the main loop without the 969 need to recurse. */ 970 971 if (min == frame->max) 972 NEXT_OPCODE; 973 974 /* If minimizing, keep testing the rest of the expression and advancing 975 the pointer while it matches the class. */ 976 if (minimize) { 977 { 978 for (frame->fi = min;; frame->fi++) { 979 RMATCH(22, frame->ecode, frame->eptrb, 0); 980 if (is_match) 981 RRETURN; 982 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) 983 RRETURN; 984 GETCHARINC(c, frame->eptr); 985 if (c > 255) { 986 if (frame->data[-1] == OP_CLASS) 987 RRETURN; 988 } else { 989 if ((frame->data[c/8] & (1 << (c&7))) == 0) 990 RRETURN; 991 } 992 } 993 } 994 ASSERT_NOT_REACHED(); 995 } 996 /* If maximizing, find the longest possible run, then work backwards. */ 997 else { 998 frame->pp = frame->eptr; 999 1000 for (i = min; i < frame->max; i++) { 1001 int len = 1; 1002 if (frame->eptr >= md->end_subject) 1003 break; 1004 GETCHARLEN(c, frame->eptr, len); 1005 if (c > 255) { 1006 if (frame->data[-1] == OP_CLASS) 1007 break; 1008 } else { 1009 if ((frame->data[c/8] & (1 << (c&7))) == 0) 1010 break; 1011 } 1012 frame->eptr += len; 1013 } 1014 for (;;) { 1015 RMATCH(24, frame->ecode, frame->eptrb, 0); 1016 if (is_match) 1017 RRETURN; 1018 if (frame->eptr-- == frame->pp) 1019 break; /* Stop if tried at original pos */ 1020 BACKCHAR(frame->eptr); 1021 } 1022 1023 RRETURN; 1024 } 1025 ASSERT_NOT_REACHED(); 1026 1027 /* Match an extended character class. This opcode is encountered only 1028 in UTF-8 mode, because that's the only time it is compiled. */ 1029 1030 BEGIN_OPCODE(XCLASS): 1031 frame->data = frame->ecode + 1 + LINK_SIZE; /* Save for matching */ 1032 frame->ecode += GET(frame->ecode, 1); /* Advance past the item */ 1033 1034 switch (*frame->ecode) { 1035 case OP_CRSTAR: 1036 case OP_CRMINSTAR: 1037 case OP_CRPLUS: 1038 case OP_CRMINPLUS: 1039 case OP_CRQUERY: 1040 case OP_CRMINQUERY: 1041 c = *frame->ecode++ - OP_CRSTAR; 1042 minimize = (c & 1) != 0; 1043 min = rep_min[c]; /* Pick up values from tables; */ 1044 frame->max = rep_max[c]; /* zero for max => infinity */ 1045 if (frame->max == 0) 1046 frame->max = INT_MAX; 1047 break; 1048 1049 case OP_CRRANGE: 1050 case OP_CRMINRANGE: 1051 minimize = (*frame->ecode == OP_CRMINRANGE); 1052 min = GET2(frame->ecode, 1); 1053 frame->max = GET2(frame->ecode, 3); 1054 if (frame->max == 0) 1055 frame->max = INT_MAX; 1056 frame->ecode += 5; 1057 break; 1058 1059 default: /* No repeat follows */ 1060 min = frame->max = 1; 1061 } 1062 1063 /* First, ensure the minimum number of matches are present. */ 1064 1065 for (i = 1; i <= min; i++) { 1066 if (frame->eptr >= md->end_subject) 1067 RRETURN_NO_MATCH; 1068 GETCHARINC(c, frame->eptr); 1069 if (!_pcre_xclass(c, frame->data)) 1070 RRETURN_NO_MATCH; 1071 } 1072 1073 /* If max == min we can continue with the main loop without the 1074 need to recurse. */ 1075 1076 if (min == frame->max) 1077 NEXT_OPCODE; 1078 1079 /* If minimizing, keep testing the rest of the expression and advancing 1080 the pointer while it matches the class. */ 1081 1082 if (minimize) { 1083 for (frame->fi = min;; frame->fi++) { 1084 RMATCH(26, frame->ecode, frame->eptrb, 0); 1085 if (is_match) 1086 RRETURN; 1087 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) 1088 RRETURN; 1089 GETCHARINC(c, frame->eptr); 1090 if (!_pcre_xclass(c, frame->data)) 1091 RRETURN; 1092 } 1093 ASSERT_NOT_REACHED(); 1094 } 1095 1096 /* If maximizing, find the longest possible run, then work backwards. */ 1097 1098 else { 1099 frame->pp = frame->eptr; 1100 for (i = min; i < frame->max; i++) { 1101 int len = 1; 1102 if (frame->eptr >= md->end_subject) 1103 break; 1104 GETCHARLEN(c, frame->eptr, len); 1105 if (!_pcre_xclass(c, frame->data)) 1106 break; 1107 frame->eptr += len; 1108 } 1109 for(;;) { 1110 RMATCH(27, frame->ecode, frame->eptrb, 0); 1111 if (is_match) 1112 RRETURN; 1113 if (frame->eptr-- == frame->pp) 1114 break; /* Stop if tried at original pos */ 1115 BACKCHAR(frame->eptr) 1116 } 1117 RRETURN; 1118 } 1119 1120 ASSERT_NOT_REACHED(); 1121 1122 /* Match a single character, casefully */ 1123 1124 BEGIN_OPCODE(CHAR): 1125 frame->length = 1; 1126 frame->ecode++; 1127 GETUTF8CHARLEN(frame->fc, frame->ecode, frame->length); 1128 { 1129 int dc; 1130 frame->ecode += frame->length; 1131 switch (md->end_subject - frame->eptr) { 1132 case 0: 1133 RRETURN_NO_MATCH; 1134 case 1: 1135 dc = *frame->eptr++; 1136 if (IS_LEADING_SURROGATE(dc)) 1137 RRETURN_NO_MATCH; 1138 break; 1139 default: 1140 GETCHARINC(dc, frame->eptr); 1141 } 1142 if (frame->fc != dc) 1143 RRETURN_NO_MATCH; 1144 } 1145 NEXT_OPCODE; 1146 1147 /* Match a single character, caselessly */ 1148 1149 BEGIN_OPCODE(CHARNC): 1150 frame->length = 1; 1151 frame->ecode++; 1152 GETUTF8CHARLEN(frame->fc, frame->ecode, frame->length); 1153 1154 if (md->end_subject - frame->eptr == 0) 1155 RRETURN_NO_MATCH; 1156 1157 { 1158 int dc; 1159 if (md->end_subject - frame->eptr == 1) { 1160 dc = *frame->eptr++; 1161 if (IS_LEADING_SURROGATE(dc)) 1162 RRETURN_NO_MATCH; 1163 } else 1164 GETCHARINC(dc, frame->eptr); 1165 frame->ecode += frame->length; 1166 1167 /* If we have Unicode property support, we can use it to test the other 1168 case of the character, if there is one. */ 1169 1170 if (frame->fc != dc) { 1171 if (dc != _pcre_ucp_othercase(frame->fc)) 1172 RRETURN_NO_MATCH; 1173 } 1174 } 1175 NEXT_OPCODE; 1176 1177 /* Match a single ASCII character. */ 1178 1179 BEGIN_OPCODE(ASCII_CHAR): 1180 if (md->end_subject == frame->eptr) 1181 RRETURN_NO_MATCH; 1182 if (*frame->eptr != frame->ecode[1]) 1183 RRETURN_NO_MATCH; 1184 ++frame->eptr; 1185 frame->ecode += 2; 1186 NEXT_OPCODE; 1187 1188 /* Match one of two cases of an ASCII character. */ 1189 1190 BEGIN_OPCODE(ASCII_LETTER_NC): 1191 if (md->end_subject == frame->eptr) 1192 RRETURN_NO_MATCH; 1193 if ((*frame->eptr | 0x20) != frame->ecode[1]) 1194 RRETURN_NO_MATCH; 1195 ++frame->eptr; 1196 frame->ecode += 2; 1197 NEXT_OPCODE; 1198 1199 /* Match a single character repeatedly; different opcodes share code. */ 1200 1201 BEGIN_OPCODE(EXACT): 1202 min = frame->max = GET2(frame->ecode, 1); 1203 minimize = false; 1204 frame->ecode += 3; 1205 goto REPEATCHAR; 1206 1207 BEGIN_OPCODE(UPTO): 1208 BEGIN_OPCODE(MINUPTO): 1209 min = 0; 1210 frame->max = GET2(frame->ecode, 1); 1211 minimize = *frame->ecode == OP_MINUPTO; 1212 frame->ecode += 3; 1213 goto REPEATCHAR; 1214 1215 BEGIN_OPCODE(STAR): 1216 BEGIN_OPCODE(MINSTAR): 1217 BEGIN_OPCODE(PLUS): 1218 BEGIN_OPCODE(MINPLUS): 1219 BEGIN_OPCODE(QUERY): 1220 BEGIN_OPCODE(MINQUERY): 1221 c = *frame->ecode++ - OP_STAR; 1222 minimize = (c & 1) != 0; 1223 min = rep_min[c]; /* Pick up values from tables; */ 1224 frame->max = rep_max[c]; /* zero for max => infinity */ 1225 if (frame->max == 0) 1226 frame->max = INT_MAX; 1227 1228 /* Common code for all repeated single-character matches. We can give 1229 up quickly if there are fewer than the minimum number of characters left in 1230 the subject. */ 1231 1232 REPEATCHAR: 1233 1234 frame->length = 1; 1235 GETUTF8CHARLEN(frame->fc, frame->ecode, frame->length); 1236 if (min * (frame->fc > 0xFFFF ? 2 : 1) > md->end_subject - frame->eptr) 1237 RRETURN_NO_MATCH; 1238 frame->ecode += frame->length; 1239 1240 if (frame->fc <= 0xFFFF) { 1241 int othercase = md->caseless ? _pcre_ucp_othercase(frame->fc) : -1; 1242 1243 for (i = 1; i <= min; i++) { 1244 if (*frame->eptr != frame->fc && *frame->eptr != othercase) 1245 RRETURN_NO_MATCH; 1246 ++frame->eptr; 1247 } 1248 1249 if (min == frame->max) 1250 NEXT_OPCODE; 1251 1252 if (minimize) { 1253 frame->repeat_othercase = othercase; 1254 for (frame->fi = min;; frame->fi++) { 1255 RMATCH(28, frame->ecode, frame->eptrb, 0); 1256 if (is_match) 1257 RRETURN; 1258 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) 1259 RRETURN; 1260 if (*frame->eptr != frame->fc && *frame->eptr != frame->repeat_othercase) 1261 RRETURN; 1262 ++frame->eptr; 1263 } 1264 ASSERT_NOT_REACHED(); 1265 } else { 1266 frame->pp = frame->eptr; 1267 for (i = min; i < frame->max; i++) { 1268 if (frame->eptr >= md->end_subject) 1269 break; 1270 if (*frame->eptr != frame->fc && *frame->eptr != othercase) 1271 break; 1272 ++frame->eptr; 1273 } 1274 while (frame->eptr >= frame->pp) { 1275 RMATCH(29, frame->ecode, frame->eptrb, 0); 1276 if (is_match) 1277 RRETURN; 1278 --frame->eptr; 1279 } 1280 RRETURN_NO_MATCH; 1281 } 1282 ASSERT_NOT_REACHED(); 1283 } else { 1284 /* No case on surrogate pairs, so no need to bother with "othercase". */ 1285 1286 for (i = 1; i <= min; i++) { 1287 int nc; 1288 GETCHAR(nc, frame->eptr); 1289 if (nc != frame->fc) 1290 RRETURN_NO_MATCH; 1291 frame->eptr += 2; 1292 } 1293 1294 if (min == frame->max) 1295 NEXT_OPCODE; 1296 1297 if (minimize) { 1298 for (frame->fi = min;; frame->fi++) { 1299 int nc; 1300 RMATCH(30, frame->ecode, frame->eptrb, 0); 1301 if (is_match) 1302 RRETURN; 1303 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) 1304 RRETURN; 1305 GETCHAR(nc, frame->eptr); 1306 if (*frame->eptr != frame->fc) 1307 RRETURN; 1308 frame->eptr += 2; 1309 } 1310 ASSERT_NOT_REACHED(); 1311 } else { 1312 frame->pp = frame->eptr; 1313 for (i = min; i < frame->max; i++) { 1314 int nc; 1315 if (frame->eptr > md->end_subject - 2) 1316 break; 1317 GETCHAR(nc, frame->eptr); 1318 if (*frame->eptr != frame->fc) 1319 break; 1320 frame->eptr += 2; 1321 } 1322 while (frame->eptr >= frame->pp) { 1323 RMATCH(31, frame->ecode, frame->eptrb, 0); 1324 if (is_match) 1325 RRETURN; 1326 frame->eptr -= 2; 1327 } 1328 RRETURN_NO_MATCH; 1329 } 1330 ASSERT_NOT_REACHED(); 1331 } 1332 ASSERT_NOT_REACHED(); 1333 1334 /* Match a negated single one-byte character. The character we are 1335 checking can be multibyte. */ 1336 1337 BEGIN_OPCODE(NOT): 1338 if (frame->eptr >= md->end_subject) 1339 RRETURN_NO_MATCH; 1340 frame->ecode++; 1341 GETCHARINCTEST(c, frame->eptr); 1342 if (md->caseless) { 1343 if (c < 128) 1344 c = md->lcc[c]; 1345 if (md->lcc[*frame->ecode++] == c) 1346 RRETURN_NO_MATCH; 1347 } else { 1348 if (*frame->ecode++ == c) 1349 RRETURN_NO_MATCH; 1350 } 1351 NEXT_OPCODE; 1352 1353 /* Match a negated single one-byte character repeatedly. This is almost a 1354 repeat of the code for a repeated single character, but I haven't found a 1355 nice way of commoning these up that doesn't require a test of the 1356 positive/negative option for each character match. Maybe that wouldn't add 1357 very much to the time taken, but character matching *is* what this is all 1358 about... */ 1359 1360 BEGIN_OPCODE(NOTEXACT): 1361 min = frame->max = GET2(frame->ecode, 1); 1362 minimize = false; 1363 frame->ecode += 3; 1364 goto REPEATNOTCHAR; 1365 1366 BEGIN_OPCODE(NOTUPTO): 1367 BEGIN_OPCODE(NOTMINUPTO): 1368 min = 0; 1369 frame->max = GET2(frame->ecode, 1); 1370 minimize = *frame->ecode == OP_NOTMINUPTO; 1371 frame->ecode += 3; 1372 goto REPEATNOTCHAR; 1373 1374 BEGIN_OPCODE(NOTSTAR): 1375 BEGIN_OPCODE(NOTMINSTAR): 1376 BEGIN_OPCODE(NOTPLUS): 1377 BEGIN_OPCODE(NOTMINPLUS): 1378 BEGIN_OPCODE(NOTQUERY): 1379 BEGIN_OPCODE(NOTMINQUERY): 1380 c = *frame->ecode++ - OP_NOTSTAR; 1381 minimize = (c & 1) != 0; 1382 min = rep_min[c]; /* Pick up values from tables; */ 1383 frame->max = rep_max[c]; /* zero for max => infinity */ 1384 if (frame->max == 0) frame->max = INT_MAX; 1385 1386 /* Common code for all repeated single-byte matches. We can give up quickly 1387 if there are fewer than the minimum number of bytes left in the 1388 subject. */ 1389 1390 REPEATNOTCHAR: 1391 if (min > md->end_subject - frame->eptr) 1392 RRETURN_NO_MATCH; 1393 frame->fc = *frame->ecode++; 1394 1395 /* The code is duplicated for the caseless and caseful cases, for speed, 1396 since matching characters is likely to be quite common. First, ensure the 1397 minimum number of matches are present. If min = max, continue at the same 1398 level without recursing. Otherwise, if minimizing, keep trying the rest of 1399 the expression and advancing one matching character if failing, up to the 1400 maximum. Alternatively, if maximizing, find the maximum number of 1401 characters and work backwards. */ 1402 1403 DPRINTF(("negative matching %c{%d,%d}\n", frame->fc, min, frame->max)); 1404 1405 if (md->caseless) { 1406 if (frame->fc < 128) 1407 frame->fc = md->lcc[frame->fc]; 1408 1409 { 1410 register int d; 1411 for (i = 1; i <= min; i++) { 1412 GETCHARINC(d, frame->eptr); 1413 if (d < 128) 1414 d = md->lcc[d]; 1415 if (frame->fc == d) 1416 RRETURN_NO_MATCH; 1417 } 1418 } 1419 1420 if (min == frame->max) 1421 NEXT_OPCODE; 1422 1423 if (minimize) { 1424 register int d; 1425 for (frame->fi = min;; frame->fi++) { 1426 RMATCH(38, frame->ecode, frame->eptrb, 0); 1427 if (is_match) 1428 RRETURN; 1429 GETCHARINC(d, frame->eptr); 1430 if (d < 128) 1431 d = md->lcc[d]; 1432 if (frame->fi >= frame->max || frame->eptr >= md->end_subject || frame->fc == d) 1433 RRETURN; 1434 } 1435 ASSERT_NOT_REACHED(); 1436 } 1437 1438 /* Maximize case */ 1439 1440 else { 1441 frame->pp = frame->eptr; 1442 1443 { 1444 register int d; 1445 for (i = min; i < frame->max; i++) { 1446 int len = 1; 1447 if (frame->eptr >= md->end_subject) 1448 break; 1449 GETCHARLEN(d, frame->eptr, len); 1450 if (d < 128) 1451 d = md->lcc[d]; 1452 if (frame->fc == d) 1453 break; 1454 frame->eptr += len; 1455 } 1456 for (;;) { 1457 RMATCH(40, frame->ecode, frame->eptrb, 0); 1458 if (is_match) 1459 RRETURN; 1460 if (frame->eptr-- == frame->pp) 1461 break; /* Stop if tried at original pos */ 1462 BACKCHAR(frame->eptr); 1463 } 1464 } 1465 1466 RRETURN; 1467 } 1468 ASSERT_NOT_REACHED(); 1469 } 1470 1471 /* Caseful comparisons */ 1472 1473 else { 1474 { 1475 register int d; 1476 for (i = 1; i <= min; i++) { 1477 GETCHARINC(d, frame->eptr); 1478 if (frame->fc == d) 1479 RRETURN_NO_MATCH; 1480 } 1481 } 1482 1483 if (min == frame->max) 1484 NEXT_OPCODE; 1485 1486 if (minimize) { 1487 register int d; 1488 for (frame->fi = min;; frame->fi++) { 1489 RMATCH(42, frame->ecode, frame->eptrb, 0); 1490 if (is_match) 1491 RRETURN; 1492 GETCHARINC(d, frame->eptr); 1493 if (frame->fi >= frame->max || frame->eptr >= md->end_subject || frame->fc == d) 1494 RRETURN; 1495 } 1496 ASSERT_NOT_REACHED(); 1497 } 1498 1499 /* Maximize case */ 1500 1501 else { 1502 frame->pp = frame->eptr; 1503 1504 { 1505 register int d; 1506 for (i = min; i < frame->max; i++) { 1507 int len = 1; 1508 if (frame->eptr >= md->end_subject) 1509 break; 1510 GETCHARLEN(d, frame->eptr, len); 1511 if (frame->fc == d) 1512 break; 1513 frame->eptr += len; 1514 } 1515 for (;;) { 1516 RMATCH(44, frame->ecode, frame->eptrb, 0); 1517 if (is_match) 1518 RRETURN; 1519 if (frame->eptr-- == frame->pp) 1520 break; /* Stop if tried at original pos */ 1521 BACKCHAR(frame->eptr); 1522 } 1523 } 1524 1525 RRETURN; 1526 } 1527 } 1528 ASSERT_NOT_REACHED(); 1529 1530 /* Match a single character type repeatedly; several different opcodes 1531 share code. This is very similar to the code for single characters, but we 1532 repeat it in the interests of efficiency. */ 1533 1534 BEGIN_OPCODE(TYPEEXACT): 1535 min = frame->max = GET2(frame->ecode, 1); 1536 minimize = true; 1537 frame->ecode += 3; 1538 goto REPEATTYPE; 1539 1540 BEGIN_OPCODE(TYPEUPTO): 1541 BEGIN_OPCODE(TYPEMINUPTO): 1542 min = 0; 1543 frame->max = GET2(frame->ecode, 1); 1544 minimize = *frame->ecode == OP_TYPEMINUPTO; 1545 frame->ecode += 3; 1546 goto REPEATTYPE; 1547 1548 BEGIN_OPCODE(TYPESTAR): 1549 BEGIN_OPCODE(TYPEMINSTAR): 1550 BEGIN_OPCODE(TYPEPLUS): 1551 BEGIN_OPCODE(TYPEMINPLUS): 1552 BEGIN_OPCODE(TYPEQUERY): 1553 BEGIN_OPCODE(TYPEMINQUERY): 1554 c = *frame->ecode++ - OP_TYPESTAR; 1555 minimize = (c & 1) != 0; 1556 min = rep_min[c]; /* Pick up values from tables; */ 1557 frame->max = rep_max[c]; /* zero for max => infinity */ 1558 if (frame->max == 0) 1559 frame->max = INT_MAX; 1560 1561 /* Common code for all repeated single character type matches. Note that 1562 in UTF-8 mode, '.' matches a character of any length, but for the other 1563 character types, the valid characters are all one-byte long. */ 1564 1565 REPEATTYPE: 1566 frame->ctype = *frame->ecode++; /* Code for the character type */ 1567 1568 /* First, ensure the minimum number of matches are present. Use inline 1569 code for maximizing the speed, and do the type test once at the start 1570 (i.e. keep it out of the loop). Also we can test that there are at least 1571 the minimum number of bytes before we start. This isn't as effective in 1572 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that 1573 is tidier. Also separate the UCP code, which can be the same for both UTF-8 1574 and single-bytes. */ 1575 1576 if (min > md->end_subject - frame->eptr) 1577 RRETURN_NO_MATCH; 1578 if (min > 0) { 1579 switch(frame->ctype) { 1580 case OP_ANY: 1581 for (i = 1; i <= min; i++) { 1582 if (frame->eptr >= md->end_subject || isNewline(*frame->eptr)) 1583 RRETURN_NO_MATCH; 1584 ++frame->eptr; 1585 while (frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)) 1586 frame->eptr++; 1587 } 1588 break; 1589 1590 case OP_NOT_DIGIT: 1591 for (i = 1; i <= min; i++) { 1592 if (frame->eptr >= md->end_subject) 1593 RRETURN_NO_MATCH; 1594 GETCHARINC(c, frame->eptr); 1595 if (isASCIIDigit(c)) 1596 RRETURN_NO_MATCH; 1597 } 1598 break; 1599 1600 case OP_DIGIT: 1601 for (i = 1; i <= min; i++) { 1602 if (frame->eptr >= md->end_subject || !isASCIIDigit(*frame->eptr++)) 1603 RRETURN_NO_MATCH; 1604 /* No need to skip more bytes - we know it's a 1-byte character */ 1605 } 1606 break; 1607 1608 case OP_NOT_WHITESPACE: 1609 for (i = 1; i <= min; i++) { 1610 if (frame->eptr >= md->end_subject || 1611 (*frame->eptr < 128 && (md->ctypes[*frame->eptr] & ctype_space) != 0)) 1612 RRETURN_NO_MATCH; 1613 while (++frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)); 1614 } 1615 break; 1616 1617 case OP_WHITESPACE: 1618 for (i = 1; i <= min; i++) { 1619 if (frame->eptr >= md->end_subject || 1620 *frame->eptr >= 128 || (md->ctypes[*frame->eptr++] & ctype_space) == 0) 1621 RRETURN_NO_MATCH; 1622 /* No need to skip more bytes - we know it's a 1-byte character */ 1623 } 1624 break; 1625 1626 case OP_NOT_WORDCHAR: 1627 for (i = 1; i <= min; i++) { 1628 if (frame->eptr >= md->end_subject || 1629 (*frame->eptr < 128 && (md->ctypes[*frame->eptr] & ctype_word) != 0)) 1630 RRETURN_NO_MATCH; 1631 while (++frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)); 1632 } 1633 break; 1634 1635 case OP_WORDCHAR: 1636 for (i = 1; i <= min; i++) { 1637 if (frame->eptr >= md->end_subject || 1638 *frame->eptr >= 128 || (md->ctypes[*frame->eptr++] & ctype_word) == 0) 1639 RRETURN_NO_MATCH; 1640 /* No need to skip more bytes - we know it's a 1-byte character */ 1641 } 1642 break; 1643 1644 default: 1645 ASSERT_NOT_REACHED(); 1646 RRETURN_ERROR(JSRegExpErrorInternal); 1647 } /* End switch(frame->ctype) */ 1648 } 1649 1650 /* If min = max, continue at the same level without recursing */ 1651 1652 if (min == frame->max) 1653 NEXT_OPCODE; 1654 1655 /* If minimizing, we have to test the rest of the pattern before each 1656 subsequent match. */ 1657 1658 if (minimize) { 1659 for (frame->fi = min;; frame->fi++) { 1660 RMATCH(48, frame->ecode, frame->eptrb, 0); 1661 if (is_match) 1662 RRETURN; 1663 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) 1664 RRETURN; 1665 1666 GETCHARINC(c, frame->eptr); 1667 switch(frame->ctype) { 1668 case OP_ANY: 1669 if (isNewline(c)) 1670 RRETURN; 1671 break; 1672 1673 case OP_NOT_DIGIT: 1674 if (isASCIIDigit(c)) 1675 RRETURN; 1676 break; 1677 1678 case OP_DIGIT: 1679 if (!isASCIIDigit(c)) 1680 RRETURN; 1681 break; 1682 1683 case OP_NOT_WHITESPACE: 1684 if (c < 128 && (md->ctypes[c] & ctype_space)) 1685 RRETURN; 1686 break; 1687 1688 case OP_WHITESPACE: 1689 if (c >= 128 || !(md->ctypes[c] & ctype_space)) 1690 RRETURN; 1691 break; 1692 1693 case OP_NOT_WORDCHAR: 1694 if (c < 128 && (md->ctypes[c] & ctype_word)) 1695 RRETURN; 1696 break; 1697 1698 case OP_WORDCHAR: 1699 if (c >= 128 || !(md->ctypes[c] & ctype_word)) 1700 RRETURN; 1701 break; 1702 1703 default: 1704 ASSERT_NOT_REACHED(); 1705 RRETURN_ERROR(JSRegExpErrorInternal); 1706 } 1707 } 1708 ASSERT_NOT_REACHED(); 1709 } 1710 1711 /* If maximizing it is worth using inline code for speed, doing the type 1712 test once at the start (i.e. keep it out of the loop). */ 1713 1714 else { 1715 frame->pp = frame->eptr; /* Remember where we started */ 1716 1717 switch(frame->ctype) { 1718 case OP_ANY: 1719 1720 /* Special code is required for UTF8, but when the maximum is unlimited 1721 we don't need it, so we repeat the non-UTF8 code. This is probably 1722 worth it, because .* is quite a common idiom. */ 1723 1724 if (frame->max < INT_MAX) { 1725 for (i = min; i < frame->max; i++) { 1726 if (frame->eptr >= md->end_subject || isNewline(*frame->eptr)) 1727 break; 1728 frame->eptr++; 1729 while (frame->eptr < md->end_subject && (*frame->eptr & 0xc0) == 0x80) 1730 frame->eptr++; 1731 } 1732 } 1733 1734 /* Handle unlimited UTF-8 repeat */ 1735 1736 else { 1737 for (i = min; i < frame->max; i++) { 1738 if (frame->eptr >= md->end_subject || isNewline(*frame->eptr)) 1739 break; 1740 frame->eptr++; 1741 } 1742 break; 1743 } 1744 break; 1745 1746 case OP_NOT_DIGIT: 1747 for (i = min; i < frame->max; i++) { 1748 int len = 1; 1749 if (frame->eptr >= md->end_subject) 1750 break; 1751 GETCHARLEN(c, frame->eptr, len); 1752 if (isASCIIDigit(c)) 1753 break; 1754 frame->eptr+= len; 1755 } 1756 break; 1757 1758 case OP_DIGIT: 1759 for (i = min; i < frame->max; i++) { 1760 int len = 1; 1761 if (frame->eptr >= md->end_subject) 1762 break; 1763 GETCHARLEN(c, frame->eptr, len); 1764 if (!isASCIIDigit(c)) 1765 break; 1766 frame->eptr+= len; 1767 } 1768 break; 1769 1770 case OP_NOT_WHITESPACE: 1771 for (i = min; i < frame->max; i++) { 1772 int len = 1; 1773 if (frame->eptr >= md->end_subject) 1774 break; 1775 GETCHARLEN(c, frame->eptr, len); 1776 if (c < 128 && (md->ctypes[c] & ctype_space)) 1777 break; 1778 frame->eptr+= len; 1779 } 1780 break; 1781 1782 case OP_WHITESPACE: 1783 for (i = min; i < frame->max; i++) { 1784 int len = 1; 1785 if (frame->eptr >= md->end_subject) 1786 break; 1787 GETCHARLEN(c, frame->eptr, len); 1788 if (c >= 128 || !(md->ctypes[c] & ctype_space)) 1789 break; 1790 frame->eptr+= len; 1791 } 1792 break; 1793 1794 case OP_NOT_WORDCHAR: 1795 for (i = min; i < frame->max; i++) { 1796 int len = 1; 1797 if (frame->eptr >= md->end_subject) 1798 break; 1799 GETCHARLEN(c, frame->eptr, len); 1800 if (c < 128 && (md->ctypes[c] & ctype_word)) 1801 break; 1802 frame->eptr+= len; 1803 } 1804 break; 1805 1806 case OP_WORDCHAR: 1807 for (i = min; i < frame->max; i++) { 1808 int len = 1; 1809 if (frame->eptr >= md->end_subject) 1810 break; 1811 GETCHARLEN(c, frame->eptr, len); 1812 if (c >= 128 || !(md->ctypes[c] & ctype_word)) 1813 break; 1814 frame->eptr+= len; 1815 } 1816 break; 1817 1818 default: 1819 ASSERT_NOT_REACHED(); 1820 RRETURN_ERROR(JSRegExpErrorInternal); 1821 } 1822 1823 /* frame->eptr is now past the end of the maximum run */ 1824 1825 for (;;) { 1826 RMATCH(52, frame->ecode, frame->eptrb, 0); 1827 if (is_match) 1828 RRETURN; 1829 if (frame->eptr-- == frame->pp) 1830 break; /* Stop if tried at original pos */ 1831 BACKCHAR(frame->eptr); 1832 } 1833 1834 /* Get here if we can't make it match with any permitted repetitions */ 1835 1836 RRETURN; 1837 } 1838 ASSERT_NOT_REACHED(); 1839 1840 BEGIN_OPCODE(CRMINPLUS): 1841 BEGIN_OPCODE(CRMINQUERY): 1842 BEGIN_OPCODE(CRMINRANGE): 1843 BEGIN_OPCODE(CRMINSTAR): 1844 BEGIN_OPCODE(CRPLUS): 1845 BEGIN_OPCODE(CRQUERY): 1846 BEGIN_OPCODE(CRRANGE): 1847 BEGIN_OPCODE(CRSTAR): 1848 ASSERT_NOT_REACHED(); 1849 RRETURN_ERROR(JSRegExpErrorInternal); 1850 1851 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 1852 CAPTURING_BRACKET: 1853 #else 1854 default: 1855 #endif 1856 /* Opening capturing bracket. If there is space in the offset vector, save 1857 the current subject position in the working slot at the top of the vector. We 1858 mustn't change the current values of the data slot, because they may be set 1859 from a previous iteration of this group, and be referred to by a reference 1860 inside the group. 1861 1862 If the bracket fails to match, we need to restore this value and also the 1863 values of the final offsets, in case they were set by a previous iteration of 1864 the same bracket. 1865 1866 If there isn't enough space in the offset vector, treat this as if it were a 1867 non-capturing bracket. Don't worry about setting the flag for the error case 1868 here; that is handled in the code for KET. */ 1869 1870 ASSERT(*frame->ecode > OP_BRA); 1871 1872 frame->number = *frame->ecode - OP_BRA; 1873 1874 /* For extended extraction brackets (large number), we have to fish out the 1875 number from a dummy opcode at the start. */ 1876 1877 if (frame->number > EXTRACT_BASIC_MAX) 1878 frame->number = GET2(frame->ecode, 2+LINK_SIZE); 1879 frame->offset = frame->number << 1; 1880 1881 #ifdef DEBUG 1882 printf("start bracket %d subject=", frame->number); 1883 pchars(frame->eptr, 16, true, md); 1884 printf("\n"); 1885 #endif 1886 1887 if (frame->offset < md->offset_max) { 1888 frame->save_offset1 = md->offset_vector[frame->offset]; 1889 frame->save_offset2 = md->offset_vector[frame->offset + 1]; 1890 frame->save_offset3 = md->offset_vector[md->offset_end - frame->number]; 1891 1892 DPRINTF(("saving %d %d %d\n", frame->save_offset1, frame->save_offset2, frame->save_offset3)); 1893 md->offset_vector[md->offset_end - frame->number] = frame->eptr - md->start_subject; 1894 1895 do { 1896 RMATCH(1, frame->ecode + 1 + LINK_SIZE, frame->eptrb, match_isgroup); 1897 if (is_match) RRETURN; 1898 frame->ecode += GET(frame->ecode, 1); 1899 } while (*frame->ecode == OP_ALT); 1900 1901 DPRINTF(("bracket %d failed\n", frame->number)); 1902 1903 md->offset_vector[frame->offset] = frame->save_offset1; 1904 md->offset_vector[frame->offset + 1] = frame->save_offset2; 1905 md->offset_vector[md->offset_end - frame->number] = frame->save_offset3; 1906 1907 RRETURN; 1908 } 1909 1910 /* Insufficient room for saving captured contents */ 1911 1912 goto NON_CAPTURING_BRACKET; 517 1913 } 518 while (*frame->ecode == OP_ALT); 519 520 /* If hit the end of the group (which could be repeated), fail */ 521 522 if (*frame->ecode != OP_ONCE && *frame->ecode != OP_ALT) RRETURN; 523 524 /* Continue as from after the assertion, updating the offsets high water 525 mark, since extracts may have been taken. */ 526 527 do frame->ecode += GET(frame->ecode,1); while (*frame->ecode == OP_ALT); 528 529 frame->offset_top = md->end_offset_top; 530 frame->eptr = md->end_match_ptr; 531 532 /* For a non-repeating ket, just continue at this level. This also 533 happens for a repeating ket if no characters were matched in the group. 534 This is the forcible breaking of infinite loops as implemented in Perl 535 5.005. If there is an options reset, it will get obeyed in the normal 536 course of events. */ 537 538 if (*frame->ecode == OP_KET || frame->eptr == frame->saved_eptr) 539 { 540 frame->ecode += 1+LINK_SIZE; 541 NEXT_OPCODE; 542 } 543 544 /* The repeating kets try the rest of the pattern or restart from the 545 preceding bracket, in the appropriate order. We need to reset any options 546 that changed within the bracket before re-running it, so check the next 547 opcode. */ 548 549 if (*frame->ecode == OP_KETRMIN) 550 { 551 RMATCH(10, frame->ecode + 1 + LINK_SIZE, frame->eptrb, 0); 552 if (is_match) RRETURN; 553 RMATCH(11, frame->prev, frame->eptrb, match_isgroup); 554 if (is_match) RRETURN; 555 } 556 else /* OP_KETRMAX */ 557 { 558 RMATCH(12, frame->prev, frame->eptrb, match_isgroup); 559 if (is_match) RRETURN; 560 RMATCH(13, frame->ecode + 1+LINK_SIZE, frame->eptrb, 0); 561 if (is_match) RRETURN; 562 } 563 RRETURN; 564 565 /* An alternation is the end of a branch; scan along to find the end of the 566 bracketed group and go to there. */ 567 568 BEGIN_OPCODE(ALT): 569 do frame->ecode += GET(frame->ecode,1); while (*frame->ecode == OP_ALT); 570 NEXT_OPCODE; 571 572 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating 573 that it may occur zero times. It may repeat infinitely, or not at all - 574 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper 575 repeat limits are compiled as a number of copies, with the optional ones 576 preceded by BRAZERO or BRAMINZERO. */ 577 578 BEGIN_OPCODE(BRAZERO): 579 { 580 frame->next = frame->ecode+1; 581 RMATCH(14, frame->next, frame->eptrb, match_isgroup); 582 if (is_match) RRETURN; 583 do frame->next += GET(frame->next,1); while (*frame->next == OP_ALT); 584 frame->ecode = frame->next + 1+LINK_SIZE; 585 } 586 NEXT_OPCODE; 587 588 BEGIN_OPCODE(BRAMINZERO): 589 { 590 frame->next = frame->ecode+1; 591 do frame->next += GET(frame->next,1); while (*frame->next == OP_ALT); 592 RMATCH(15, frame->next + 1+LINK_SIZE, frame->eptrb, match_isgroup); 593 if (is_match) RRETURN; 594 frame->ecode++; 595 } 596 NEXT_OPCODE; 597 598 /* End of a group, repeated or non-repeating. If we are at the end of 599 an assertion "group", stop matching and return MATCH_MATCH, but record the 600 current high water mark for use by positive assertions. Do this also 601 for the "once" (not-backup up) groups. */ 602 603 BEGIN_OPCODE(KET): 604 BEGIN_OPCODE(KETRMIN): 605 BEGIN_OPCODE(KETRMAX): 606 frame->prev = frame->ecode - GET(frame->ecode, 1); 607 frame->saved_eptr = frame->eptrb->epb_saved_eptr; 608 609 /* Back up the stack of bracket start pointers. */ 610 611 frame->eptrb = frame->eptrb->epb_prev; 612 613 if (*frame->prev == OP_ASSERT || *frame->prev == OP_ASSERT_NOT || *frame->prev == OP_ONCE) 614 { 615 md->end_match_ptr = frame->eptr; /* For ONCE */ 616 md->end_offset_top = frame->offset_top; 617 is_match = true; 618 RRETURN; 619 } 620 621 /* In all other cases except a conditional group we have to check the 622 group number back at the start and if necessary complete handling an 623 extraction by setting the offsets and bumping the high water mark. */ 624 625 frame->number = *frame->prev - OP_BRA; 626 627 /* For extended extraction brackets (large number), we have to fish out 628 the number from a dummy opcode at the start. */ 629 630 if (frame->number > EXTRACT_BASIC_MAX) frame->number = GET2(frame->prev, 2+LINK_SIZE); 631 frame->offset = frame->number << 1; 632 633 #ifdef DEBUG 634 printf("end bracket %d", frame->number); 635 printf("\n"); 1914 1915 /* Do not stick any code in here without much thought; it is assumed 1916 that "continue" in the code above comes out to here to repeat the main 1917 loop. */ 1918 1919 } /* End of main loop */ 1920 1921 ASSERT_NOT_REACHED(); 1922 1923 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 1924 1925 RRETURN_SWITCH: 1926 switch (frame->where) 1927 { 1928 case 0: goto RETURN; 1929 case 1: goto RRETURN_1; 1930 case 2: goto RRETURN_2; 1931 case 6: goto RRETURN_6; 1932 case 7: goto RRETURN_7; 1933 case 9: goto RRETURN_9; 1934 case 10: goto RRETURN_10; 1935 case 11: goto RRETURN_11; 1936 case 12: goto RRETURN_12; 1937 case 13: goto RRETURN_13; 1938 case 14: goto RRETURN_14; 1939 case 15: goto RRETURN_15; 1940 case 16: goto RRETURN_16; 1941 case 17: goto RRETURN_17; 1942 case 18: goto RRETURN_18; 1943 case 19: goto RRETURN_19; 1944 case 20: goto RRETURN_20; 1945 case 21: goto RRETURN_21; 1946 case 22: goto RRETURN_22; 1947 case 24: goto RRETURN_24; 1948 case 26: goto RRETURN_26; 1949 case 27: goto RRETURN_27; 1950 case 28: goto RRETURN_28; 1951 case 29: goto RRETURN_29; 1952 case 30: goto RRETURN_30; 1953 case 31: goto RRETURN_31; 1954 case 38: goto RRETURN_38; 1955 case 40: goto RRETURN_40; 1956 case 42: goto RRETURN_42; 1957 case 44: goto RRETURN_44; 1958 case 48: goto RRETURN_48; 1959 case 52: goto RRETURN_52; 1960 } 1961 1962 abort(); 1963 RRETURN_ERROR(JSRegExpErrorInternal); 1964 636 1965 #endif 637 638 /* Test for a numbered group. This includes groups called as a result 639 of recursion. Note that whole-pattern recursion is coded as a recurse 640 into group 0, so it won't be picked up here. Instead, we catch it when 641 the OP_END is reached. */ 642 643 if (frame->number > 0) 644 { 645 if (frame->offset >= md->offset_max) md->offset_overflow = true; else 646 { 647 md->offset_vector[frame->offset] = 648 md->offset_vector[md->offset_end - frame->number]; 649 md->offset_vector[frame->offset+1] = frame->eptr - md->start_subject; 650 if (frame->offset_top <= frame->offset) frame->offset_top = frame->offset + 2; 651 } 652 } 653 654 /* For a non-repeating ket, just continue at this level. This also 655 happens for a repeating ket if no characters were matched in the group. 656 This is the forcible breaking of infinite loops as implemented in Perl 657 5.005. If there is an options reset, it will get obeyed in the normal 658 course of events. */ 659 660 if (*frame->ecode == OP_KET || frame->eptr == frame->saved_eptr) 661 { 662 frame->ecode += 1 + LINK_SIZE; 663 NEXT_OPCODE; 664 } 665 666 /* The repeating kets try the rest of the pattern or restart from the 667 preceding bracket, in the appropriate order. */ 668 669 if (*frame->ecode == OP_KETRMIN) 670 { 671 RMATCH(16, frame->ecode + 1+LINK_SIZE, frame->eptrb, 0); 672 if (is_match) RRETURN; 673 RMATCH(17, frame->prev, frame->eptrb, match_isgroup); 674 if (is_match) RRETURN; 675 } 676 else /* OP_KETRMAX */ 677 { 678 RMATCH(18, frame->prev, frame->eptrb, match_isgroup); 679 if (is_match) RRETURN; 680 RMATCH(19, frame->ecode + 1+LINK_SIZE, frame->eptrb, 0); 681 if (is_match) RRETURN; 682 } 683 RRETURN; 684 685 /* Start of subject, or after internal newline if multiline. */ 686 687 BEGIN_OPCODE(CIRC): 688 if (frame->eptr != md->start_subject && (!md->multiline || !isNewline(frame->eptr[-1]))) 689 RRETURN_NO_MATCH; 690 frame->ecode++; 691 NEXT_OPCODE; 692 693 /* End of subject, or before internal newline if multiline. */ 694 695 BEGIN_OPCODE(DOLL): 696 if (frame->eptr < md->end_subject && (!md->multiline || !isNewline(*frame->eptr))) 697 RRETURN_NO_MATCH; 698 frame->ecode++; 699 NEXT_OPCODE; 700 701 /* Word boundary assertions */ 702 703 BEGIN_OPCODE(NOT_WORD_BOUNDARY): 704 BEGIN_OPCODE(WORD_BOUNDARY): 705 /* Find out if the previous and current characters are "word" characters. 706 It takes a bit more work in UTF-8 mode. Characters > 128 are assumed to 707 be "non-word" characters. */ 708 709 if (frame->eptr == md->start_subject) prev_is_word = false; else 710 { 711 const pcre_uchar *lastptr = frame->eptr - 1; 712 while(ISMIDCHAR(*lastptr)) lastptr--; 713 GETCHAR(c, lastptr); 714 prev_is_word = c < 128 && (md->ctypes[c] & ctype_word) != 0; 715 } 716 if (frame->eptr >= md->end_subject) cur_is_word = false; else 717 { 718 GETCHAR(c, frame->eptr); 719 cur_is_word = c < 128 && (md->ctypes[c] & ctype_word) != 0; 720 } 721 722 /* Now see if the situation is what we want */ 723 724 if ((*frame->ecode++ == OP_WORD_BOUNDARY)? 725 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 726 RRETURN_NO_MATCH; 727 NEXT_OPCODE; 728 729 /* Match a single character type; inline for speed */ 730 731 BEGIN_OPCODE(ANY): 732 if (frame->eptr < md->end_subject && isNewline(*frame->eptr)) 733 RRETURN_NO_MATCH; 734 if (frame->eptr++ >= md->end_subject) RRETURN_NO_MATCH; 735 while (frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)) frame->eptr++; 736 frame->ecode++; 737 NEXT_OPCODE; 738 739 BEGIN_OPCODE(NOT_DIGIT): 740 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 741 GETCHARINCTEST(c, frame->eptr); 742 if (isASCIIDigit(c)) 743 RRETURN_NO_MATCH; 744 frame->ecode++; 745 NEXT_OPCODE; 746 747 BEGIN_OPCODE(DIGIT): 748 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 749 GETCHARINCTEST(c, frame->eptr); 750 if (!isASCIIDigit(c)) 751 RRETURN_NO_MATCH; 752 frame->ecode++; 753 NEXT_OPCODE; 754 755 BEGIN_OPCODE(NOT_WHITESPACE): 756 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 757 GETCHARINCTEST(c, frame->eptr); 758 if (c < 128 && (md->ctypes[c] & ctype_space) != 0) 759 RRETURN_NO_MATCH; 760 frame->ecode++; 761 NEXT_OPCODE; 762 763 BEGIN_OPCODE(WHITESPACE): 764 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 765 GETCHARINCTEST(c, frame->eptr); 766 if (c >= 128 || (md->ctypes[c] & ctype_space) == 0) 767 RRETURN_NO_MATCH; 768 frame->ecode++; 769 NEXT_OPCODE; 770 771 BEGIN_OPCODE(NOT_WORDCHAR): 772 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 773 GETCHARINCTEST(c, frame->eptr); 774 if (c < 128 && (md->ctypes[c] & ctype_word) != 0) 775 RRETURN_NO_MATCH; 776 frame->ecode++; 777 NEXT_OPCODE; 778 779 BEGIN_OPCODE(WORDCHAR): 780 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 781 GETCHARINCTEST(c, frame->eptr); 782 if (c >= 128 || (md->ctypes[c] & ctype_word) == 0) 783 RRETURN_NO_MATCH; 784 frame->ecode++; 785 NEXT_OPCODE; 786 787 /* Match a back reference, possibly repeatedly. Look past the end of the 788 item to see if there is repeat information following. The code is similar 789 to that for character classes, but repeated for efficiency. Then obey 790 similar code to character type repeats - written out again for speed. 791 However, if the referenced string is the empty string, always treat 792 it as matched, any number of times (otherwise there could be infinite 793 loops). */ 794 795 BEGIN_OPCODE(REF): 796 frame->offset = GET2(frame->ecode, 1) << 1; /* Doubled ref number */ 797 frame->ecode += 3; /* Advance past item */ 798 799 /* If the reference is unset, set the length to be longer than the amount 800 of subject left; this ensures that every attempt at a match fails. We 801 can't just fail here, because of the possibility of quantifiers with zero 802 minima. */ 803 804 frame->length = (frame->offset >= frame->offset_top || md->offset_vector[frame->offset] < 0)? 805 0 : 806 md->offset_vector[frame->offset+1] - md->offset_vector[frame->offset]; 807 808 /* Set up for repetition, or handle the non-repeated case */ 809 810 switch (*frame->ecode) 811 { 812 case OP_CRSTAR: 813 case OP_CRMINSTAR: 814 case OP_CRPLUS: 815 case OP_CRMINPLUS: 816 case OP_CRQUERY: 817 case OP_CRMINQUERY: 818 c = *frame->ecode++ - OP_CRSTAR; 819 minimize = (c & 1) != 0; 820 min = rep_min[c]; /* Pick up values from tables; */ 821 frame->max = rep_max[c]; /* zero for max => infinity */ 822 if (frame->max == 0) frame->max = INT_MAX; 823 break; 824 825 case OP_CRRANGE: 826 case OP_CRMINRANGE: 827 minimize = (*frame->ecode == OP_CRMINRANGE); 828 min = GET2(frame->ecode, 1); 829 frame->max = GET2(frame->ecode, 3); 830 if (frame->max == 0) frame->max = INT_MAX; 831 frame->ecode += 5; 832 break; 833 834 default: /* No repeat follows */ 835 if (!match_ref(frame->offset, frame->eptr, frame->length, md)) RRETURN_NO_MATCH; 836 frame->eptr += frame->length; 837 NEXT_OPCODE; 838 } 839 840 /* If the length of the reference is zero, just continue with the 841 main loop. */ 842 843 if (frame->length == 0) 844 NEXT_OPCODE; 845 846 /* First, ensure the minimum number of matches are present. */ 847 848 for (i = 1; i <= min; i++) 849 { 850 if (!match_ref(frame->offset, frame->eptr, frame->length, md)) RRETURN_NO_MATCH; 851 frame->eptr += frame->length; 852 } 853 854 /* If min = max, continue at the same level without recursion. 855 They are not both allowed to be zero. */ 856 857 if (min == frame->max) 858 NEXT_OPCODE; 859 860 /* If minimizing, keep trying and advancing the pointer */ 861 862 if (minimize) 863 { 864 for (frame->fi = min;; frame->fi++) 865 { 866 RMATCH(20, frame->ecode, frame->eptrb, 0); 867 if (is_match) RRETURN; 868 if (frame->fi >= frame->max || !match_ref(frame->offset, frame->eptr, frame->length, md)) 869 RRETURN; 870 frame->eptr += frame->length; 871 } 872 /* Control never gets here */ 873 } 874 875 /* If maximizing, find the longest string and work backwards */ 876 877 else 878 { 879 frame->pp = frame->eptr; 880 for (i = min; i < frame->max; i++) 881 { 882 if (!match_ref(frame->offset, frame->eptr, frame->length, md)) break; 883 frame->eptr += frame->length; 884 } 885 while (frame->eptr >= frame->pp) 886 { 887 RMATCH(21, frame->ecode, frame->eptrb, 0); 888 if (is_match) RRETURN; 889 frame->eptr -= frame->length; 890 } 891 RRETURN_NO_MATCH; 892 } 893 /* Control never gets here */ 894 895 /* Match a bit-mapped character class, possibly repeatedly. This op code is 896 used when all the characters in the class have values in the range 0-255, 897 and either the matching is caseful, or the characters are in the range 898 0-127 when UTF-8 processing is enabled. The only difference between 899 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 900 encountered. 901 902 First, look past the end of the item to see if there is repeat information 903 following. Then obey similar code to character type repeats - written out 904 again for speed. */ 905 906 BEGIN_OPCODE(NCLASS): 907 BEGIN_OPCODE(CLASS): 908 frame->data = frame->ecode + 1; /* Save for matching */ 909 frame->ecode += 33; /* Advance past the item */ 910 911 switch (*frame->ecode) 912 { 913 case OP_CRSTAR: 914 case OP_CRMINSTAR: 915 case OP_CRPLUS: 916 case OP_CRMINPLUS: 917 case OP_CRQUERY: 918 case OP_CRMINQUERY: 919 c = *frame->ecode++ - OP_CRSTAR; 920 minimize = (c & 1) != 0; 921 min = rep_min[c]; /* Pick up values from tables; */ 922 frame->max = rep_max[c]; /* zero for max => infinity */ 923 if (frame->max == 0) frame->max = INT_MAX; 924 break; 925 926 case OP_CRRANGE: 927 case OP_CRMINRANGE: 928 minimize = (*frame->ecode == OP_CRMINRANGE); 929 min = GET2(frame->ecode, 1); 930 frame->max = GET2(frame->ecode, 3); 931 if (frame->max == 0) frame->max = INT_MAX; 932 frame->ecode += 5; 933 break; 934 935 default: /* No repeat follows */ 936 min = frame->max = 1; 937 break; 938 } 939 940 /* First, ensure the minimum number of matches are present. */ 941 942 { 943 for (i = 1; i <= min; i++) 944 { 945 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 946 GETCHARINC(c, frame->eptr); 947 if (c > 255) 948 { 949 if (frame->data[-1] == OP_CLASS) RRETURN_NO_MATCH; 950 } 951 else 952 { 953 if ((frame->data[c/8] & (1 << (c&7))) == 0) RRETURN_NO_MATCH; 954 } 955 } 956 } 957 958 /* If max == min we can continue with the main loop without the 959 need to recurse. */ 960 961 if (min == frame->max) 962 NEXT_OPCODE; 963 964 /* If minimizing, keep testing the rest of the expression and advancing 965 the pointer while it matches the class. */ 966 if (minimize) 967 { 968 { 969 for (frame->fi = min;; frame->fi++) 970 { 971 RMATCH(22, frame->ecode, frame->eptrb, 0); 972 if (is_match) RRETURN; 973 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) RRETURN; 974 GETCHARINC(c, frame->eptr); 975 if (c > 255) 976 { 977 if (frame->data[-1] == OP_CLASS) RRETURN; 978 } 979 else 980 { 981 if ((frame->data[c/8] & (1 << (c&7))) == 0) RRETURN; 982 } 983 } 984 } 985 /* Control never gets here */ 986 } 987 /* If maximizing, find the longest possible run, then work backwards. */ 988 else 989 { 990 frame->pp = frame->eptr; 991 992 for (i = min; i < frame->max; i++) 993 { 994 int len = 1; 995 if (frame->eptr >= md->end_subject) break; 996 GETCHARLEN(c, frame->eptr, len); 997 if (c > 255) 998 { 999 if (frame->data[-1] == OP_CLASS) break; 1000 } 1001 else 1002 { 1003 if ((frame->data[c/8] & (1 << (c&7))) == 0) break; 1004 } 1005 frame->eptr += len; 1006 } 1007 for (;;) 1008 { 1009 RMATCH(24, frame->ecode, frame->eptrb, 0); 1010 if (is_match) RRETURN; 1011 if (frame->eptr-- == frame->pp) break; /* Stop if tried at original pos */ 1012 BACKCHAR(frame->eptr); 1013 } 1014 1015 RRETURN; 1016 } 1017 /* Control never gets here */ 1018 1019 /* Match an extended character class. This opcode is encountered only 1020 in UTF-8 mode, because that's the only time it is compiled. */ 1021 1022 BEGIN_OPCODE(XCLASS): 1023 frame->data = frame->ecode + 1 + LINK_SIZE; /* Save for matching */ 1024 frame->ecode += GET(frame->ecode, 1); /* Advance past the item */ 1025 1026 switch (*frame->ecode) 1027 { 1028 case OP_CRSTAR: 1029 case OP_CRMINSTAR: 1030 case OP_CRPLUS: 1031 case OP_CRMINPLUS: 1032 case OP_CRQUERY: 1033 case OP_CRMINQUERY: 1034 c = *frame->ecode++ - OP_CRSTAR; 1035 minimize = (c & 1) != 0; 1036 min = rep_min[c]; /* Pick up values from tables; */ 1037 frame->max = rep_max[c]; /* zero for max => infinity */ 1038 if (frame->max == 0) frame->max = INT_MAX; 1039 break; 1040 1041 case OP_CRRANGE: 1042 case OP_CRMINRANGE: 1043 minimize = (*frame->ecode == OP_CRMINRANGE); 1044 min = GET2(frame->ecode, 1); 1045 frame->max = GET2(frame->ecode, 3); 1046 if (frame->max == 0) frame->max = INT_MAX; 1047 frame->ecode += 5; 1048 break; 1049 1050 default: /* No repeat follows */ 1051 min = frame->max = 1; 1052 } 1053 1054 /* First, ensure the minimum number of matches are present. */ 1055 1056 for (i = 1; i <= min; i++) 1057 { 1058 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 1059 GETCHARINC(c, frame->eptr); 1060 if (!_pcre_xclass(c, frame->data)) RRETURN_NO_MATCH; 1061 } 1062 1063 /* If max == min we can continue with the main loop without the 1064 need to recurse. */ 1065 1066 if (min == frame->max) 1067 NEXT_OPCODE; 1068 1069 /* If minimizing, keep testing the rest of the expression and advancing 1070 the pointer while it matches the class. */ 1071 1072 if (minimize) 1073 { 1074 for (frame->fi = min;; frame->fi++) 1075 { 1076 RMATCH(26, frame->ecode, frame->eptrb, 0); 1077 if (is_match) RRETURN; 1078 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) RRETURN; 1079 GETCHARINC(c, frame->eptr); 1080 if (!_pcre_xclass(c, frame->data)) RRETURN; 1081 } 1082 /* Control never gets here */ 1083 } 1084 1085 /* If maximizing, find the longest possible run, then work backwards. */ 1086 1087 else 1088 { 1089 frame->pp = frame->eptr; 1090 for (i = min; i < frame->max; i++) 1091 { 1092 int len = 1; 1093 if (frame->eptr >= md->end_subject) break; 1094 GETCHARLEN(c, frame->eptr, len); 1095 if (!_pcre_xclass(c, frame->data)) break; 1096 frame->eptr += len; 1097 } 1098 for(;;) 1099 { 1100 RMATCH(27, frame->ecode, frame->eptrb, 0); 1101 if (is_match) RRETURN; 1102 if (frame->eptr-- == frame->pp) break; /* Stop if tried at original pos */ 1103 BACKCHAR(frame->eptr) 1104 } 1105 RRETURN; 1106 } 1107 1108 /* Control never gets here */ 1109 1110 /* Match a single character, casefully */ 1111 1112 BEGIN_OPCODE(CHAR): 1113 frame->length = 1; 1114 frame->ecode++; 1115 GETUTF8CHARLEN(frame->fc, frame->ecode, frame->length); 1116 { 1117 int dc; 1118 frame->ecode += frame->length; 1119 switch (md->end_subject - frame->eptr) 1120 { 1121 case 0: 1122 RRETURN_NO_MATCH; 1123 case 1: 1124 dc = *frame->eptr++; 1125 if (IS_LEADING_SURROGATE(dc)) 1126 RRETURN_NO_MATCH; 1127 break; 1128 default: 1129 GETCHARINC(dc, frame->eptr); 1130 } 1131 if (frame->fc != dc) RRETURN_NO_MATCH; 1132 } 1133 NEXT_OPCODE; 1134 1135 /* Match a single character, caselessly */ 1136 1137 BEGIN_OPCODE(CHARNC): 1138 frame->length = 1; 1139 frame->ecode++; 1140 GETUTF8CHARLEN(frame->fc, frame->ecode, frame->length); 1141 1142 if (md->end_subject - frame->eptr == 0) RRETURN_NO_MATCH; 1143 1144 { 1145 int dc; 1146 if (md->end_subject - frame->eptr == 1) { 1147 dc = *frame->eptr++; 1148 if (IS_LEADING_SURROGATE(dc)) 1149 RRETURN_NO_MATCH; 1150 } else 1151 GETCHARINC(dc, frame->eptr); 1152 frame->ecode += frame->length; 1153 1154 /* If we have Unicode property support, we can use it to test the other 1155 case of the character, if there is one. */ 1156 1157 if (frame->fc != dc) 1158 { 1159 if (dc != _pcre_ucp_othercase(frame->fc)) 1160 RRETURN_NO_MATCH; 1161 } 1162 } 1163 NEXT_OPCODE; 1164 1165 /* Match a single ASCII character. */ 1166 1167 BEGIN_OPCODE(ASCII_CHAR): 1168 if (md->end_subject == frame->eptr) 1169 RRETURN_NO_MATCH; 1170 if (*frame->eptr != frame->ecode[1]) 1171 RRETURN_NO_MATCH; 1172 ++frame->eptr; 1173 frame->ecode += 2; 1174 NEXT_OPCODE; 1175 1176 /* Match one of two cases of an ASCII character. */ 1177 1178 BEGIN_OPCODE(ASCII_LETTER_NC): 1179 if (md->end_subject == frame->eptr) 1180 RRETURN_NO_MATCH; 1181 if ((*frame->eptr | 0x20) != frame->ecode[1]) 1182 RRETURN_NO_MATCH; 1183 ++frame->eptr; 1184 frame->ecode += 2; 1185 NEXT_OPCODE; 1186 1187 /* Match a single character repeatedly; different opcodes share code. */ 1188 1189 BEGIN_OPCODE(EXACT): 1190 min = frame->max = GET2(frame->ecode, 1); 1191 minimize = false; 1192 frame->ecode += 3; 1193 goto REPEATCHAR; 1194 1195 BEGIN_OPCODE(UPTO): 1196 BEGIN_OPCODE(MINUPTO): 1197 min = 0; 1198 frame->max = GET2(frame->ecode, 1); 1199 minimize = *frame->ecode == OP_MINUPTO; 1200 frame->ecode += 3; 1201 goto REPEATCHAR; 1202 1203 BEGIN_OPCODE(STAR): 1204 BEGIN_OPCODE(MINSTAR): 1205 BEGIN_OPCODE(PLUS): 1206 BEGIN_OPCODE(MINPLUS): 1207 BEGIN_OPCODE(QUERY): 1208 BEGIN_OPCODE(MINQUERY): 1209 c = *frame->ecode++ - OP_STAR; 1210 minimize = (c & 1) != 0; 1211 min = rep_min[c]; /* Pick up values from tables; */ 1212 frame->max = rep_max[c]; /* zero for max => infinity */ 1213 if (frame->max == 0) frame->max = INT_MAX; 1214 1215 /* Common code for all repeated single-character matches. We can give 1216 up quickly if there are fewer than the minimum number of characters left in 1217 the subject. */ 1218 1219 REPEATCHAR: 1220 1221 frame->length = 1; 1222 GETUTF8CHARLEN(frame->fc, frame->ecode, frame->length); 1223 if (min * (frame->fc > 0xFFFF ? 2 : 1) > md->end_subject - frame->eptr) RRETURN_NO_MATCH; 1224 frame->ecode += frame->length; 1225 1226 if (frame->fc <= 0xFFFF) 1227 { 1228 int othercase = md->caseless ? _pcre_ucp_othercase(frame->fc) : -1; 1229 1230 for (i = 1; i <= min; i++) 1231 { 1232 if (*frame->eptr != frame->fc && *frame->eptr != othercase) RRETURN_NO_MATCH; 1233 ++frame->eptr; 1234 } 1235 1236 if (min == frame->max) 1237 NEXT_OPCODE; 1238 1239 if (minimize) 1240 { 1241 frame->repeat_othercase = othercase; 1242 for (frame->fi = min;; frame->fi++) 1243 { 1244 RMATCH(28, frame->ecode, frame->eptrb, 0); 1245 if (is_match) RRETURN; 1246 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) RRETURN; 1247 if (*frame->eptr != frame->fc && *frame->eptr != frame->repeat_othercase) RRETURN; 1248 ++frame->eptr; 1249 } 1250 /* Control never gets here */ 1251 } 1252 else 1253 { 1254 frame->pp = frame->eptr; 1255 for (i = min; i < frame->max; i++) 1256 { 1257 if (frame->eptr >= md->end_subject) break; 1258 if (*frame->eptr != frame->fc && *frame->eptr != othercase) break; 1259 ++frame->eptr; 1260 } 1261 while (frame->eptr >= frame->pp) 1262 { 1263 RMATCH(29, frame->ecode, frame->eptrb, 0); 1264 if (is_match) RRETURN; 1265 --frame->eptr; 1266 } 1267 RRETURN_NO_MATCH; 1268 } 1269 /* Control never gets here */ 1270 } 1271 else 1272 { 1273 /* No case on surrogate pairs, so no need to bother with "othercase". */ 1274 1275 for (i = 1; i <= min; i++) 1276 { 1277 int nc; 1278 GETCHAR(nc, frame->eptr); 1279 if (nc != frame->fc) RRETURN_NO_MATCH; 1280 frame->eptr += 2; 1281 } 1282 1283 if (min == frame->max) 1284 NEXT_OPCODE; 1285 1286 if (minimize) 1287 { 1288 for (frame->fi = min;; frame->fi++) 1289 { 1290 int nc; 1291 RMATCH(30, frame->ecode, frame->eptrb, 0); 1292 if (is_match) RRETURN; 1293 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) RRETURN; 1294 GETCHAR(nc, frame->eptr); 1295 if (*frame->eptr != frame->fc) RRETURN; 1296 frame->eptr += 2; 1297 } 1298 /* Control never gets here */ 1299 } 1300 else 1301 { 1302 frame->pp = frame->eptr; 1303 for (i = min; i < frame->max; i++) 1304 { 1305 int nc; 1306 if (frame->eptr > md->end_subject - 2) break; 1307 GETCHAR(nc, frame->eptr); 1308 if (*frame->eptr != frame->fc) break; 1309 frame->eptr += 2; 1310 } 1311 while (frame->eptr >= frame->pp) 1312 { 1313 RMATCH(31, frame->ecode, frame->eptrb, 0); 1314 if (is_match) RRETURN; 1315 frame->eptr -= 2; 1316 } 1317 RRETURN_NO_MATCH; 1318 } 1319 /* Control never gets here */ 1320 } 1321 /* Control never gets here */ 1322 1323 /* Match a negated single one-byte character. The character we are 1324 checking can be multibyte. */ 1325 1326 BEGIN_OPCODE(NOT): 1327 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 1328 frame->ecode++; 1329 GETCHARINCTEST(c, frame->eptr); 1330 if (md->caseless) 1331 { 1332 if (c < 128) 1333 c = md->lcc[c]; 1334 if (md->lcc[*frame->ecode++] == c) RRETURN_NO_MATCH; 1335 } 1336 else 1337 { 1338 if (*frame->ecode++ == c) RRETURN_NO_MATCH; 1339 } 1340 NEXT_OPCODE; 1341 1342 /* Match a negated single one-byte character repeatedly. This is almost a 1343 repeat of the code for a repeated single character, but I haven't found a 1344 nice way of commoning these up that doesn't require a test of the 1345 positive/negative option for each character match. Maybe that wouldn't add 1346 very much to the time taken, but character matching *is* what this is all 1347 about... */ 1348 1349 BEGIN_OPCODE(NOTEXACT): 1350 min = frame->max = GET2(frame->ecode, 1); 1351 minimize = false; 1352 frame->ecode += 3; 1353 goto REPEATNOTCHAR; 1354 1355 BEGIN_OPCODE(NOTUPTO): 1356 BEGIN_OPCODE(NOTMINUPTO): 1357 min = 0; 1358 frame->max = GET2(frame->ecode, 1); 1359 minimize = *frame->ecode == OP_NOTMINUPTO; 1360 frame->ecode += 3; 1361 goto REPEATNOTCHAR; 1362 1363 BEGIN_OPCODE(NOTSTAR): 1364 BEGIN_OPCODE(NOTMINSTAR): 1365 BEGIN_OPCODE(NOTPLUS): 1366 BEGIN_OPCODE(NOTMINPLUS): 1367 BEGIN_OPCODE(NOTQUERY): 1368 BEGIN_OPCODE(NOTMINQUERY): 1369 c = *frame->ecode++ - OP_NOTSTAR; 1370 minimize = (c & 1) != 0; 1371 min = rep_min[c]; /* Pick up values from tables; */ 1372 frame->max = rep_max[c]; /* zero for max => infinity */ 1373 if (frame->max == 0) frame->max = INT_MAX; 1374 1375 /* Common code for all repeated single-byte matches. We can give up quickly 1376 if there are fewer than the minimum number of bytes left in the 1377 subject. */ 1378 1379 REPEATNOTCHAR: 1380 if (min > md->end_subject - frame->eptr) RRETURN_NO_MATCH; 1381 frame->fc = *frame->ecode++; 1382 1383 /* The code is duplicated for the caseless and caseful cases, for speed, 1384 since matching characters is likely to be quite common. First, ensure the 1385 minimum number of matches are present. If min = max, continue at the same 1386 level without recursing. Otherwise, if minimizing, keep trying the rest of 1387 the expression and advancing one matching character if failing, up to the 1388 maximum. Alternatively, if maximizing, find the maximum number of 1389 characters and work backwards. */ 1390 1391 DPRINTF(("negative matching %c{%d,%d}\n", frame->fc, min, frame->max)); 1392 1393 if (md->caseless) 1394 { 1395 if (frame->fc < 128) 1396 frame->fc = md->lcc[frame->fc]; 1397 1398 { 1399 register int d; 1400 for (i = 1; i <= min; i++) 1401 { 1402 GETCHARINC(d, frame->eptr); 1403 if (d < 128) d = md->lcc[d]; 1404 if (frame->fc == d) RRETURN_NO_MATCH; 1405 } 1406 } 1407 1408 if (min == frame->max) 1409 NEXT_OPCODE; 1410 1411 if (minimize) 1412 { 1413 { 1414 register int d; 1415 for (frame->fi = min;; frame->fi++) 1416 { 1417 RMATCH(38, frame->ecode, frame->eptrb, 0); 1418 if (is_match) RRETURN; 1419 GETCHARINC(d, frame->eptr); 1420 if (d < 128) d = md->lcc[d]; 1421 if (frame->fi >= frame->max || frame->eptr >= md->end_subject || frame->fc == d) 1422 RRETURN; 1423 } 1424 } 1425 /* Control never gets here */ 1426 } 1427 1428 /* Maximize case */ 1429 1430 else 1431 { 1432 frame->pp = frame->eptr; 1433 1434 { 1435 register int d; 1436 for (i = min; i < frame->max; i++) 1437 { 1438 int len = 1; 1439 if (frame->eptr >= md->end_subject) break; 1440 GETCHARLEN(d, frame->eptr, len); 1441 if (d < 128) d = md->lcc[d]; 1442 if (frame->fc == d) break; 1443 frame->eptr += len; 1444 } 1445 for(;;) 1446 { 1447 RMATCH(40, frame->ecode, frame->eptrb, 0); 1448 if (is_match) RRETURN; 1449 if (frame->eptr-- == frame->pp) break; /* Stop if tried at original pos */ 1450 BACKCHAR(frame->eptr); 1451 } 1452 } 1453 1454 RRETURN; 1455 } 1456 /* Control never gets here */ 1457 } 1458 1459 /* Caseful comparisons */ 1460 1461 else 1462 { 1463 { 1464 register int d; 1465 for (i = 1; i <= min; i++) 1466 { 1467 GETCHARINC(d, frame->eptr); 1468 if (frame->fc == d) RRETURN_NO_MATCH; 1469 } 1470 } 1471 1472 if (min == frame->max) 1473 NEXT_OPCODE; 1474 1475 if (minimize) 1476 { 1477 { 1478 register int d; 1479 for (frame->fi = min;; frame->fi++) 1480 { 1481 RMATCH(42, frame->ecode, frame->eptrb, 0); 1482 if (is_match) RRETURN; 1483 GETCHARINC(d, frame->eptr); 1484 if (frame->fi >= frame->max || frame->eptr >= md->end_subject || frame->fc == d) 1485 RRETURN; 1486 } 1487 } 1488 /* Control never gets here */ 1489 } 1490 1491 /* Maximize case */ 1492 1493 else 1494 { 1495 frame->pp = frame->eptr; 1496 1497 { 1498 register int d; 1499 for (i = min; i < frame->max; i++) 1500 { 1501 int len = 1; 1502 if (frame->eptr >= md->end_subject) break; 1503 GETCHARLEN(d, frame->eptr, len); 1504 if (frame->fc == d) break; 1505 frame->eptr += len; 1506 } 1507 for(;;) 1508 { 1509 RMATCH(44, frame->ecode, frame->eptrb, 0); 1510 if (is_match) RRETURN; 1511 if (frame->eptr-- == frame->pp) break; /* Stop if tried at original pos */ 1512 BACKCHAR(frame->eptr); 1513 } 1514 } 1515 1516 RRETURN; 1517 } 1518 } 1519 /* Control never gets here */ 1520 1521 /* Match a single character type repeatedly; several different opcodes 1522 share code. This is very similar to the code for single characters, but we 1523 repeat it in the interests of efficiency. */ 1524 1525 BEGIN_OPCODE(TYPEEXACT): 1526 min = frame->max = GET2(frame->ecode, 1); 1527 minimize = true; 1528 frame->ecode += 3; 1529 goto REPEATTYPE; 1530 1531 BEGIN_OPCODE(TYPEUPTO): 1532 BEGIN_OPCODE(TYPEMINUPTO): 1533 min = 0; 1534 frame->max = GET2(frame->ecode, 1); 1535 minimize = *frame->ecode == OP_TYPEMINUPTO; 1536 frame->ecode += 3; 1537 goto REPEATTYPE; 1538 1539 BEGIN_OPCODE(TYPESTAR): 1540 BEGIN_OPCODE(TYPEMINSTAR): 1541 BEGIN_OPCODE(TYPEPLUS): 1542 BEGIN_OPCODE(TYPEMINPLUS): 1543 BEGIN_OPCODE(TYPEQUERY): 1544 BEGIN_OPCODE(TYPEMINQUERY): 1545 c = *frame->ecode++ - OP_TYPESTAR; 1546 minimize = (c & 1) != 0; 1547 min = rep_min[c]; /* Pick up values from tables; */ 1548 frame->max = rep_max[c]; /* zero for max => infinity */ 1549 if (frame->max == 0) frame->max = INT_MAX; 1550 1551 /* Common code for all repeated single character type matches. Note that 1552 in UTF-8 mode, '.' matches a character of any length, but for the other 1553 character types, the valid characters are all one-byte long. */ 1554 1555 REPEATTYPE: 1556 frame->ctype = *frame->ecode++; /* Code for the character type */ 1557 1558 /* First, ensure the minimum number of matches are present. Use inline 1559 code for maximizing the speed, and do the type test once at the start 1560 (i.e. keep it out of the loop). Also we can test that there are at least 1561 the minimum number of bytes before we start. This isn't as effective in 1562 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that 1563 is tidier. Also separate the UCP code, which can be the same for both UTF-8 1564 and single-bytes. */ 1565 1566 if (min > md->end_subject - frame->eptr) RRETURN_NO_MATCH; 1567 if (min > 0) 1568 { 1569 switch(frame->ctype) 1570 { 1571 case OP_ANY: 1572 for (i = 1; i <= min; i++) 1573 { 1574 if (frame->eptr >= md->end_subject || isNewline(*frame->eptr)) 1575 RRETURN_NO_MATCH; 1576 ++frame->eptr; 1577 while (frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)) frame->eptr++; 1578 } 1579 break; 1580 1581 case OP_NOT_DIGIT: 1582 for (i = 1; i <= min; i++) 1583 { 1584 if (frame->eptr >= md->end_subject) RRETURN_NO_MATCH; 1585 GETCHARINC(c, frame->eptr); 1586 if (isASCIIDigit(c)) 1587 RRETURN_NO_MATCH; 1588 } 1589 break; 1590 1591 case OP_DIGIT: 1592 for (i = 1; i <= min; i++) 1593 { 1594 if (frame->eptr >= md->end_subject || !isASCIIDigit(*frame->eptr++)) 1595 RRETURN_NO_MATCH; 1596 /* No need to skip more bytes - we know it's a 1-byte character */ 1597 } 1598 break; 1599 1600 case OP_NOT_WHITESPACE: 1601 for (i = 1; i <= min; i++) 1602 { 1603 if (frame->eptr >= md->end_subject || 1604 (*frame->eptr < 128 && (md->ctypes[*frame->eptr] & ctype_space) != 0)) 1605 RRETURN_NO_MATCH; 1606 while (++frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)); 1607 } 1608 break; 1609 1610 case OP_WHITESPACE: 1611 for (i = 1; i <= min; i++) 1612 { 1613 if (frame->eptr >= md->end_subject || 1614 *frame->eptr >= 128 || (md->ctypes[*frame->eptr++] & ctype_space) == 0) 1615 RRETURN_NO_MATCH; 1616 /* No need to skip more bytes - we know it's a 1-byte character */ 1617 } 1618 break; 1619 1620 case OP_NOT_WORDCHAR: 1621 for (i = 1; i <= min; i++) 1622 { 1623 if (frame->eptr >= md->end_subject || 1624 (*frame->eptr < 128 && (md->ctypes[*frame->eptr] & ctype_word) != 0)) 1625 RRETURN_NO_MATCH; 1626 while (++frame->eptr < md->end_subject && ISMIDCHAR(*frame->eptr)); 1627 } 1628 break; 1629 1630 case OP_WORDCHAR: 1631 for (i = 1; i <= min; i++) 1632 { 1633 if (frame->eptr >= md->end_subject || 1634 *frame->eptr >= 128 || (md->ctypes[*frame->eptr++] & ctype_word) == 0) 1635 RRETURN_NO_MATCH; 1636 /* No need to skip more bytes - we know it's a 1-byte character */ 1637 } 1638 break; 1639 1640 default: 1641 ASSERT_NOT_REACHED(); 1642 RRETURN_ERROR(JSRegExpErrorInternal); 1643 } /* End switch(frame->ctype) */ 1644 } 1645 1646 /* If min = max, continue at the same level without recursing */ 1647 1648 if (min == frame->max) 1649 NEXT_OPCODE; 1650 1651 /* If minimizing, we have to test the rest of the pattern before each 1652 subsequent match. */ 1653 1654 if (minimize) 1655 { 1656 { 1657 for (frame->fi = min;; frame->fi++) 1658 { 1659 RMATCH(48, frame->ecode, frame->eptrb, 0); 1660 if (is_match) RRETURN; 1661 if (frame->fi >= frame->max || frame->eptr >= md->end_subject) RRETURN; 1662 1663 GETCHARINC(c, frame->eptr); 1664 switch(frame->ctype) 1665 { 1666 case OP_ANY: 1667 if (isNewline(c)) RRETURN; 1668 break; 1669 1670 case OP_NOT_DIGIT: 1671 if (isASCIIDigit(c)) 1672 RRETURN; 1673 break; 1674 1675 case OP_DIGIT: 1676 if (!isASCIIDigit(c)) 1677 RRETURN; 1678 break; 1679 1680 case OP_NOT_WHITESPACE: 1681 if (c < 128 && (md->ctypes[c] & ctype_space) != 0) 1682 RRETURN; 1683 break; 1684 1685 case OP_WHITESPACE: 1686 if (c >= 128 || (md->ctypes[c] & ctype_space) == 0) 1687 RRETURN; 1688 break; 1689 1690 case OP_NOT_WORDCHAR: 1691 if (c < 128 && (md->ctypes[c] & ctype_word) != 0) 1692 RRETURN; 1693 break; 1694 1695 case OP_WORDCHAR: 1696 if (c >= 128 || (md->ctypes[c] & ctype_word) == 0) 1697 RRETURN; 1698 break; 1699 1700 default: 1701 ASSERT_NOT_REACHED(); 1702 RRETURN_ERROR(JSRegExpErrorInternal); 1703 } 1704 } 1705 } 1706 /* Control never gets here */ 1707 } 1708 1709 /* If maximizing it is worth using inline code for speed, doing the type 1710 test once at the start (i.e. keep it out of the loop). */ 1711 1712 else 1713 { 1714 frame->pp = frame->eptr; /* Remember where we started */ 1715 1716 switch(frame->ctype) 1717 { 1718 case OP_ANY: 1719 1720 /* Special code is required for UTF8, but when the maximum is unlimited 1721 we don't need it, so we repeat the non-UTF8 code. This is probably 1722 worth it, because .* is quite a common idiom. */ 1723 1724 if (frame->max < INT_MAX) 1725 { 1726 { 1727 for (i = min; i < frame->max; i++) 1728 { 1729 if (frame->eptr >= md->end_subject || isNewline(*frame->eptr)) break; 1730 frame->eptr++; 1731 while (frame->eptr < md->end_subject && (*frame->eptr & 0xc0) == 0x80) frame->eptr++; 1732 } 1733 } 1734 } 1735 1736 /* Handle unlimited UTF-8 repeat */ 1737 1738 else 1739 { 1740 { 1741 for (i = min; i < frame->max; i++) 1742 { 1743 if (frame->eptr >= md->end_subject || isNewline(*frame->eptr)) break; 1744 frame->eptr++; 1745 } 1746 break; 1747 } 1748 } 1749 break; 1750 1751 case OP_NOT_DIGIT: 1752 for (i = min; i < frame->max; i++) 1753 { 1754 int len = 1; 1755 if (frame->eptr >= md->end_subject) break; 1756 GETCHARLEN(c, frame->eptr, len); 1757 if (isASCIIDigit(c)) break; 1758 frame->eptr+= len; 1759 } 1760 break; 1761 1762 case OP_DIGIT: 1763 for (i = min; i < frame->max; i++) 1764 { 1765 int len = 1; 1766 if (frame->eptr >= md->end_subject) break; 1767 GETCHARLEN(c, frame->eptr, len); 1768 if (!isASCIIDigit(c)) break; 1769 frame->eptr+= len; 1770 } 1771 break; 1772 1773 case OP_NOT_WHITESPACE: 1774 for (i = min; i < frame->max; i++) 1775 { 1776 int len = 1; 1777 if (frame->eptr >= md->end_subject) break; 1778 GETCHARLEN(c, frame->eptr, len); 1779 if (c < 128 && (md->ctypes[c] & ctype_space) != 0) break; 1780 frame->eptr+= len; 1781 } 1782 break; 1783 1784 case OP_WHITESPACE: 1785 for (i = min; i < frame->max; i++) 1786 { 1787 int len = 1; 1788 if (frame->eptr >= md->end_subject) break; 1789 GETCHARLEN(c, frame->eptr, len); 1790 if (c >= 128 ||(md->ctypes[c] & ctype_space) == 0) break; 1791 frame->eptr+= len; 1792 } 1793 break; 1794 1795 case OP_NOT_WORDCHAR: 1796 for (i = min; i < frame->max; i++) 1797 { 1798 int len = 1; 1799 if (frame->eptr >= md->end_subject) break; 1800 GETCHARLEN(c, frame->eptr, len); 1801 if (c < 128 && (md->ctypes[c] & ctype_word) != 0) break; 1802 frame->eptr+= len; 1803 } 1804 break; 1805 1806 case OP_WORDCHAR: 1807 for (i = min; i < frame->max; i++) 1808 { 1809 int len = 1; 1810 if (frame->eptr >= md->end_subject) break; 1811 GETCHARLEN(c, frame->eptr, len); 1812 if (c >= 128 || (md->ctypes[c] & ctype_word) == 0) break; 1813 frame->eptr+= len; 1814 } 1815 break; 1816 1817 default: 1818 ASSERT_NOT_REACHED(); 1819 RRETURN_ERROR(JSRegExpErrorInternal); 1820 } 1821 1822 /* frame->eptr is now past the end of the maximum run */ 1823 1824 for(;;) 1825 { 1826 RMATCH(52, frame->ecode, frame->eptrb, 0); 1827 if (is_match) RRETURN; 1828 if (frame->eptr-- == frame->pp) break; /* Stop if tried at original pos */ 1829 BACKCHAR(frame->eptr); 1830 } 1831 1832 /* Get here if we can't make it match with any permitted repetitions */ 1833 1834 RRETURN; 1835 } 1836 /* Control never gets here */ 1837 1838 BEGIN_OPCODE(CRMINPLUS): 1839 BEGIN_OPCODE(CRMINQUERY): 1840 BEGIN_OPCODE(CRMINRANGE): 1841 BEGIN_OPCODE(CRMINSTAR): 1842 BEGIN_OPCODE(CRPLUS): 1843 BEGIN_OPCODE(CRQUERY): 1844 BEGIN_OPCODE(CRRANGE): 1845 BEGIN_OPCODE(CRSTAR): 1846 ASSERT_NOT_REACHED(); 1847 RRETURN_ERROR(JSRegExpErrorInternal); 1848 1849 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 1850 CAPTURING_BRACKET: 1851 #else 1852 default: 1853 #endif 1854 /* Opening capturing bracket. If there is space in the offset vector, save 1855 the current subject position in the working slot at the top of the vector. We 1856 mustn't change the current values of the data slot, because they may be set 1857 from a previous iteration of this group, and be referred to by a reference 1858 inside the group. 1859 1860 If the bracket fails to match, we need to restore this value and also the 1861 values of the final offsets, in case they were set by a previous iteration of 1862 the same bracket. 1863 1864 If there isn't enough space in the offset vector, treat this as if it were a 1865 non-capturing bracket. Don't worry about setting the flag for the error case 1866 here; that is handled in the code for KET. */ 1867 1868 ASSERT(*frame->ecode > OP_BRA); 1869 1870 frame->number = *frame->ecode - OP_BRA; 1871 1872 /* For extended extraction brackets (large number), we have to fish out the 1873 number from a dummy opcode at the start. */ 1874 1875 if (frame->number > EXTRACT_BASIC_MAX) 1876 frame->number = GET2(frame->ecode, 2+LINK_SIZE); 1877 frame->offset = frame->number << 1; 1878 1879 #ifdef DEBUG 1880 printf("start bracket %d subject=", frame->number); 1881 pchars(frame->eptr, 16, true, md); 1882 printf("\n"); 1883 #endif 1884 1885 if (frame->offset < md->offset_max) 1886 { 1887 frame->save_offset1 = md->offset_vector[frame->offset]; 1888 frame->save_offset2 = md->offset_vector[frame->offset + 1]; 1889 frame->save_offset3 = md->offset_vector[md->offset_end - frame->number]; 1890 1891 DPRINTF(("saving %d %d %d\n", frame->save_offset1, frame->save_offset2, frame->save_offset3)); 1892 md->offset_vector[md->offset_end - frame->number] = frame->eptr - md->start_subject; 1893 1894 do 1895 { 1896 RMATCH(1, frame->ecode + 1 + LINK_SIZE, frame->eptrb, match_isgroup); 1897 if (is_match) RRETURN; 1898 frame->ecode += GET(frame->ecode, 1); 1899 } 1900 while (*frame->ecode == OP_ALT); 1901 1902 DPRINTF(("bracket %d failed\n", frame->number)); 1903 1904 md->offset_vector[frame->offset] = frame->save_offset1; 1905 md->offset_vector[frame->offset + 1] = frame->save_offset2; 1906 md->offset_vector[md->offset_end - frame->number] = frame->save_offset3; 1907 1908 RRETURN; 1909 } 1910 1911 /* Insufficient room for saving captured contents */ 1912 1913 goto NON_CAPTURING_BRACKET; 1914 } 1915 1916 /* Do not stick any code in here without much thought; it is assumed 1917 that "continue" in the code above comes out to here to repeat the main 1918 loop. */ 1919 1920 } /* End of main loop */ 1921 1922 /* Control never reaches here */ 1923 1924 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 1925 1926 RRETURN_SWITCH: 1927 switch (frame->where) 1928 { 1929 case 0: goto RETURN; 1930 case 1: goto RRETURN_1; 1931 case 2: goto RRETURN_2; 1932 case 6: goto RRETURN_6; 1933 case 7: goto RRETURN_7; 1934 case 9: goto RRETURN_9; 1935 case 10: goto RRETURN_10; 1936 case 11: goto RRETURN_11; 1937 case 12: goto RRETURN_12; 1938 case 13: goto RRETURN_13; 1939 case 14: goto RRETURN_14; 1940 case 15: goto RRETURN_15; 1941 case 16: goto RRETURN_16; 1942 case 17: goto RRETURN_17; 1943 case 18: goto RRETURN_18; 1944 case 19: goto RRETURN_19; 1945 case 20: goto RRETURN_20; 1946 case 21: goto RRETURN_21; 1947 case 22: goto RRETURN_22; 1948 case 24: goto RRETURN_24; 1949 case 26: goto RRETURN_26; 1950 case 27: goto RRETURN_27; 1951 case 28: goto RRETURN_28; 1952 case 29: goto RRETURN_29; 1953 case 30: goto RRETURN_30; 1954 case 31: goto RRETURN_31; 1955 case 38: goto RRETURN_38; 1956 case 40: goto RRETURN_40; 1957 case 42: goto RRETURN_42; 1958 case 44: goto RRETURN_44; 1959 case 48: goto RRETURN_48; 1960 case 52: goto RRETURN_52; 1961 } 1962 1963 abort(); 1964 RRETURN_ERROR(JSRegExpErrorInternal); 1965 1966 #endif 1967 1966 1968 1967 RETURN: 1969 1968 return is_match ? MATCH_MATCH : MATCH_NOMATCH; 1970 1969 1971 1970 RETURN_ERROR: 1972 1971 while (!(frame >= stackframes && frame < stackframesend)) {
Note:
See TracChangeset
for help on using the changeset viewer.