Changeset 27802 in webkit for trunk/JavaScriptCore/pcre/pcre_compile.cpp
- Timestamp:
- Nov 14, 2007, 5:17:31 PM (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/pcre/pcre_compile.cpp
r27752 r27802 2422 2422 } 2423 2423 2424 2424
/*
 * First compile pass: scan the pattern once and compute an upper bound on the
 * number of bytes of compiled code that the real compile pass will emit.
 *
 * pattern, patternLength  The pattern text and its length in code units. The
 *                         text is NOT assumed to be terminated; every
 *                         look-ahead is bounded by patternEnd.
 * ignoreCase              Non-zero for caseless matching; this enlarges the
 *                         estimate for wide characters in classes.
 * compile_block           top_backref and backref_map are updated for every
 *                         numeric back reference seen.
 *                         NOTE(review): both fields are read-modify-written
 *                         here, so they are presumably zeroed by the caller or
 *                         by compile_data's constructor -- confirm.
 * errorcode               Taken BY REFERENCE. It is set to the failing ERRn
 *                         code before every error return. The caller tests
 *                         its own errorcode variable after this call, so a
 *                         by-value parameter would silently discard the error.
 *
 * Returns the estimated length, or -1 after setting errorcode.
 */
static int calculateCompiledPatternLengthAndFlags(const pcre_char* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase, compile_data& compile_block, ErrorCode& errorcode)
{
    /* Make a pass over the pattern to compute the amount of store required to
    hold the compiled code. This does not have to be perfect as long as errors
    are overestimates. */

    int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
    int branch_extra = 0;
    int branch_newextra;
    int lastitemlength = 0;
    BOOL class_utf8;
    BOOL capturing;
    unsigned int brastackptr = 0;
    int brastack[BRASTACK_SIZE];
    uschar bralenstack[BRASTACK_SIZE];
    int item_count = -1;
    int bracount = 0;

    /* ptr starts one before the pattern because the loop pre-increments. */
    const pcre_uchar* ptr = (const pcre_uchar*)(pattern - 1);
    const pcre_uchar* patternEnd = (const pcre_uchar*)(pattern + patternLength);

    while (++ptr < patternEnd) {
        int min = 0, max = 0;
        int class_optcount;
        int bracket_length;
        int duplength;

        int c = *ptr;

        item_count++;    /* Is zero for the first item */

        switch (c) {
        /* A backslashed item may be an escaped data character or it may be a
        character type. */

        case '\\':
            c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
            if (errorcode != 0)
                return -1;

            lastitemlength = 1;     /* Default length of last item for repeats */

            if (c >= 0) {           /* Data character */
                length += 2;        /* For a one-byte character */

                if (c > 127) {
                    int i;
                    for (i = 0; i < _pcre_utf8_table1_size; i++)
                        if (c <= _pcre_utf8_table1[i])
                            break;
                    length += i;
                    lastitemlength += i;
                }

                continue;
            }

            /* Other escapes need one byte */

            length++;

            /* A back reference needs an additional 2 bytes, plus either one or 5
            bytes for a repeat. We also need to keep the value of the highest
            back reference. */

            if (c <= -ESC_REF) {
                int refnum = -c - ESC_REF;
                compile_block.backref_map |= (refnum < 32) ? (1 << refnum) : 1;
                if (refnum > compile_block.top_backref)
                    compile_block.top_backref = refnum;
                length += 2;   /* For single back reference */
                if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr + 2, patternEnd)) {
                    ptr = read_repeat_counts(ptr + 2, &min, &max, &errorcode);
                    if (errorcode != 0)
                        return -1;
                    if ((min == 0 && (max == 1 || max == -1)) ||
                        (min == 1 && max == -1))
                        length++;
                    else
                        length += 5;
                    if (ptr + 1 < patternEnd && ptr[1] == '?')
                        ptr++;
                    /* NOTE(review): unlike the other repeat sites, this steps
                    one further than where read_repeat_counts left ptr; kept
                    as in the original -- confirm it is intended. */
                    ptr++;
                }
            }
            continue;

        case '^':     /* Single-byte metacharacters */
        case '.':
        case '$':
            length++;
            lastitemlength = 1;
            continue;

        case '*':            /* These repeats won't be after brackets; */
        case '+':            /* those are handled separately */
        case '?':
            length++;
            goto POSESSIVE;      /* A few lines below */

        /* This covers the cases of braced repeats after a single char, metachar,
        class, or back reference. */

        case '{':
            if (!is_counted_repeat(ptr + 1, patternEnd))
                goto NORMAL_CHAR;
            ptr = read_repeat_counts(ptr + 1, &min, &max, &errorcode);
            if (errorcode != 0)
                return -1;

            /* These special cases just insert one extra opcode */

            if ((min == 0 && (max == 1 || max == -1)) ||
                (min == 1 && max == -1))
                length++;

            /* These cases might insert additional copies of a preceding character. */

            else {
                if (min != 1) {
                    length -= lastitemlength;   /* Uncount the original char or metachar */
                    if (min > 0)
                        length += 3 + lastitemlength;
                }
                length += lastitemlength + ((max > 0) ? 3 : 1);
            }

            if (ptr + 1 < patternEnd && ptr[1] == '?')
                ptr++;      /* Needs no extra length */

        POSESSIVE:                     /* Test for possessive quantifier */
            if (ptr + 1 < patternEnd && ptr[1] == '+') {
                ptr++;
                length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
            }
            continue;

        /* An alternation contains an offset to the next branch or ket. Any extra
        space needed at the start of each branch is accounted for by
        branch_extra. */

        case '|':
            length += 1 + LINK_SIZE + branch_extra;
            continue;

        /* A character class uses 33 characters provided that all the character
        values are less than 256. Otherwise, it uses a bit map for low valued
        characters, and individual items for others. Don't worry about character
        types that aren't allowed in classes - they'll get picked up during the
        compile. A character class that contains only one single-byte character
        uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
        where we can. (In UTF-8 mode we can do this only for chars < 128.) */

        case '[':
            /* Bounds-check before looking at the character after '[': a
            pattern that ends in '[' falls through to the ERR6 test below. */
            ++ptr;
            if (ptr < patternEnd && *ptr == '^') {
                class_optcount = 10;  /* Greater than one */
                ptr++;
            } else
                class_optcount = 0;

            class_utf8 = false;

            for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
                /* Check for escapes */

                if (*ptr == '\\') {
                    c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
                    if (errorcode != 0)
                        return -1;

                    /* \b is backspace inside a class; \X is literal */

                    if (-c == ESC_b)
                        c = '\b';

                    /* Handle escapes that turn into characters */

                    if (c >= 0)
                        goto NON_SPECIAL_CHARACTER;

                    /* Escapes that are meta-things. The normal ones just affect the
                    bit map, but Unicode properties require an XCLASS extended item. */

                    else {
                        class_optcount = 10;         /* \d, \s etc; make sure > 1 */
                    }
                }

                /* Anything else increments the possible optimization count. We have to
                detect ranges here so that we can compute the number of extra ranges for
                caseless wide characters when UCP support is available. If there are wide
                characters, we are going to have to use an XCLASS, even for single
                characters. */

                else {
                    int d;      /* deliberately uninitialized: the goto above jumps past it */

                    {
                        int extra = 0;
                        GETCHARLENEND(c, ptr, patternEnd, extra);
                        ptr += extra;
                    }

                    /* Come here from handling \ above when it escapes to a char value */

                NON_SPECIAL_CHARACTER:
                    class_optcount++;

                    d = -1;
                    if (ptr + 1 < patternEnd && ptr[1] == '-') {
                        pcre_uchar const *hyptr = ptr++;
                        if (ptr + 1 < patternEnd && ptr[1] == '\\') {
                            ptr++;
                            d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
                            if (errorcode != 0)
                                return -1;
                            if (-d == ESC_b)
                                d = '\b';        /* backspace */
                        }
                        else if (ptr + 1 < patternEnd && ptr[1] != ']') {
                            ptr++;
                            {
                                int extra = 0;
                                GETCHARLENEND(d, ptr, patternEnd, extra);
                                ptr += extra;
                            }
                        }
                        if (d < 0)
                            ptr = hyptr;      /* go back to hyphen as data */
                    }

                    /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
                    127 for caseless matching, we will need to use an XCLASS. */

                    if (d >= 0) {
                        class_optcount = 10;     /* Ensure > 1 */
                        if (d < c) {
                            errorcode = ERR8;
                            return -1;
                        }

                        if ((d > 255 || (ignoreCase && d > 127))) {
                            uschar buffer[6];
                            if (!class_utf8) {        /* Allow for XCLASS overhead */
                                class_utf8 = true;
                                length += LINK_SIZE + 2;
                            }

                            /* If we have UCP support, find out how many extra ranges are
                            needed to map the other case of characters within this range. We
                            have to mimic the range optimization here, because extending the
                            range upwards might push d over a boundary that makes it use
                            another byte in the UTF-8 representation. */

                            if (ignoreCase) {
                                int occ, ocd;
                                int cc = c;
                                int origd = d;
                                while (get_othercase_range(&cc, origd, &occ, &ocd)) {
                                    if (occ >= c && ocd <= d)
                                        continue;   /* Skip embedded */

                                    if (occ < c  && ocd >= c - 1) {  /* Extend the basic range */
                                                                     /* if there is overlap,   */
                                        c = occ;                     /* noting that if occ < c */
                                        continue;                    /* we can't have ocd > d  */
                                    }                                /* because a subrange is  */
                                    if (ocd > d && occ <= d + 1) {   /* always shorter than    */
                                                                     /* the basic range.       */
                                        d = ocd;
                                        continue;
                                    }

                                    /* An extra item is needed */

                                    length += 1 + _pcre_ord2utf8(occ, buffer) +
                                        ((occ == ocd) ? 0 : _pcre_ord2utf8(ocd, buffer));
                                }
                            }

                            /* The length of the (possibly extended) range */

                            length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
                        }

                    }

                    /* We have a single character. There is nothing to be done unless we
                    are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
                    allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
                    support. */

                    else {
                        if ((c > 255 || (ignoreCase && c > 127))) {
                            uschar buffer[6];
                            class_optcount = 10;     /* Ensure > 1 */
                            if (!class_utf8) {       /* Allow for XCLASS overhead */
                                class_utf8 = true;
                                length += LINK_SIZE + 2;
                            }
                            length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
                        }
                    }
                }
            }

            if (ptr >= patternEnd) {                 /* Missing terminating ']' */
                errorcode = ERR6;
                return -1;
            }

            /* We can optimize when there was only one optimizable character. Repeats
            for positive and negated single one-byte chars are handled by the general
            code. Here, we handle repeats for the class opcodes. */

            if (class_optcount == 1)
                length += 3;
            else {
                length += 33;

                /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
                we also need extra for wrapping the whole thing in a sub-pattern. */

                if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr + 2, patternEnd)) {
                    ptr = read_repeat_counts(ptr + 2, &min, &max, &errorcode);
                    if (errorcode != 0)
                        return -1;
                    if ((min == 0 && (max == 1 || max == -1)) ||
                        (min == 1 && max == -1))
                        length++;
                    else
                        length += 5;
                    if (ptr + 1 < patternEnd && ptr[1] == '+') {
                        ptr++;
                        length += 2 + 2*LINK_SIZE;
                    }
                    else if (ptr + 1 < patternEnd && ptr[1] == '?')
                        ptr++;
                }
            }
            continue;

        /* Brackets may be genuine groups or special things */

        case '(':
            branch_newextra = 0;
            bracket_length = 1 + LINK_SIZE;
            capturing = false;

            /* Handle special forms of bracket, which all start (? */

            if (ptr + 1 < patternEnd && ptr[1] == '?') {
                switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
                /* Non-referencing groups and lookaheads just move the pointer on, and
                then behave like a non-special bracket, except that they don't increment
                the count of extracting brackets. */

                case ':':
                case '=':
                case '!':
                    ptr += 2;
                    break;

                /* Every other (?x form, including a pattern that ends right
                after "(?", is rejected. */

                default:
                    errorcode = ERR12;
                    return -1;
                }
            }

            else
                capturing = 1;

            /* Capturing brackets must be counted so we can process escapes in a
            Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
            an additional 3 bytes of memory per capturing bracket. */

            if (capturing) {
                bracount++;
                if (bracount > EXTRACT_BASIC_MAX)
                    bracket_length += 3;
            }

            /* Save length for computing whole length at end if there's a repeat that
            requires duplication of the group. Also save the current value of
            branch_extra, and start the new group with the new value (always zero
            in this function, since branch_newextra is never changed from 0). */

            if (brastackptr >= sizeof(brastack)/sizeof(int)) {
                errorcode = ERR17;
                return -1;
            }

            bralenstack[brastackptr] = branch_extra;
            branch_extra = branch_newextra;

            brastack[brastackptr++] = length;
            length += bracket_length;
            continue;

        /* Handle ket. Look for subsequent max/min; for certain sets of values we
        have to replicate this bracket up to that many times. If brastackptr is
        0 this is an unmatched bracket which will generate an error, but take care
        not to try to access brastack[-1] when computing the length and restoring
        the branch_extra value. */

        case ')':
            length += 1 + LINK_SIZE;
            if (brastackptr > 0) {
                duplength = length - brastack[--brastackptr];
                branch_extra = bralenstack[brastackptr];
            }
            else
                duplength = 0;

            /* Leave ptr at the final char; for read_repeat_counts this happens
            automatically; for the others we need an increment. When ')' is the
            last character, c is still ')' and the final else is taken. */

            if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr + 2, patternEnd)) {
                ptr = read_repeat_counts(ptr + 2, &min, &max, &errorcode);
                if (errorcode != 0)
                    return -1;
            }
            else if (c == '*') { min = 0; max = -1; ptr++; }
            else if (c == '+') { min = 1; max = -1; ptr++; }
            else if (c == '?') { min = 0; max = 1;  ptr++; }
            else { min = 1; max = 1; }

            /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
            group, and if the maximum is greater than zero, we have to replicate
            maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
            bracket set. */

            if (min == 0) {
                length++;
                if (max > 0)
                    length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
            }

            /* When the minimum is greater than zero, we have to replicate up to
            minval-1 times, with no additions required in the copies. Then, if there
            is a limited maximum we have to replicate up to maxval-1 times allowing
            for a BRAZERO item before each optional copy and nesting brackets for all
            but one of the optional copies. */

            else {
                length += (min - 1) * duplength;
                if (max > min)   /* Need this test as max=-1 means no limit */
                    length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
                        - (2 + 2*LINK_SIZE);
            }

            /* Allow space for once brackets for "possessive quantifier" */

            if (ptr + 1 < patternEnd && ptr[1] == '+') {
                ptr++;
                length += 2 + 2*LINK_SIZE;
            }
            continue;

        /* Non-special character: always a literal. */

        default:
        NORMAL_CHAR:

            length += 2;          /* For a one-byte character */
            lastitemlength = 1;   /* Default length of last item for repeats */

            /* Wide characters need additional bytes. */

            if (c > 127) {
                if (IS_LEADING_SURROGATE(c)) {
                    c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0);
                    ++ptr;
                }

                {
                    int i;
                    for (i = 0; i < _pcre_utf8_table1_size; i++)
                        if (c <= _pcre_utf8_table1[i])
                            break;
                    length += i;
                    lastitemlength += i;
                }
            }

            continue;
        }
    }

    length += 2 + LINK_SIZE;    /* For final KET and END */
    return length;
}

2944 #ifdef DEBUG 2945 static void printCompiledRegExp(real_pcre* re, int length) 2946 { 2947 printf("Length = %d top_bracket = %d top_backref = %d\n", 2948 length, re->top_bracket, re->top_backref); 2949 2950 if (re->options) { 2951 printf("%s%s%s\n", 2952 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", 2953 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "", 2954 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : ""); 2955 } 2956 2957 if (re->options & PCRE_FIRSTSET) { 2958 char ch = re->first_byte & 255; 2959 const char* caseless = ((re->first_byte & REQ_CASELESS) == 0) ? "" : " (caseless)"; 2960 if (isASCIIAlphanumeric(ch)) 2961 printf("First char = %c%s\n", ch, caseless); 2962 else 2963 printf("First char = \\x%02x%s\n", ch, caseless); 2964 } 2965 2966 if (re->options & PCRE_REQCHSET) { 2967 char ch = re->req_byte & 255; 2968 const char* caseless = ((re->req_byte & REQ_CASELESS) == 0) ?
"" : " (caseless)"; 2969 if (isASCIIAlphanumeric(ch)) 2970 printf("Req char = %c%s\n", ch, caseless); 2971 else 2972 printf("Req char = \\x%02x%s\n", ch, caseless); 2973 } 2974 2975 // This debugging function has been removed from JavaScriptCore's PCRE 2976 //pcre_printint(re, stdout); 2977 } 2978 #endif 2425 2979 2426 2980 /************************************************* … … 2446 3000 */ 2447 3001 3002 static pcre* returnError(ErrorCode errorcode, const char** errorptr) 3003 { 3004 *errorptr = error_text(errorcode); 3005 return 0; 3006 } 3007 2448 3008 pcre * 2449 3009 jsRegExpCompile(const pcre_char* pattern, int patternLength, 2450 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,2451 unsigned* numSubpatterns, const char** errorptr)3010 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline, 3011 unsigned* numSubpatterns, const char** errorptr) 2452 3012 { 2453 real_pcre *re; 2454 int length = 1 + LINK_SIZE; /* For initial BRA plus length */ 2455 int c, firstbyte, reqbyte; 2456 int bracount = 0; 2457 int branch_extra = 0; 2458 int branch_newextra; 2459 int item_count = -1; 2460 int name_count = 0; 2461 int max_name_size = 0; 2462 int lastitemlength = 0; 2463 ErrorCode errorcode = ERR0; 2464 BOOL class_utf8; 2465 BOOL capturing; 2466 unsigned int brastackptr = 0; 2467 size_t size; 2468 uschar *code; 2469 const uschar *codestart; 2470 const pcre_uchar *ptr; 2471 const pcre_uchar *patternEnd; 2472 compile_data compile_block; 2473 int brastack[BRASTACK_SIZE]; 2474 uschar bralenstack[BRASTACK_SIZE]; 2475 2476 /* We can't pass back an error message if errorptr is NULL; I guess the best we 2477 can do is just return NULL, but we can set a code value if there is a code 2478 pointer. 
*/ 2479 2480 if (errorptr == NULL) 2481 { 2482 return NULL; 2483 } 2484 2485 *errorptr = NULL; 2486 2487 /* Set up pointers to the individual character tables */ 2488 2489 compile_block.lcc = _pcre_default_tables + lcc_offset; 2490 compile_block.fcc = _pcre_default_tables + fcc_offset; 2491 compile_block.cbits = _pcre_default_tables + cbits_offset; 2492 compile_block.ctypes = _pcre_default_tables + ctypes_offset; 2493 2494 /* Maximum back reference and backref bitmap. This is updated for numeric 2495 references during the first pass, but for named references during the actual 2496 compile pass. The bitmap records up to 31 back references to help in deciding 2497 whether (.*) can be treated as anchored or not. */ 2498 2499 compile_block.top_backref = 0; 2500 compile_block.backref_map = 0; 2501 2502 /* Reflect pattern for debugging output */ 2503 2504 DPRINTF(("------------------------------------------------------------------\n")); 2505 2506 /* The first thing to do is to make a pass over the pattern to compute the 2507 amount of store required to hold the compiled code. This does not have to be 2508 perfect as long as errors are overestimates. At the same time we can detect any 2509 flag settings right at the start, and extract them. Make an attempt to correct 2510 for any counted white space if an "extended" flag setting appears late in the 2511 pattern. We can't be so clever for #-comments. */ 2512 2513 ptr = (const pcre_uchar *)(pattern - 1); 2514 patternEnd = (const pcre_uchar *)(pattern + patternLength); 2515 2516 while (++ptr < patternEnd) 2517 { 2518 int min = 0, max = 0; 2519 int class_optcount; 2520 int bracket_length; 2521 int duplength; 2522 2523 c = *ptr; 2524 2525 item_count++; /* Is zero for the first non-comment item */ 2526 2527 switch(c) 2528 { 2529 /* A backslashed item may be an escaped data character or it may be a 2530 character type. 
*/ 2531 2532 case '\\': 2533 c = check_escape(&ptr, patternEnd, &errorcode, bracount, false); 2534 if (errorcode != 0) goto PCRE_ERROR_RETURN; 2535 2536 lastitemlength = 1; /* Default length of last item for repeats */ 2537 2538 if (c >= 0) /* Data character */ 2539 { 2540 length += 2; /* For a one-byte character */ 2541 2542 if (c > 127) 2543 { 2544 int i; 2545 for (i = 0; i < _pcre_utf8_table1_size; i++) 2546 if (c <= _pcre_utf8_table1[i]) break; 2547 length += i; 2548 lastitemlength += i; 2549 } 2550 2551 continue; 2552 } 2553 2554 /* Other escapes need one byte */ 2555 2556 length++; 2557 2558 /* A back reference needs an additional 2 bytes, plus either one or 5 2559 bytes for a repeat. We also need to keep the value of the highest 2560 back reference. */ 2561 2562 if (c <= -ESC_REF) 2563 { 2564 int refnum = -c - ESC_REF; 2565 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1; 2566 if (refnum > compile_block.top_backref) 2567 compile_block.top_backref = refnum; 2568 length += 2; /* For single back reference */ 2569 if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd)) 2570 { 2571 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); 2572 if (errorcode != 0) goto PCRE_ERROR_RETURN; 2573 if ((min == 0 && (max == 1 || max == -1)) || 2574 (min == 1 && max == -1)) 2575 length++; 2576 else length += 5; 2577 if (ptr[1] == '?') ptr++; 2578 ptr++; 2579 } 2580 } 2581 continue; 2582 2583 case '^': /* Single-byte metacharacters */ 2584 case '.': 2585 case '$': 2586 length++; 2587 lastitemlength = 1; 2588 continue; 2589 2590 case '*': /* These repeats won't be after brackets; */ 2591 case '+': /* those are handled separately */ 2592 case '?': 2593 length++; 2594 goto POSESSIVE; /* A few lines below */ 2595 2596 /* This covers the cases of braced repeats after a single char, metachar, 2597 class, or back reference. 
*/ 2598 2599 case '{': 2600 if (!is_counted_repeat(ptr+1, patternEnd)) goto NORMAL_CHAR; 2601 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode); 2602 if (errorcode != 0) goto PCRE_ERROR_RETURN; 2603 2604 /* These special cases just insert one extra opcode */ 2605 2606 if ((min == 0 && (max == 1 || max == -1)) || 2607 (min == 1 && max == -1)) 2608 length++; 2609 2610 /* These cases might insert additional copies of a preceding character. */ 2611 2612 else 2613 { 2614 if (min != 1) 2615 { 2616 length -= lastitemlength; /* Uncount the original char or metachar */ 2617 if (min > 0) length += 3 + lastitemlength; 2618 } 2619 length += lastitemlength + ((max > 0)? 3 : 1); 2620 } 2621 2622 if (ptr[1] == '?') ptr++; /* Needs no extra length */ 2623 2624 POSESSIVE: /* Test for possessive quantifier */ 2625 if (ptr[1] == '+') 2626 { 2627 ptr++; 2628 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */ 2629 } 2630 continue; 2631 2632 /* An alternation contains an offset to the next branch or ket. If any ims 2633 options changed in the previous branch(es), and/or if we are in a 2634 lookbehind assertion, extra space will be needed at the start of the 2635 branch. This is handled by branch_extra. */ 2636 2637 case '|': 2638 length += 1 + LINK_SIZE + branch_extra; 2639 continue; 2640 2641 /* A character class uses 33 characters provided that all the character 2642 values are less than 256. Otherwise, it uses a bit map for low valued 2643 characters, and individual items for others. Don't worry about character 2644 types that aren't allowed in classes - they'll get picked up during the 2645 compile. A character class that contains only one single-byte character 2646 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this 2647 where we can. (In UTF-8 mode we can do this only for chars < 128.) 
*/ 2648 2649 case '[': 2650 if (*(++ptr) == '^') 2651 { 2652 class_optcount = 10; /* Greater than one */ 2653 ptr++; 2654 } 2655 else class_optcount = 0; 2656 2657 class_utf8 = false; 2658 2659 for (; ptr < patternEnd && *ptr != ']'; ++ptr) 2660 { 2661 /* Check for escapes */ 2662 2663 if (*ptr == '\\') 2664 { 2665 c = check_escape(&ptr, patternEnd, &errorcode, bracount, true); 2666 if (errorcode != 0) goto PCRE_ERROR_RETURN; 2667 2668 /* \b is backspace inside a class; \X is literal */ 2669 2670 if (-c == ESC_b) c = '\b'; 2671 2672 /* Handle escapes that turn into characters */ 2673 2674 if (c >= 0) goto NON_SPECIAL_CHARACTER; 2675 2676 /* Escapes that are meta-things. The normal ones just affect the 2677 bit map, but Unicode properties require an XCLASS extended item. */ 2678 2679 else 2680 { 2681 class_optcount = 10; /* \d, \s etc; make sure > 1 */ 2682 } 2683 } 2684 2685 /* Anything else increments the possible optimization count. We have to 2686 detect ranges here so that we can compute the number of extra ranges for 2687 caseless wide characters when UCP support is available. If there are wide 2688 characters, we are going to have to use an XCLASS, even for single 2689 characters. 
*/ 2690 2691 else 2692 { 2693 int d; 2694 2695 { 2696 int extra = 0; 2697 GETCHARLENEND(c, ptr, patternEnd, extra); 2698 ptr += extra; 2699 } 2700 2701 /* Come here from handling \ above when it escapes to a char value */ 2702 2703 NON_SPECIAL_CHARACTER: 2704 class_optcount++; 2705 2706 d = -1; 2707 if (ptr + 1 < patternEnd && ptr[1] == '-') 2708 { 2709 pcre_uchar const *hyptr = ptr++; 2710 if (ptr + 1 < patternEnd && ptr[1] == '\\') 2711 { 2712 ptr++; 2713 d = check_escape(&ptr, patternEnd, &errorcode, bracount, true); 2714 if (errorcode != 0) goto PCRE_ERROR_RETURN; 2715 if (-d == ESC_b) d = '\b'; /* backspace */ 3013 /* We can't pass back an error message if errorptr is NULL; I guess the best we 3014 can do is just return NULL, but we can set a code value if there is a code 3015 pointer. */ 3016 if (!errorptr) 3017 return 0; 3018 3019 *errorptr = NULL; 3020 3021 /* Set up pointers to the individual character tables */ 3022 3023 compile_data compile_block; 3024 3025 ErrorCode errorcode = ERR0; 3026 int length = calculateCompiledPatternLengthAndFlags(pattern, patternLength, ignoreCase, compile_block, errorcode); 3027 if (errorcode) 3028 return returnError(errorcode, errorptr); 3029 3030 if (length > MAX_PATTERN_SIZE) 3031 return returnError(ERR16, errorptr); 3032 3033 /* Compute the size of data block needed and get it. */ 3034 3035 size_t size = length + sizeof(real_pcre); 3036 real_pcre* re = reinterpret_cast<real_pcre*>(new char[size]); 3037 3038 if (!re) 3039 return returnError(ERR13, errorptr); 3040 3041 /* Put in the magic number, and save the sizes, options, and character table 3042 pointer. NULL is used for the default character tables. The nullpad field is at 3043 the end; it's there to help in the case when a regex compiled on a system with 3044 4-byte pointers is run on another with 8-byte pointers. */ 3045 3046 re->size = (pcre_uint32)size; 3047 re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? 
PCRE_MULTILINE : 0); 3048 3049 /* The starting points of the name/number translation table and of the code are 3050 passed around in the compile data block. */ 3051 3052 const uschar* codestart = (const uschar*)(re + 1); 3053 compile_block.start_code = codestart; 3054 compile_block.start_pattern = (const pcre_uchar*)pattern; 3055 3056 /* Set up a starting, non-extracting bracket, then compile the expression. On 3057 error, errorcode will be set non-zero, so we don't need to look at the result 3058 of the function here. */ 3059 3060 const pcre_uchar* ptr = (const pcre_uchar*)pattern; 3061 const pcre_uchar* patternEnd = pattern + patternLength; 3062 uschar* code = (uschar*)codestart; 3063 *code = OP_BRA; 3064 int firstbyte, reqbyte; 3065 int bracketCount = 0; 3066 (void)compile_regex(re->options, &bracketCount, &code, &ptr, 3067 patternEnd, 3068 &errorcode, 0, &firstbyte, &reqbyte, &compile_block); 3069 re->top_bracket = bracketCount; 3070 re->top_backref = compile_block.top_backref; 3071 3072 /* If not reached end of pattern on success, there's an excess bracket. */ 3073 3074 if (errorcode == 0 && ptr < patternEnd) 3075 errorcode = ERR10; 3076 3077 /* Fill in the terminating state and check for disastrous overflow, but 3078 if debugging, leave the test till after things are printed out. */ 3079 3080 *code++ = OP_END; 3081 3082 #ifndef DEBUG 3083 if (code - codestart > length) 3084 errorcode = ERR7; 3085 #endif 3086 3087 /* Give an error if there's back reference to a non-existent capturing 3088 subpattern. 
*/ 3089 3090 if (re->top_backref > re->top_bracket) 3091 errorcode = ERR15; 3092 3093 /* Failed to compile, or error while post-processing */ 3094 3095 if (errorcode != ERR0) { 3096 delete [] reinterpret_cast<char*>(re); 3097 return returnError(errorcode, errorptr); 3098 } 3099 3100 /* If the anchored option was not passed, set the flag if we can determine that 3101 the pattern is anchored by virtue of ^ characters or \A or anything else (such 3102 as starting with .* when DOTALL is set). 3103 3104 Otherwise, if we know what the first character has to be, save it, because that 3105 speeds up unanchored matches no end. If not, see if we can set the 3106 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches 3107 start with ^. and also when all branches start with .* for non-DOTALL matches. 3108 */ 3109 3110 if (is_anchored(codestart, re->options, 0, compile_block.backref_map)) 3111 re->options |= PCRE_ANCHORED; 3112 else { 3113 if (firstbyte < 0) 3114 firstbyte = find_firstassertedchar(codestart, re->options, false); 3115 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ 3116 { 3117 int ch = firstbyte & 255; 3118 if (ch < 127) { 3119 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && 3120 compile_block.fcc[ch] == ch)? ch : firstbyte; 3121 re->options |= PCRE_FIRSTSET; 2716 3122 } 2717 else if (ptr + 1 < patternEnd && ptr[1] != ']') 2718 { 2719 ptr++; 2720 { 2721 int extra = 0; 2722 GETCHARLENEND(d, ptr, patternEnd, extra); 2723 ptr += extra; 2724 } 2725 } 2726 if (d < 0) ptr = hyptr; /* go back to hyphen as data */ 2727 } 2728 2729 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or > 2730 127 for caseless matching, we will need to use an XCLASS. 
*/ 2731 2732 if (d >= 0) 2733 { 2734 class_optcount = 10; /* Ensure > 1 */ 2735 if (d < c) 2736 { 2737 errorcode = ERR8; 2738 goto PCRE_ERROR_RETURN; 2739 } 2740 2741 if ((d > 255 || (ignoreCase && d > 127))) 2742 { 2743 uschar buffer[6]; 2744 if (!class_utf8) /* Allow for XCLASS overhead */ 2745 { 2746 class_utf8 = true; 2747 length += LINK_SIZE + 2; 2748 } 2749 2750 /* If we have UCP support, find out how many extra ranges are 2751 needed to map the other case of characters within this range. We 2752 have to mimic the range optimization here, because extending the 2753 range upwards might push d over a boundary that makes is use 2754 another byte in the UTF-8 representation. */ 2755 2756 if (ignoreCase) 2757 { 2758 int occ, ocd; 2759 int cc = c; 2760 int origd = d; 2761 while (get_othercase_range(&cc, origd, &occ, &ocd)) 2762 { 2763 if (occ >= c && ocd <= d) continue; /* Skip embedded */ 2764 2765 if (occ < c && ocd >= c - 1) /* Extend the basic range */ 2766 { /* if there is overlap, */ 2767 c = occ; /* noting that if occ < c */ 2768 continue; /* we can't have ocd > d */ 2769 } /* because a subrange is */ 2770 if (ocd > d && occ <= d + 1) /* always shorter than */ 2771 { /* the basic range. */ 2772 d = ocd; 2773 continue; 2774 } 2775 2776 /* An extra item is needed */ 2777 2778 length += 1 + _pcre_ord2utf8(occ, buffer) + 2779 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer)); 2780 } 2781 } 2782 2783 /* The length of the (possibly extended) range */ 2784 2785 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer); 2786 } 2787 2788 } 2789 2790 /* We have a single character. There is nothing to be done unless we 2791 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must 2792 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP 2793 support. 
*/ 2794 2795 else 2796 { 2797 if ((c > 255 || (ignoreCase && c > 127))) 2798 { 2799 uschar buffer[6]; 2800 class_optcount = 10; /* Ensure > 1 */ 2801 if (!class_utf8) /* Allow for XCLASS overhead */ 2802 { 2803 class_utf8 = true; 2804 length += LINK_SIZE + 2; 2805 } 2806 length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer)); 2807 } 2808 } 2809 } 2810 } 2811 2812 if (ptr >= patternEnd) /* Missing terminating ']' */ 2813 { 2814 errorcode = ERR6; 2815 goto PCRE_ERROR_RETURN; 2816 } 2817 2818 /* We can optimize when there was only one optimizable character. Repeats 2819 for positive and negated single one-byte chars are handled by the general 2820 code. Here, we handle repeats for the class opcodes. */ 2821 2822 if (class_optcount == 1) length += 3; else 2823 { 2824 length += 33; 2825 2826 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier, 2827 we also need extra for wrapping the whole thing in a sub-pattern. */ 2828 2829 if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd)) 2830 { 2831 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); 2832 if (errorcode != 0) goto PCRE_ERROR_RETURN; 2833 if ((min == 0 && (max == 1 || max == -1)) || 2834 (min == 1 && max == -1)) 2835 length++; 2836 else length += 5; 2837 if (ptr + 1 < patternEnd && ptr[1] == '+') 2838 { 2839 ptr++; 2840 length += 2 + 2*LINK_SIZE; 2841 } 2842 else if (ptr + 1 < patternEnd && ptr[1] == '?') ptr++; 2843 } 2844 } 2845 continue; 2846 2847 /* Brackets may be genuine groups or special things */ 2848 2849 case '(': 2850 branch_newextra = 0; 2851 bracket_length = 1 + LINK_SIZE; 2852 capturing = false; 2853 2854 /* Handle special forms of bracket, which all start (? */ 2855 2856 if (ptr + 1 < patternEnd && ptr[1] == '?') 2857 { 2858 switch (c = (ptr + 2 < patternEnd ? 
ptr[2] : 0)) 2859 { 2860 /* Non-referencing groups and lookaheads just move the pointer on, and 2861 then behave like a non-special bracket, except that they don't increment 2862 the count of extracting brackets. Ditto for the "once only" bracket, 2863 which is in Perl from version 5.005. */ 2864 2865 case ':': 2866 case '=': 2867 case '!': 2868 ptr += 2; 2869 break; 2870 2871 /* Else loop checking valid options until ) is met. Anything else is an 2872 error. If we are without any brackets, i.e. at top level, the settings 2873 act as if specified in the options, so massage the options immediately. 2874 This is for backward compatibility with Perl 5.004. */ 2875 2876 default: 2877 errorcode = ERR12; 2878 goto PCRE_ERROR_RETURN; 2879 } 2880 } 2881 2882 else capturing = 1; 2883 2884 /* Capturing brackets must be counted so we can process escapes in a 2885 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need 2886 an additional 3 bytes of memory per capturing bracket. */ 2887 2888 if (capturing) 2889 { 2890 bracount++; 2891 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; 2892 } 2893 2894 /* Save length for computing whole length at end if there's a repeat that 2895 requires duplication of the group. Also save the current value of 2896 branch_extra, and start the new group with the new value. If non-zero, this 2897 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ 2898 2899 if (brastackptr >= sizeof(brastack)/sizeof(int)) 2900 { 2901 errorcode = ERR17; 2902 goto PCRE_ERROR_RETURN; 2903 } 2904 2905 bralenstack[brastackptr] = branch_extra; 2906 branch_extra = branch_newextra; 2907 2908 brastack[brastackptr++] = length; 2909 length += bracket_length; 2910 continue; 2911 2912 /* Handle ket. Look for subsequent max/min; for certain sets of values we 2913 have to replicate this bracket up to that many times. 
If brastackptr is 2914 0 this is an unmatched bracket which will generate an error, but take care 2915 not to try to access brastack[-1] when computing the length and restoring 2916 the branch_extra value. */ 2917 2918 case ')': 2919 length += 1 + LINK_SIZE; 2920 if (brastackptr > 0) 2921 { 2922 duplength = length - brastack[--brastackptr]; 2923 branch_extra = bralenstack[brastackptr]; 2924 } 2925 else duplength = 0; 2926 2927 /* Leave ptr at the final char; for read_repeat_counts this happens 2928 automatically; for the others we need an increment. */ 2929 2930 if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd)) 2931 { 2932 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); 2933 if (errorcode != 0) goto PCRE_ERROR_RETURN; 2934 } 2935 else if (c == '*') { min = 0; max = -1; ptr++; } 2936 else if (c == '+') { min = 1; max = -1; ptr++; } 2937 else if (c == '?') { min = 0; max = 1; ptr++; } 2938 else { min = 1; max = 1; } 2939 2940 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the 2941 group, and if the maximum is greater than zero, we have to replicate 2942 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting 2943 bracket set. */ 2944 2945 if (min == 0) 2946 { 2947 length++; 2948 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE); 2949 } 2950 2951 /* When the minimum is greater than zero, we have to replicate up to 2952 minval-1 times, with no additions required in the copies. Then, if there 2953 is a limited maximum we have to replicate up to maxval-1 times allowing 2954 for a BRAZERO item before each optional copy and nesting brackets for all 2955 but one of the optional copies. 
*/ 2956 2957 else 2958 { 2959 length += (min - 1) * duplength; 2960 if (max > min) /* Need this test as max=-1 means no limit */ 2961 length += (max - min) * (duplength + 3 + 2*LINK_SIZE) 2962 - (2 + 2*LINK_SIZE); 2963 } 2964 2965 /* Allow space for once brackets for "possessive quantifier" */ 2966 2967 if (ptr + 1 < patternEnd && ptr[1] == '+') 2968 { 2969 ptr++; 2970 length += 2 + 2*LINK_SIZE; 2971 } 2972 continue; 2973 2974 /* Non-special character. It won't be space or # in extended mode, so it is 2975 always a genuine character. If we are in a \Q...\E sequence, check for the 2976 end; if not, we have a literal. */ 2977 2978 default: 2979 NORMAL_CHAR: 2980 2981 length += 2; /* For a one-byte character */ 2982 lastitemlength = 1; /* Default length of last item for repeats */ 2983 2984 /* In UTF-8 mode, check for additional bytes. */ 2985 2986 if (c > 127) 2987 { 2988 if (IS_LEADING_SURROGATE(c)) 2989 { 2990 c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0); 2991 ++ptr; 2992 } 2993 2994 { 2995 int i; 2996 for (i = 0; i < _pcre_utf8_table1_size; i++) 2997 if (c <= _pcre_utf8_table1[i]) break; 2998 length += i; 2999 lastitemlength += i; 3000 } 3001 } 3002 3003 continue; 3123 } 3124 else if (is_startline(codestart, 0, compile_block.backref_map)) 3125 re->options |= PCRE_STARTLINE; 3004 3126 } 3005 } 3006 3007 length += 2 + LINK_SIZE; /* For final KET and END */ 3008 3009 if (length > MAX_PATTERN_SIZE) 3010 { 3011 errorcode = ERR16; 3012 goto PCRE_ERROR_RETURN; 3013 } 3014 3015 /* Compute the size of data block needed and get it. */ 3016 3017 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); 3018 re = reinterpret_cast<real_pcre*>(new char[size]); 3019 3020 if (re == NULL) 3021 { 3022 errorcode = ERR13; 3023 goto PCRE_ERROR_RETURN; 3024 } 3025 3026 /* Put in the magic number, and save the sizes, options, and character table 3027 pointer. NULL is used for the default character tables. 
The nullpad field is at 3028 the end; it's there to help in the case when a regex compiled on a system with 3029 4-byte pointers is run on another with 8-byte pointers. */ 3030 3031 re->size = (pcre_uint32)size; 3032 re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0); 3033 3034 /* The starting points of the name/number translation table and of the code are 3035 passed around in the compile data block. */ 3036 3037 codestart = (const uschar *)(re + 1); 3038 compile_block.start_code = codestart; 3039 compile_block.start_pattern = (const pcre_uchar *)pattern; 3040 compile_block.req_varyopt = 0; 3041 3042 /* Set up a starting, non-extracting bracket, then compile the expression. On 3043 error, errorcode will be set non-zero, so we don't need to look at the result 3044 of the function here. */ 3045 3046 ptr = (const pcre_uchar *)pattern; 3047 code = (uschar *)codestart; 3048 *code = OP_BRA; 3049 bracount = 0; 3050 (void)compile_regex(re->options, &bracount, &code, &ptr, 3051 patternEnd, 3052 &errorcode, 0, &firstbyte, &reqbyte, &compile_block); 3053 re->top_bracket = bracount; 3054 re->top_backref = compile_block.top_backref; 3055 3056 /* If not reached end of pattern on success, there's an excess bracket. */ 3057 3058 if (errorcode == 0 && ptr < patternEnd) errorcode = ERR10; 3059 3060 /* Fill in the terminating state and check for disastrous overflow, but 3061 if debugging, leave the test till after things are printed out. */ 3062 3063 *code++ = OP_END; 3064 3065 #ifndef DEBUG 3066 if (code - codestart > length) errorcode = ERR7; 3127 3128 /* For an anchored pattern, we use the "required byte" only if it follows a 3129 variable length item in the regex. Remove the caseless flag for non-caseable 3130 bytes. */ 3131 3132 if (reqbyte >= 0 && (!(re->options & PCRE_ANCHORED) || (reqbyte & REQ_VARY))) { 3133 int ch = reqbyte & 255; 3134 if (ch < 127) { 3135 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && 3136 compile_block.fcc[ch] == ch)? 
(reqbyte & ~REQ_CASELESS) : reqbyte; 3137 re->options |= PCRE_REQCHSET; 3138 } 3139 } 3140 3141 #ifdef DEBUG 3142 printCompiledRegExp(re); 3143 3144 /* This check is done here in the debugging case so that the code that 3145 was compiled can be seen. */ 3146 if (code - codestart > length) { 3147 (pcre_free)(re); 3148 *errorptr = error_text(ERR7); 3149 return NULL; 3150 } 3151 3067 3152 #endif 3068 3069 /* Give an error if there's back reference to a non-existent capturing 3070 subpattern. */ 3071 3072 if (re->top_backref > re->top_bracket) errorcode = ERR15; 3073 3074 /* Failed to compile, or error while post-processing */ 3075 3076 if (errorcode != ERR0) 3077 { 3078 delete [] reinterpret_cast<char*>(re); 3079 PCRE_ERROR_RETURN: 3080 *errorptr = error_text(errorcode); 3081 return NULL; 3082 } 3083 3084 /* If the anchored option was not passed, set the flag if we can determine that 3085 the pattern is anchored by virtue of ^ characters or \A or anything else (such 3086 as starting with .* when DOTALL is set). 3087 3088 Otherwise, if we know what the first character has to be, save it, because that 3089 speeds up unanchored matches no end. If not, see if we can set the 3090 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches 3091 start with ^. and also when all branches start with .* for non-DOTALL matches. 3092 */ 3093 3094 { 3095 if (is_anchored(codestart, re->options, 0, compile_block.backref_map)) 3096 re->options |= PCRE_ANCHORED; 3097 else 3098 { 3099 if (firstbyte < 0) 3100 firstbyte = find_firstassertedchar(codestart, re->options, false); 3101 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ 3102 { 3103 int ch = firstbyte & 255; 3104 if (ch < 127) 3105 { /* Strange indentation to aid in merging. */ 3106 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && 3107 compile_block.fcc[ch] == ch)? 
ch : firstbyte; 3108 re->options |= PCRE_FIRSTSET; 3109 } 3110 } 3111 else if (is_startline(codestart, 0, compile_block.backref_map)) 3112 re->options |= PCRE_STARTLINE; 3113 } 3114 } 3115 3116 /* For an anchored pattern, we use the "required byte" only if it follows a 3117 variable length item in the regex. Remove the caseless flag for non-caseable 3118 bytes. */ 3119 3120 if (reqbyte >= 0 && 3121 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) 3122 { 3123 int ch = reqbyte & 255; 3124 if (ch < 127) 3125 { /* Strange indentation to aid in merging. */ 3126 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && 3127 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; 3128 re->options |= PCRE_REQCHSET; 3129 } 3130 } 3131 3132 /* Print out the compiled data if debugging is enabled. This is never the 3133 case when building a production library. */ 3134 3135 #ifdef DEBUG 3136 3137 printf("Length = %d top_bracket = %d top_backref = %d\n", 3138 length, re->top_bracket, re->top_backref); 3139 3140 if (re->options != 0) 3141 { 3142 printf("%s%s%s\n", 3143 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", 3144 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "", 3145 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : ""); 3146 } 3147 3148 if ((re->options & PCRE_FIRSTSET) != 0) 3149 { 3150 int ch = re->first_byte & 255; 3151 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? 3152 "" : " (caseless)"; 3153 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); 3154 else printf("First char = \\x%02x%s\n", ch, caseless); 3155 } 3156 3157 if ((re->options & PCRE_REQCHSET) != 0) 3158 { 3159 int ch = re->req_byte & 255; 3160 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? 
3161 "" : " (caseless)"; 3162 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); 3163 else printf("Req char = \\x%02x%s\n", ch, caseless); 3164 } 3165 3166 pcre_printint(re, stdout); 3167 3168 /* This check is done here in the debugging case so that the code that 3169 was compiled can be seen. */ 3170 3171 if (code - codestart > length) 3172 { 3173 (pcre_free)(re); 3174 *errorptr = error_text(ERR7); 3175 return NULL; 3176 } 3177 3178 #endif 3179 3180 if (numSubpatterns) 3181 *numSubpatterns = re->top_bracket; 3182 return (pcre *)re; 3153 3154 if (numSubpatterns) 3155 *numSubpatterns = re->top_bracket; 3156 return (pcre *)re; 3183 3157 } 3184 3158
Note:
See TracChangeset
for help on using the changeset viewer.