Ignore:
Timestamp:
Nov 14, 2007, 5:17:31 PM (18 years ago)
Author:
[email protected]
Message:

2007-11-14 Eric Seidel <[email protected]>

Reviewed by Sam.

Give PCRE a (small) bath.
Fix some formatting and break things off into separate functions.
http://bugs.webkit.org/show_bug.cgi?id=15993

  • pcre/pcre_compile.cpp: (calculateCompiledPatternLengthAndFlags): (printCompiledRegExp): (returnError): (jsRegExpCompile):
  • pcre/pcre_internal.h: (compile_data::compile_data):
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/pcre/pcre_compile.cpp

    r27752 r27802  
    24222422}
    24232423
    2424 
     2424static int calculateCompiledPatternLengthAndFlags(const pcre_char* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase, compile_data& compile_block, ErrorCode errorcode)
     2425{
     2426    /* Make a pass over the pattern to compute the
     2427     amount of store required to hold the compiled code. This does not have to be
     2428     perfect as long as errors are overestimates. At the same time we can detect any
     2429     flag settings right at the start, and extract them. Make an attempt to correct
     2430     for any counted white space if an "extended" flag setting appears late in the
     2431     pattern. We can't be so clever for #-comments. */
     2432   
     2433    int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
     2434    int branch_extra = 0;
     2435    int branch_newextra;
     2436    int lastitemlength = 0;
     2437    BOOL class_utf8;
     2438    BOOL capturing;
     2439    unsigned int brastackptr = 0;
     2440    int brastack[BRASTACK_SIZE];
     2441    uschar bralenstack[BRASTACK_SIZE];
     2442    int item_count = -1;
     2443    int bracount = 0;
     2444   
     2445    const pcre_uchar* ptr = (const pcre_uchar*)(pattern - 1);
     2446    const pcre_uchar* patternEnd = (const pcre_uchar*)(pattern + patternLength);
     2447   
     2448    while (++ptr < patternEnd)
     2449    {
     2450        int min = 0, max = 0;
     2451        int class_optcount;
     2452        int bracket_length;
     2453        int duplength;
     2454       
     2455        int c = *ptr;
     2456       
     2457        item_count++;    /* Is zero for the first non-comment item */
     2458       
     2459        switch(c)
     2460        {
     2461                /* A backslashed item may be an escaped data character or it may be a
     2462                 character type. */
     2463               
     2464            case '\\':
     2465                c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
     2466                if (errorcode != 0)
     2467                    return -1;;
     2468               
     2469                lastitemlength = 1;     /* Default length of last item for repeats */
     2470               
     2471                if (c >= 0)             /* Data character */
     2472                {
     2473                    length += 2;          /* For a one-byte character */
     2474                   
     2475                    if (c > 127)
     2476                    {
     2477                        int i;
     2478                        for (i = 0; i < _pcre_utf8_table1_size; i++)
     2479                            if (c <= _pcre_utf8_table1[i]) break;
     2480                        length += i;
     2481                        lastitemlength += i;
     2482                    }
     2483                   
     2484                    continue;
     2485                }
     2486               
     2487                /* Other escapes need one byte */
     2488               
     2489                length++;
     2490               
     2491                /* A back reference needs an additional 2 bytes, plus either one or 5
     2492                 bytes for a repeat. We also need to keep the value of the highest
     2493                 back reference. */
     2494               
     2495                if (c <= -ESC_REF)
     2496                {
     2497                    int refnum = -c - ESC_REF;
     2498                    compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
     2499                    if (refnum > compile_block.top_backref)
     2500                        compile_block.top_backref = refnum;
     2501                    length += 2;   /* For single back reference */
     2502                    if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
     2503                    {
     2504                        ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
     2505                        if (errorcode != 0) return -1;;
     2506                        if ((min == 0 && (max == 1 || max == -1)) ||
     2507                            (min == 1 && max == -1))
     2508                            length++;
     2509                        else length += 5;
     2510                        if (ptr[1] == '?') ptr++;
     2511                        ptr++;
     2512                    }
     2513                }
     2514                continue;
     2515               
     2516                case '^':     /* Single-byte metacharacters */
     2517                case '.':
     2518                case '$':
     2519                length++;
     2520                lastitemlength = 1;
     2521                continue;
     2522               
     2523                case '*':            /* These repeats won't be after brackets; */
     2524                case '+':            /* those are handled separately */
     2525                case '?':
     2526                length++;
     2527                goto POSESSIVE;      /* A few lines below */
     2528               
     2529                /* This covers the cases of braced repeats after a single char, metachar,
     2530                 class, or back reference. */
     2531               
     2532                case '{':
     2533                if (!is_counted_repeat(ptr+1, patternEnd)) goto NORMAL_CHAR;
     2534                ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
     2535                if (errorcode != 0) return -1;;
     2536               
     2537                /* These special cases just insert one extra opcode */
     2538               
     2539                if ((min == 0 && (max == 1 || max == -1)) ||
     2540                    (min == 1 && max == -1))
     2541                    length++;
     2542               
     2543                /* These cases might insert additional copies of a preceding character. */
     2544               
     2545                else
     2546                {
     2547                    if (min != 1)
     2548                    {
     2549                        length -= lastitemlength;   /* Uncount the original char or metachar */
     2550                        if (min > 0) length += 3 + lastitemlength;
     2551                    }
     2552                    length += lastitemlength + ((max > 0)? 3 : 1);
     2553                }
     2554               
     2555                if (ptr[1] == '?') ptr++;      /* Needs no extra length */
     2556               
     2557            POSESSIVE:                     /* Test for possessive quantifier */
     2558                if (ptr[1] == '+')
     2559                {
     2560                    ptr++;
     2561                    length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
     2562                }
     2563                continue;
     2564               
     2565                /* An alternation contains an offset to the next branch or ket. If any ims
     2566                 options changed in the previous branch(es), and/or if we are in a
     2567                 lookbehind assertion, extra space will be needed at the start of the
     2568                 branch. This is handled by branch_extra. */
     2569               
     2570                case '|':
     2571                length += 1 + LINK_SIZE + branch_extra;
     2572                continue;
     2573               
     2574                /* A character class uses 33 characters provided that all the character
     2575                 values are less than 256. Otherwise, it uses a bit map for low valued
     2576                 characters, and individual items for others. Don't worry about character
     2577                 types that aren't allowed in classes - they'll get picked up during the
     2578                 compile. A character class that contains only one single-byte character
     2579                 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
     2580                 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
     2581               
     2582                case '[':
     2583                if (*(++ptr) == '^')
     2584                {
     2585                    class_optcount = 10;  /* Greater than one */
     2586                    ptr++;
     2587                }
     2588                else class_optcount = 0;
     2589               
     2590                class_utf8 = false;
     2591               
     2592                for (; ptr < patternEnd && *ptr != ']'; ++ptr)
     2593                {
     2594                    /* Check for escapes */
     2595                   
     2596                    if (*ptr == '\\')
     2597                    {
     2598                        c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
     2599                        if (errorcode != 0) return -1;;
     2600                       
     2601                        /* \b is backspace inside a class; \X is literal */
     2602                       
     2603                        if (-c == ESC_b) c = '\b';
     2604                       
     2605                        /* Handle escapes that turn into characters */
     2606                       
     2607                        if (c >= 0) goto NON_SPECIAL_CHARACTER;
     2608                       
     2609                        /* Escapes that are meta-things. The normal ones just affect the
     2610                         bit map, but Unicode properties require an XCLASS extended item. */
     2611                       
     2612                        else
     2613                        {
     2614                            class_optcount = 10;         /* \d, \s etc; make sure > 1 */
     2615                        }
     2616                    }
     2617                   
     2618                    /* Anything else increments the possible optimization count. We have to
     2619                     detect ranges here so that we can compute the number of extra ranges for
     2620                     caseless wide characters when UCP support is available. If there are wide
     2621                     characters, we are going to have to use an XCLASS, even for single
     2622                     characters. */
     2623                   
     2624                    else
     2625                    {
     2626                        int d;
     2627                       
     2628                        {
     2629                            int extra = 0;
     2630                            GETCHARLENEND(c, ptr, patternEnd, extra);
     2631                            ptr += extra;
     2632                        }
     2633                       
     2634                        /* Come here from handling \ above when it escapes to a char value */
     2635                       
     2636                    NON_SPECIAL_CHARACTER:
     2637                        class_optcount++;
     2638                       
     2639                        d = -1;
     2640                        if (ptr + 1 < patternEnd && ptr[1] == '-')
     2641                        {
     2642                            pcre_uchar const *hyptr = ptr++;
     2643                            if (ptr + 1 < patternEnd && ptr[1] == '\\')
     2644                            {
     2645                                ptr++;
     2646                                d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
     2647                                if (errorcode != 0) return -1;;
     2648                                if (-d == ESC_b) d = '\b';        /* backspace */
     2649                            }
     2650                            else if (ptr + 1 < patternEnd && ptr[1] != ']')
     2651                            {
     2652                                ptr++;
     2653                                {
     2654                                    int extra = 0;
     2655                                    GETCHARLENEND(d, ptr, patternEnd, extra);
     2656                                    ptr += extra;
     2657                                }
     2658                            }
     2659                            if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
     2660                        }
     2661                       
     2662                        /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
     2663                         127 for caseless matching, we will need to use an XCLASS. */
     2664                       
     2665                        if (d >= 0)
     2666                        {
     2667                            class_optcount = 10;     /* Ensure > 1 */
     2668                            if (d < c)
     2669                            {
     2670                                errorcode = ERR8;
     2671                                return -1;;
     2672                            }
     2673                           
     2674                            if ((d > 255 || (ignoreCase && d > 127)))
     2675                            {
     2676                                uschar buffer[6];
     2677                                if (!class_utf8)         /* Allow for XCLASS overhead */
     2678                                {
     2679                                    class_utf8 = true;
     2680                                    length += LINK_SIZE + 2;
     2681                                }
     2682                               
     2683                                /* If we have UCP support, find out how many extra ranges are
     2684                                 needed to map the other case of characters within this range. We
     2685                                 have to mimic the range optimization here, because extending the
     2686                                 range upwards might push d over a boundary that makes is use
     2687                                 another byte in the UTF-8 representation. */
     2688                               
     2689                                if (ignoreCase)
     2690                                {
     2691                                    int occ, ocd;
     2692                                    int cc = c;
     2693                                    int origd = d;
     2694                                    while (get_othercase_range(&cc, origd, &occ, &ocd))
     2695                                    {
     2696                                        if (occ >= c && ocd <= d) continue;   /* Skip embedded */
     2697                                       
     2698                                        if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
     2699                                        {                            /* if there is overlap,   */
     2700                                            c = occ;                     /* noting that if occ < c */
     2701                                            continue;                    /* we can't have ocd > d  */
     2702                                        }                            /* because a subrange is  */
     2703                                        if (ocd > d && occ <= d + 1)   /* always shorter than    */
     2704                                        {                            /* the basic range.       */
     2705                                            d = ocd;
     2706                                            continue;
     2707                                        }
     2708                                       
     2709                                        /* An extra item is needed */
     2710                                       
     2711                                        length += 1 + _pcre_ord2utf8(occ, buffer) +
     2712                                        ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
     2713                                    }
     2714                                }
     2715                               
     2716                                /* The length of the (possibly extended) range */
     2717                               
     2718                                length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
     2719                            }
     2720                           
     2721                        }
     2722                       
     2723                        /* We have a single character. There is nothing to be done unless we
     2724                         are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
     2725                         allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
     2726                         support. */
     2727                       
     2728                        else
     2729                        {
     2730                            if ((c > 255 || (ignoreCase && c > 127)))
     2731                            {
     2732                                uschar buffer[6];
     2733                                class_optcount = 10;     /* Ensure > 1 */
     2734                                if (!class_utf8)         /* Allow for XCLASS overhead */
     2735                                {
     2736                                    class_utf8 = true;
     2737                                    length += LINK_SIZE + 2;
     2738                                }
     2739                                length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
     2740                            }
     2741                        }
     2742                    }
     2743                }
     2744               
     2745                if (ptr >= patternEnd)                          /* Missing terminating ']' */
     2746                {
     2747                    errorcode = ERR6;
     2748                    return -1;;
     2749                }
     2750               
     2751                /* We can optimize when there was only one optimizable character. Repeats
     2752                 for positive and negated single one-byte chars are handled by the general
     2753                 code. Here, we handle repeats for the class opcodes. */
     2754               
     2755                if (class_optcount == 1) length += 3; else
     2756                {
     2757                    length += 33;
     2758                   
     2759                    /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
     2760                     we also need extra for wrapping the whole thing in a sub-pattern. */
     2761                   
     2762                    if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
     2763                    {
     2764                        ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
     2765                        if (errorcode != 0) return -1;;
     2766                        if ((min == 0 && (max == 1 || max == -1)) ||
     2767                            (min == 1 && max == -1))
     2768                            length++;
     2769                        else length += 5;
     2770                        if (ptr + 1 < patternEnd && ptr[1] == '+')
     2771                        {
     2772                            ptr++;
     2773                            length += 2 + 2*LINK_SIZE;
     2774                        }
     2775                        else if (ptr + 1 < patternEnd && ptr[1] == '?') ptr++;
     2776                    }
     2777                }
     2778                continue;
     2779               
     2780                /* Brackets may be genuine groups or special things */
     2781               
     2782                case '(':
     2783                branch_newextra = 0;
     2784                bracket_length = 1 + LINK_SIZE;
     2785                capturing = false;
     2786               
     2787                /* Handle special forms of bracket, which all start (? */
     2788               
     2789                if (ptr + 1 < patternEnd && ptr[1] == '?')
     2790                {
     2791                    switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0))
     2792                    {
     2793                            /* Non-referencing groups and lookaheads just move the pointer on, and
     2794                             then behave like a non-special bracket, except that they don't increment
     2795                             the count of extracting brackets. Ditto for the "once only" bracket,
     2796                             which is in Perl from version 5.005. */
     2797                           
     2798                        case ':':
     2799                        case '=':
     2800                        case '!':
     2801                            ptr += 2;
     2802                            break;
     2803                           
     2804                            /* Else loop checking valid options until ) is met. Anything else is an
     2805                             error. If we are without any brackets, i.e. at top level, the settings
     2806                             act as if specified in the options, so massage the options immediately.
     2807                             This is for backward compatibility with Perl 5.004. */
     2808                           
     2809                        default:
     2810                            errorcode = ERR12;
     2811                            return -1;;
     2812                    }
     2813                }
     2814               
     2815                else capturing = 1;
     2816               
     2817                /* Capturing brackets must be counted so we can process escapes in a
     2818                 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
     2819                 an additional 3 bytes of memory per capturing bracket. */
     2820               
     2821                if (capturing)
     2822                {
     2823                    bracount++;
     2824                    if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
     2825                }
     2826               
     2827                /* Save length for computing whole length at end if there's a repeat that
     2828                 requires duplication of the group. Also save the current value of
     2829                 branch_extra, and start the new group with the new value. If non-zero, this
     2830                 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
     2831               
     2832                if (brastackptr >= sizeof(brastack)/sizeof(int))
     2833                {
     2834                    errorcode = ERR17;
     2835                    return -1;;
     2836                }
     2837               
     2838                bralenstack[brastackptr] = branch_extra;
     2839                branch_extra = branch_newextra;
     2840               
     2841                brastack[brastackptr++] = length;
     2842                length += bracket_length;
     2843                continue;
     2844               
     2845                /* Handle ket. Look for subsequent max/min; for certain sets of values we
     2846                 have to replicate this bracket up to that many times. If brastackptr is
     2847                 0 this is an unmatched bracket which will generate an error, but take care
     2848                 not to try to access brastack[-1] when computing the length and restoring
     2849                 the branch_extra value. */
     2850               
     2851                case ')':
     2852                length += 1 + LINK_SIZE;
     2853                if (brastackptr > 0)
     2854                {
     2855                    duplength = length - brastack[--brastackptr];
     2856                    branch_extra = bralenstack[brastackptr];
     2857                }
     2858                else duplength = 0;
     2859               
     2860                /* Leave ptr at the final char; for read_repeat_counts this happens
     2861                 automatically; for the others we need an increment. */
     2862               
     2863                if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd))
     2864                {
     2865                    ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
     2866                    if (errorcode != 0) return -1;;
     2867                }
     2868                else if (c == '*') { min = 0; max = -1; ptr++; }
     2869                else if (c == '+') { min = 1; max = -1; ptr++; }
     2870                else if (c == '?') { min = 0; max = 1;  ptr++; }
     2871                else { min = 1; max = 1; }
     2872               
     2873                /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
     2874                 group, and if the maximum is greater than zero, we have to replicate
     2875                 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
     2876                 bracket set. */
     2877               
     2878                if (min == 0)
     2879                {
     2880                    length++;
     2881                    if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
     2882                }
     2883               
     2884                /* When the minimum is greater than zero, we have to replicate up to
     2885                 minval-1 times, with no additions required in the copies. Then, if there
     2886                 is a limited maximum we have to replicate up to maxval-1 times allowing
     2887                 for a BRAZERO item before each optional copy and nesting brackets for all
     2888                 but one of the optional copies. */
     2889               
     2890                else
     2891                {
     2892                    length += (min - 1) * duplength;
     2893                    if (max > min)   /* Need this test as max=-1 means no limit */
     2894                        length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
     2895                        - (2 + 2*LINK_SIZE);
     2896                }
     2897               
     2898                /* Allow space for once brackets for "possessive quantifier" */
     2899               
     2900                if (ptr + 1 < patternEnd && ptr[1] == '+')
     2901                {
     2902                    ptr++;
     2903                    length += 2 + 2*LINK_SIZE;
     2904                }
     2905                continue;
     2906               
     2907                /* Non-special character. It won't be space or # in extended mode, so it is
     2908                 always a genuine character. If we are in a \Q...\E sequence, check for the
     2909                 end; if not, we have a literal. */
     2910               
     2911                default:
     2912            NORMAL_CHAR:
     2913               
     2914                length += 2;          /* For a one-byte character */
     2915                lastitemlength = 1;   /* Default length of last item for repeats */
     2916               
     2917                /* In UTF-8 mode, check for additional bytes. */
     2918               
     2919                if (c > 127)
     2920                {
     2921                    if (IS_LEADING_SURROGATE(c))
     2922                    {
     2923                        c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0);
     2924                        ++ptr;
     2925                    }
     2926                   
     2927                    {
     2928                        int i;
     2929                        for (i = 0; i < _pcre_utf8_table1_size; i++)
     2930                            if (c <= _pcre_utf8_table1[i]) break;
     2931                        length += i;
     2932                        lastitemlength += i;
     2933                    }
     2934                }
     2935               
     2936                continue;
     2937        }
     2938    }
     2939   
     2940    length += 2 + LINK_SIZE;    /* For final KET and END */
     2941    return length;
     2942}
     2943
     2944#ifdef DEBUG
     2945static void printCompiledRegExp(real_pcre* re, int length)
     2946{
     2947    printf("Length = %d top_bracket = %d top_backref = %d\n",
     2948           length, re->top_bracket, re->top_backref);
     2949   
     2950    if (re->options) {
     2951        printf("%s%s%s\n",
     2952               ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
     2953               ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
     2954               ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "");
     2955    }
     2956   
     2957    if (re->options & PCRE_FIRSTSET) {
     2958        char ch = re->first_byte & 255;
     2959        const char* caseless = ((re->first_byte & REQ_CASELESS) == 0) ? "" : " (caseless)";
     2960        if (isASCIIAlphanumeric(ch))
     2961            printf("First char = %c%s\n", ch, caseless);
     2962        else
     2963            printf("First char = \\x%02x%s\n", ch, caseless);
     2964    }
     2965   
     2966    if (re->options & PCRE_REQCHSET) {
     2967        char ch = re->req_byte & 255;
     2968        const char* caseless = ((re->req_byte & REQ_CASELESS) == 0) ? "" : " (caseless)";
     2969        if (isASCIIAlphanumeric(ch))
     2970            printf("Req char = %c%s\n", ch, caseless);
     2971        else
     2972            printf("Req char = \\x%02x%s\n", ch, caseless);
     2973    }
     2974   
     2975    // This debugging function has been removed from JavaScriptCore's PCRE
     2976    //pcre_printint(re, stdout);
     2977}
     2978#endif
    24252979
    24262980/*************************************************
     
    24463000*/
    24473001
     3002static pcre* returnError(ErrorCode errorcode, const char** errorptr)
     3003{
     3004    *errorptr = error_text(errorcode);
     3005    return 0;
     3006}
     3007
    24483008pcre *
    24493009jsRegExpCompile(const pcre_char* pattern, int patternLength,
    2450     JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
    2451     unsigned* numSubpatterns, const char** errorptr)
     3010                JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
     3011                unsigned* numSubpatterns, const char** errorptr)
    24523012{
    2453 real_pcre *re;
    2454 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
    2455 int c, firstbyte, reqbyte;
    2456 int bracount = 0;
    2457 int branch_extra = 0;
    2458 int branch_newextra;
    2459 int item_count = -1;
    2460 int name_count = 0;
    2461 int max_name_size = 0;
    2462 int lastitemlength = 0;
    2463 ErrorCode errorcode = ERR0;
    2464 BOOL class_utf8;
    2465 BOOL capturing;
    2466 unsigned int brastackptr = 0;
    2467 size_t size;
    2468 uschar *code;
    2469 const uschar *codestart;
    2470 const pcre_uchar *ptr;
    2471 const pcre_uchar *patternEnd;
    2472 compile_data compile_block;
    2473 int brastack[BRASTACK_SIZE];
    2474 uschar bralenstack[BRASTACK_SIZE];
    2475 
    2476 /* We can't pass back an error message if errorptr is NULL; I guess the best we
    2477 can do is just return NULL, but we can set a code value if there is a code
    2478 pointer. */
    2479 
    2480 if (errorptr == NULL)
    2481   {
    2482   return NULL;
    2483   }
    2484 
    2485 *errorptr = NULL;
    2486 
    2487 /* Set up pointers to the individual character tables */
    2488 
    2489 compile_block.lcc = _pcre_default_tables + lcc_offset;
    2490 compile_block.fcc = _pcre_default_tables + fcc_offset;
    2491 compile_block.cbits = _pcre_default_tables + cbits_offset;
    2492 compile_block.ctypes = _pcre_default_tables + ctypes_offset;
    2493 
    2494 /* Maximum back reference and backref bitmap. This is updated for numeric
    2495 references during the first pass, but for named references during the actual
    2496 compile pass. The bitmap records up to 31 back references to help in deciding
    2497 whether (.*) can be treated as anchored or not. */
    2498 
    2499 compile_block.top_backref = 0;
    2500 compile_block.backref_map = 0;
    2501 
    2502 /* Reflect pattern for debugging output */
    2503 
    2504 DPRINTF(("------------------------------------------------------------------\n"));
    2505 
    2506 /* The first thing to do is to make a pass over the pattern to compute the
    2507 amount of store required to hold the compiled code. This does not have to be
    2508 perfect as long as errors are overestimates. At the same time we can detect any
    2509 flag settings right at the start, and extract them. Make an attempt to correct
    2510 for any counted white space if an "extended" flag setting appears late in the
    2511 pattern. We can't be so clever for #-comments. */
    2512 
    2513 ptr = (const pcre_uchar *)(pattern - 1);
    2514 patternEnd = (const pcre_uchar *)(pattern + patternLength);
    2515 
    2516 while (++ptr < patternEnd)
    2517   {
    2518   int min = 0, max = 0;
    2519   int class_optcount;
    2520   int bracket_length;
    2521   int duplength;
    2522 
    2523   c = *ptr;
    2524  
    2525   item_count++;    /* Is zero for the first non-comment item */
    2526 
    2527   switch(c)
    2528     {
    2529     /* A backslashed item may be an escaped data character or it may be a
    2530     character type. */
    2531 
    2532     case '\\':
    2533     c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
    2534     if (errorcode != 0) goto PCRE_ERROR_RETURN;
    2535 
    2536     lastitemlength = 1;     /* Default length of last item for repeats */
    2537 
    2538     if (c >= 0)             /* Data character */
    2539       {
    2540       length += 2;          /* For a one-byte character */
    2541 
    2542       if (c > 127)
    2543         {
    2544         int i;
    2545         for (i = 0; i < _pcre_utf8_table1_size; i++)
    2546           if (c <= _pcre_utf8_table1[i]) break;
    2547         length += i;
    2548         lastitemlength += i;
    2549         }
    2550 
    2551       continue;
    2552       }
    2553 
    2554     /* Other escapes need one byte */
    2555 
    2556     length++;
    2557 
    2558     /* A back reference needs an additional 2 bytes, plus either one or 5
    2559     bytes for a repeat. We also need to keep the value of the highest
    2560     back reference. */
    2561 
    2562     if (c <= -ESC_REF)
    2563       {
    2564       int refnum = -c - ESC_REF;
    2565       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
    2566       if (refnum > compile_block.top_backref)
    2567         compile_block.top_backref = refnum;
    2568       length += 2;   /* For single back reference */
    2569       if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
    2570         {
    2571         ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
    2572         if (errorcode != 0) goto PCRE_ERROR_RETURN;
    2573         if ((min == 0 && (max == 1 || max == -1)) ||
    2574           (min == 1 && max == -1))
    2575             length++;
    2576         else length += 5;
    2577         if (ptr[1] == '?') ptr++;
    2578             ptr++;
    2579         }
    2580       }
    2581     continue;
    2582 
    2583     case '^':     /* Single-byte metacharacters */
    2584     case '.':
    2585     case '$':
    2586     length++;
    2587     lastitemlength = 1;
    2588     continue;
    2589 
    2590     case '*':            /* These repeats won't be after brackets; */
    2591     case '+':            /* those are handled separately */
    2592     case '?':
    2593     length++;
    2594     goto POSESSIVE;      /* A few lines below */
    2595 
    2596     /* This covers the cases of braced repeats after a single char, metachar,
    2597     class, or back reference. */
    2598 
    2599     case '{':
    2600     if (!is_counted_repeat(ptr+1, patternEnd)) goto NORMAL_CHAR;
    2601     ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
    2602     if (errorcode != 0) goto PCRE_ERROR_RETURN;
    2603 
    2604     /* These special cases just insert one extra opcode */
    2605 
    2606     if ((min == 0 && (max == 1 || max == -1)) ||
    2607       (min == 1 && max == -1))
    2608         length++;
    2609 
    2610     /* These cases might insert additional copies of a preceding character. */
    2611 
    2612     else
    2613       {
    2614       if (min != 1)
    2615         {
    2616         length -= lastitemlength;   /* Uncount the original char or metachar */
    2617         if (min > 0) length += 3 + lastitemlength;
    2618         }
    2619       length += lastitemlength + ((max > 0)? 3 : 1);
    2620       }
    2621 
    2622     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
    2623 
    2624     POSESSIVE:                     /* Test for possessive quantifier */
    2625     if (ptr[1] == '+')
    2626       {
    2627       ptr++;
    2628       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
    2629       }
    2630     continue;
    2631 
    2632     /* An alternation contains an offset to the next branch or ket. If any ims
    2633     options changed in the previous branch(es), and/or if we are in a
    2634     lookbehind assertion, extra space will be needed at the start of the
    2635     branch. This is handled by branch_extra. */
    2636 
    2637     case '|':
    2638     length += 1 + LINK_SIZE + branch_extra;
    2639     continue;
    2640 
    2641     /* A character class uses 33 characters provided that all the character
    2642     values are less than 256. Otherwise, it uses a bit map for low valued
    2643     characters, and individual items for others. Don't worry about character
    2644     types that aren't allowed in classes - they'll get picked up during the
    2645     compile. A character class that contains only one single-byte character
    2646     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
    2647     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
    2648 
    2649     case '[':
    2650     if (*(++ptr) == '^')
    2651       {
    2652       class_optcount = 10;  /* Greater than one */
    2653       ptr++;
    2654       }
    2655     else class_optcount = 0;
    2656 
    2657     class_utf8 = false;
    2658 
    2659     for (; ptr < patternEnd && *ptr != ']'; ++ptr)
    2660       {
    2661       /* Check for escapes */
    2662 
    2663       if (*ptr == '\\')
    2664         {
    2665         c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
    2666         if (errorcode != 0) goto PCRE_ERROR_RETURN;
    2667 
    2668         /* \b is backspace inside a class; \X is literal */
    2669 
    2670         if (-c == ESC_b) c = '\b';
    2671 
    2672         /* Handle escapes that turn into characters */
    2673 
    2674         if (c >= 0) goto NON_SPECIAL_CHARACTER;
    2675 
    2676         /* Escapes that are meta-things. The normal ones just affect the
    2677         bit map, but Unicode properties require an XCLASS extended item. */
    2678 
    2679         else
    2680           {
    2681           class_optcount = 10;         /* \d, \s etc; make sure > 1 */
    2682           }
    2683         }
    2684 
    2685       /* Anything else increments the possible optimization count. We have to
    2686       detect ranges here so that we can compute the number of extra ranges for
    2687       caseless wide characters when UCP support is available. If there are wide
    2688       characters, we are going to have to use an XCLASS, even for single
    2689       characters. */
    2690 
    2691       else
    2692         {
    2693         int d;
    2694 
    2695           {
    2696           int extra = 0;
    2697           GETCHARLENEND(c, ptr, patternEnd, extra);
    2698           ptr += extra;
    2699           }
    2700 
    2701         /* Come here from handling \ above when it escapes to a char value */
    2702 
    2703         NON_SPECIAL_CHARACTER:
    2704         class_optcount++;
    2705 
    2706         d = -1;
    2707         if (ptr + 1 < patternEnd && ptr[1] == '-')
    2708           {
    2709           pcre_uchar const *hyptr = ptr++;
    2710           if (ptr + 1 < patternEnd && ptr[1] == '\\')
    2711             {
    2712             ptr++;
    2713             d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
    2714             if (errorcode != 0) goto PCRE_ERROR_RETURN;
    2715             if (-d == ESC_b) d = '\b';        /* backspace */
     3013    /* We can't pass back an error message if errorptr is NULL; the best we can
     3014     do is just return NULL, since this function has no other channel for
     3015     reporting an error. */
     3016    if (!errorptr)
     3017        return 0;
     3018   
     3019    *errorptr = NULL;
     3020   
     3021    /* Set up pointers to the individual character tables */
     3022   
     3023    compile_data compile_block;
     3024   
     3025    ErrorCode errorcode = ERR0;
     3026    int length = calculateCompiledPatternLengthAndFlags(pattern, patternLength, ignoreCase, compile_block, errorcode);
     3027    if (errorcode)
     3028        return returnError(errorcode, errorptr);
     3029   
     3030    if (length > MAX_PATTERN_SIZE)
     3031        return returnError(ERR16, errorptr);
     3032   
     3033    /* Compute the size of data block needed and get it. */
     3034   
     3035    size_t size = length + sizeof(real_pcre);
     3036    real_pcre* re = reinterpret_cast<real_pcre*>(new char[size]);
     3037   
     3038    if (!re)
     3039        return returnError(ERR13, errorptr);
     3040   
     3041    /* Put in the magic number, and save the sizes, options, and character table
     3042     pointer. NULL is used for the default character tables. The nullpad field is at
     3043     the end; it's there to help in the case when a regex compiled on a system with
     3044     4-byte pointers is run on another with 8-byte pointers. */
     3045   
     3046    re->size = (pcre_uint32)size;
     3047    re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0);
     3048   
     3049    /* The starting points of the name/number translation table and of the code are
     3050     passed around in the compile data block. */
     3051   
     3052    const uschar* codestart = (const uschar*)(re + 1);
     3053    compile_block.start_code = codestart;
     3054    compile_block.start_pattern = (const pcre_uchar*)pattern;
     3055   
     3056    /* Set up a starting, non-extracting bracket, then compile the expression. On
     3057     error, errorcode will be set non-zero, so we don't need to look at the result
     3058     of the function here. */
     3059   
     3060    const pcre_uchar* ptr = (const pcre_uchar*)pattern;
     3061    const pcre_uchar* patternEnd = pattern + patternLength;
     3062    uschar* code = (uschar*)codestart;
     3063    *code = OP_BRA;
     3064    int firstbyte, reqbyte;
     3065    int bracketCount = 0;
     3066    (void)compile_regex(re->options, &bracketCount, &code, &ptr,
     3067                        patternEnd,
     3068                        &errorcode, 0, &firstbyte, &reqbyte, &compile_block);
     3069    re->top_bracket = bracketCount;
     3070    re->top_backref = compile_block.top_backref;
     3071   
     3072    /* If not reached end of pattern on success, there's an excess bracket. */
     3073   
     3074    if (errorcode == 0 && ptr < patternEnd)
     3075        errorcode = ERR10;
     3076   
     3077    /* Fill in the terminating state and check for disastrous overflow, but
     3078     if debugging, leave the test till after things are printed out. */
     3079   
     3080    *code++ = OP_END;
     3081   
     3082#ifndef DEBUG
     3083    if (code - codestart > length)
     3084        errorcode = ERR7;
     3085#endif
     3086   
     3087    /* Give an error if there's back reference to a non-existent capturing
     3088     subpattern. */
     3089   
     3090    if (re->top_backref > re->top_bracket)
     3091        errorcode = ERR15;
     3092   
     3093    /* Failed to compile, or error while post-processing */
     3094   
     3095    if (errorcode != ERR0) {
     3096        delete [] reinterpret_cast<char*>(re);
     3097        return returnError(errorcode, errorptr);
     3098    }
     3099   
     3100    /* If the anchored option was not passed, set the flag if we can determine that
     3101     the pattern is anchored by virtue of ^ characters or \A or anything else (such
     3102     as starting with .* when DOTALL is set).
     3103     
     3104     Otherwise, if we know what the first character has to be, save it, because that
     3105     speeds up unanchored matches no end. If not, see if we can set the
     3106     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
     3107     start with ^. and also when all branches start with .* for non-DOTALL matches.
     3108     */
     3109   
     3110    if (is_anchored(codestart, re->options, 0, compile_block.backref_map))
     3111        re->options |= PCRE_ANCHORED;
     3112    else {
     3113        if (firstbyte < 0)
     3114            firstbyte = find_firstassertedchar(codestart, re->options, false);
     3115        if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
     3116        {
     3117            int ch = firstbyte & 255;
     3118            if (ch < 127) {
     3119                re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
     3120                                  compile_block.fcc[ch] == ch)? ch : firstbyte;
     3121                re->options |= PCRE_FIRSTSET;
    27163122            }
    2717           else if (ptr + 1 < patternEnd && ptr[1] != ']')
    2718             {
    2719             ptr++;
    2720               {
    2721               int extra = 0;
    2722               GETCHARLENEND(d, ptr, patternEnd, extra);
    2723               ptr += extra;
    2724               }
    2725             }
    2726           if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
    2727           }
    2728 
    2729         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
    2730         127 for caseless matching, we will need to use an XCLASS. */
    2731 
    2732         if (d >= 0)
    2733           {
    2734           class_optcount = 10;     /* Ensure > 1 */
    2735           if (d < c)
    2736             {
    2737             errorcode = ERR8;
    2738             goto PCRE_ERROR_RETURN;
    2739             }
    2740 
    2741           if ((d > 255 || (ignoreCase && d > 127)))
    2742             {
    2743             uschar buffer[6];
    2744             if (!class_utf8)         /* Allow for XCLASS overhead */
    2745               {
    2746               class_utf8 = true;
    2747               length += LINK_SIZE + 2;
    2748               }
    2749 
    2750             /* If we have UCP support, find out how many extra ranges are
    2751             needed to map the other case of characters within this range. We
    2752             have to mimic the range optimization here, because extending the
    2753             range upwards might push d over a boundary that makes it use
    2754             another byte in the UTF-8 representation. */
    2755 
    2756             if (ignoreCase)
    2757               {
    2758               int occ, ocd;
    2759               int cc = c;
    2760               int origd = d;
    2761               while (get_othercase_range(&cc, origd, &occ, &ocd))
    2762                 {
    2763                 if (occ >= c && ocd <= d) continue;   /* Skip embedded */
    2764 
    2765                 if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
    2766                   {                            /* if there is overlap,   */
    2767                   c = occ;                     /* noting that if occ < c */
    2768                   continue;                    /* we can't have ocd > d  */
    2769                   }                            /* because a subrange is  */
    2770                 if (ocd > d && occ <= d + 1)   /* always shorter than    */
    2771                   {                            /* the basic range.       */
    2772                   d = ocd;
    2773                   continue;
    2774                   }
    2775 
    2776                 /* An extra item is needed */
    2777 
    2778                 length += 1 + _pcre_ord2utf8(occ, buffer) +
    2779                   ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
    2780                 }
    2781               }
    2782 
    2783             /* The length of the (possibly extended) range */
    2784 
    2785             length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
    2786             }
    2787 
    2788           }
    2789 
    2790         /* We have a single character. There is nothing to be done unless we
    2791         are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
    2792         allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
    2793         support. */
    2794 
    2795         else
    2796           {
    2797           if ((c > 255 || (ignoreCase && c > 127)))
    2798             {
    2799             uschar buffer[6];
    2800             class_optcount = 10;     /* Ensure > 1 */
    2801             if (!class_utf8)         /* Allow for XCLASS overhead */
    2802               {
    2803               class_utf8 = true;
    2804               length += LINK_SIZE + 2;
    2805               }
    2806             length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
    2807             }
    2808           }
    2809         }
    2810       }
    2811 
    2812     if (ptr >= patternEnd)                          /* Missing terminating ']' */
    2813       {
    2814       errorcode = ERR6;
    2815       goto PCRE_ERROR_RETURN;
    2816       }
    2817 
    2818     /* We can optimize when there was only one optimizable character. Repeats
    2819     for positive and negated single one-byte chars are handled by the general
    2820     code. Here, we handle repeats for the class opcodes. */
    2821 
    2822     if (class_optcount == 1) length += 3; else
    2823       {
    2824       length += 33;
    2825 
    2826       /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
    2827       we also need extra for wrapping the whole thing in a sub-pattern. */
    2828 
    2829       if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
    2830         {
    2831         ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
    2832         if (errorcode != 0) goto PCRE_ERROR_RETURN;
    2833         if ((min == 0 && (max == 1 || max == -1)) ||
    2834           (min == 1 && max == -1))
    2835             length++;
    2836         else length += 5;
    2837         if (ptr + 1 < patternEnd && ptr[1] == '+')
    2838           {
    2839           ptr++;
    2840           length += 2 + 2*LINK_SIZE;
    2841           }
    2842         else if (ptr + 1 < patternEnd && ptr[1] == '?') ptr++;
    2843         }
    2844       }
    2845     continue;
    2846 
    2847     /* Brackets may be genuine groups or special things */
    2848 
    2849     case '(':
    2850     branch_newextra = 0;
    2851     bracket_length = 1 + LINK_SIZE;
    2852     capturing = false;
    2853 
    2854     /* Handle special forms of bracket, which all start (? */
    2855 
    2856     if (ptr + 1 < patternEnd && ptr[1] == '?')
    2857       {
    2858       switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0))
    2859         {
    2860         /* Non-referencing groups and lookaheads just move the pointer on, and
    2861         then behave like a non-special bracket, except that they don't increment
    2862         the count of extracting brackets. Ditto for the "once only" bracket,
    2863         which is in Perl from version 5.005. */
    2864 
    2865         case ':':
    2866         case '=':
    2867         case '!':
    2868         ptr += 2;
    2869         break;
    2870 
    2871         /* Else loop checking valid options until ) is met. Anything else is an
    2872         error. If we are without any brackets, i.e. at top level, the settings
    2873         act as if specified in the options, so massage the options immediately.
    2874         This is for backward compatibility with Perl 5.004. */
    2875 
    2876         default:
    2877         errorcode = ERR12;
    2878         goto PCRE_ERROR_RETURN;
    2879         }
    2880       }
    2881 
    2882     else capturing = 1;
    2883 
    2884     /* Capturing brackets must be counted so we can process escapes in a
    2885     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
    2886     an additional 3 bytes of memory per capturing bracket. */
    2887 
    2888     if (capturing)
    2889       {
    2890       bracount++;
    2891       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
    2892       }
    2893 
    2894     /* Save length for computing whole length at end if there's a repeat that
    2895     requires duplication of the group. Also save the current value of
    2896     branch_extra, and start the new group with the new value. If non-zero, this
    2897     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
    2898 
    2899     if (brastackptr >= sizeof(brastack)/sizeof(int))
    2900       {
    2901       errorcode = ERR17;
    2902       goto PCRE_ERROR_RETURN;
    2903       }
    2904 
    2905     bralenstack[brastackptr] = branch_extra;
    2906     branch_extra = branch_newextra;
    2907 
    2908     brastack[brastackptr++] = length;
    2909     length += bracket_length;
    2910     continue;
    2911 
    2912     /* Handle ket. Look for subsequent max/min; for certain sets of values we
    2913     have to replicate this bracket up to that many times. If brastackptr is
    2914     0 this is an unmatched bracket which will generate an error, but take care
    2915     not to try to access brastack[-1] when computing the length and restoring
    2916     the branch_extra value. */
    2917 
    2918     case ')':
    2919     length += 1 + LINK_SIZE;
    2920     if (brastackptr > 0)
    2921       {
    2922       duplength = length - brastack[--brastackptr];
    2923       branch_extra = bralenstack[brastackptr];
    2924       }
    2925     else duplength = 0;
    2926 
    2927     /* Leave ptr at the final char; for read_repeat_counts this happens
    2928     automatically; for the others we need an increment. */
    2929 
    2930     if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd))
    2931       {
    2932       ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
    2933       if (errorcode != 0) goto PCRE_ERROR_RETURN;
    2934       }
    2935     else if (c == '*') { min = 0; max = -1; ptr++; }
    2936     else if (c == '+') { min = 1; max = -1; ptr++; }
    2937     else if (c == '?') { min = 0; max = 1;  ptr++; }
    2938     else { min = 1; max = 1; }
    2939 
    2940     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
    2941     group, and if the maximum is greater than zero, we have to replicate
    2942     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
    2943     bracket set. */
    2944 
    2945     if (min == 0)
    2946       {
    2947       length++;
    2948       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
    2949       }
    2950 
    2951     /* When the minimum is greater than zero, we have to replicate up to
    2952     minval-1 times, with no additions required in the copies. Then, if there
    2953     is a limited maximum we have to replicate up to maxval-1 times allowing
    2954     for a BRAZERO item before each optional copy and nesting brackets for all
    2955     but one of the optional copies. */
    2956 
    2957     else
    2958       {
    2959       length += (min - 1) * duplength;
    2960       if (max > min)   /* Need this test as max=-1 means no limit */
    2961         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
    2962           - (2 + 2*LINK_SIZE);
    2963       }
    2964 
    2965     /* Allow space for once brackets for "possessive quantifier" */
    2966 
    2967     if (ptr + 1 < patternEnd && ptr[1] == '+')
    2968       {
    2969       ptr++;
    2970       length += 2 + 2*LINK_SIZE;
    2971       }
    2972     continue;
    2973 
    2974     /* Non-special character. It won't be space or # in extended mode, so it is
    2975     always a genuine character. If we are in a \Q...\E sequence, check for the
    2976     end; if not, we have a literal. */
    2977 
    2978     default:
    2979     NORMAL_CHAR:
    2980 
    2981     length += 2;          /* For a one-byte character */
    2982     lastitemlength = 1;   /* Default length of last item for repeats */
    2983 
    2984     /* In UTF-8 mode, check for additional bytes. */
    2985 
    2986     if (c > 127)
    2987       {
    2988         if (IS_LEADING_SURROGATE(c))
    2989           {
    2990           c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0);
    2991           ++ptr;
    2992           }
    2993 
    2994         {
    2995           int i;
    2996           for (i = 0; i < _pcre_utf8_table1_size; i++)
    2997             if (c <= _pcre_utf8_table1[i]) break;
    2998           length += i;
    2999           lastitemlength += i;
    3000         }
    3001       }
    3002 
    3003     continue;
     3123        }
     3124        else if (is_startline(codestart, 0, compile_block.backref_map))
     3125            re->options |= PCRE_STARTLINE;
    30043126    }
    3005   }
    3006 
    3007 length += 2 + LINK_SIZE;    /* For final KET and END */
    3008 
    3009 if (length > MAX_PATTERN_SIZE)
    3010   {
    3011   errorcode = ERR16;
    3012   goto PCRE_ERROR_RETURN;
    3013   }
    3014 
    3015 /* Compute the size of data block needed and get it. */
    3016 
    3017 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
    3018 re = reinterpret_cast<real_pcre*>(new char[size]);
    3019 
    3020 if (re == NULL)
    3021   {
    3022   errorcode = ERR13;
    3023   goto PCRE_ERROR_RETURN;
    3024   }
    3025 
    3026 /* Put in the magic number, and save the sizes, options, and character table
    3027 pointer. NULL is used for the default character tables. The nullpad field is at
    3028 the end; it's there to help in the case when a regex compiled on a system with
    3029 4-byte pointers is run on another with 8-byte pointers. */
    3030 
    3031 re->size = (pcre_uint32)size;
    3032 re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0);
    3033 
    3034 /* The starting points of the name/number translation table and of the code are
    3035 passed around in the compile data block. */
    3036 
    3037 codestart = (const uschar *)(re + 1);
    3038 compile_block.start_code = codestart;
    3039 compile_block.start_pattern = (const pcre_uchar *)pattern;
    3040 compile_block.req_varyopt = 0;
    3041 
    3042 /* Set up a starting, non-extracting bracket, then compile the expression. On
    3043 error, errorcode will be set non-zero, so we don't need to look at the result
    3044 of the function here. */
    3045 
    3046 ptr = (const pcre_uchar *)pattern;
    3047 code = (uschar *)codestart;
    3048 *code = OP_BRA;
    3049 bracount = 0;
    3050 (void)compile_regex(re->options, &bracount, &code, &ptr,
    3051   patternEnd,
    3052   &errorcode, 0, &firstbyte, &reqbyte, &compile_block);
    3053 re->top_bracket = bracount;
    3054 re->top_backref = compile_block.top_backref;
    3055 
    3056 /* If not reached end of pattern on success, there's an excess bracket. */
    3057 
    3058 if (errorcode == 0 && ptr < patternEnd) errorcode = ERR10;
    3059 
    3060 /* Fill in the terminating state and check for disastrous overflow, but
    3061 if debugging, leave the test till after things are printed out. */
    3062 
    3063 *code++ = OP_END;
    3064 
    3065 #ifndef DEBUG
    3066 if (code - codestart > length) errorcode = ERR7;
     3127   
     3128    /* For an anchored pattern, we use the "required byte" only if it follows a
     3129     variable length item in the regex. Remove the caseless flag for non-caseable
     3130     bytes. */
     3131   
     3132    if (reqbyte >= 0 && (!(re->options & PCRE_ANCHORED) || (reqbyte & REQ_VARY))) {
     3133        int ch = reqbyte & 255;
     3134        if (ch < 127) {
     3135            re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
     3136                            compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
     3137            re->options |= PCRE_REQCHSET;
     3138        }
     3139    }
     3140   
     3141#ifdef DEBUG
     3142    printCompiledRegExp(re);
     3143   
     3144    /* This check is done here in the debugging case so that the code that
     3145     was compiled can be seen. */
     3146    if (code - codestart > length) {
     3147        (pcre_free)(re);
     3148        *errorptr = error_text(ERR7);
     3149        return NULL;
     3150    }
     3151   
    30673152#endif
    3068 
    3069 /* Give an error if there's back reference to a non-existent capturing
    3070 subpattern. */
    3071 
    3072 if (re->top_backref > re->top_bracket) errorcode = ERR15;
    3073 
    3074 /* Failed to compile, or error while post-processing */
    3075 
    3076 if (errorcode != ERR0)
    3077   {
    3078   delete [] reinterpret_cast<char*>(re);
    3079   PCRE_ERROR_RETURN:
    3080   *errorptr = error_text(errorcode);
    3081   return NULL;
    3082   }
    3083 
    3084 /* If the anchored option was not passed, set the flag if we can determine that
    3085 the pattern is anchored by virtue of ^ characters or \A or anything else (such
    3086 as starting with .* when DOTALL is set).
    3087 
    3088 Otherwise, if we know what the first character has to be, save it, because that
    3089 speeds up unanchored matches no end. If not, see if we can set the
    3090 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
    3091 start with ^. and also when all branches start with .* for non-DOTALL matches.
    3092 */
    3093 
    3094   {
    3095   if (is_anchored(codestart, re->options, 0, compile_block.backref_map))
    3096     re->options |= PCRE_ANCHORED;
    3097   else
    3098     {
    3099     if (firstbyte < 0)
    3100       firstbyte = find_firstassertedchar(codestart, re->options, false);
    3101     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
    3102       {
    3103       int ch = firstbyte & 255;
    3104       if (ch < 127)
    3105       { /* Strange indentation to aid in merging. */
    3106       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
    3107          compile_block.fcc[ch] == ch)? ch : firstbyte;
    3108       re->options |= PCRE_FIRSTSET;
    3109       }
    3110       }
    3111     else if (is_startline(codestart, 0, compile_block.backref_map))
    3112       re->options |= PCRE_STARTLINE;
    3113     }
    3114   }
    3115 
    3116 /* For an anchored pattern, we use the "required byte" only if it follows a
    3117 variable length item in the regex. Remove the caseless flag for non-caseable
    3118 bytes. */
    3119 
    3120 if (reqbyte >= 0 &&
    3121      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
    3122   {
    3123   int ch = reqbyte & 255;
    3124   if (ch < 127)
    3125   { /* Strange indentation to aid in merging. */
    3126   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
    3127       compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
    3128   re->options |= PCRE_REQCHSET;
    3129   }
    3130   }
    3131 
    3132 /* Print out the compiled data if debugging is enabled. This is never the
    3133 case when building a production library. */
    3134 
    3135 #ifdef DEBUG
    3136 
    3137 printf("Length = %d top_bracket = %d top_backref = %d\n",
    3138   length, re->top_bracket, re->top_backref);
    3139 
    3140 if (re->options != 0)
    3141   {
    3142   printf("%s%s%s\n",
    3143     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
    3144     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
    3145     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "");
    3146   }
    3147 
    3148 if ((re->options & PCRE_FIRSTSET) != 0)
    3149   {
    3150   int ch = re->first_byte & 255;
    3151   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
    3152     "" : " (caseless)";
    3153   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
    3154     else printf("First char = \\x%02x%s\n", ch, caseless);
    3155   }
    3156 
    3157 if ((re->options & PCRE_REQCHSET) != 0)
    3158   {
    3159   int ch = re->req_byte & 255;
    3160   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
    3161     "" : " (caseless)";
    3162   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
    3163     else printf("Req char = \\x%02x%s\n", ch, caseless);
    3164   }
    3165 
    3166 pcre_printint(re, stdout);
    3167 
    3168 /* This check is done here in the debugging case so that the code that
    3169 was compiled can be seen. */
    3170 
    3171 if (code - codestart > length)
    3172   {
    3173   (pcre_free)(re);
    3174   *errorptr = error_text(ERR7);
    3175   return NULL;
    3176   }
    3177 
    3178 #endif
    3179 
    3180 if (numSubpatterns)
    3181     *numSubpatterns = re->top_bracket;
    3182 return (pcre *)re;
     3153   
     3154    if (numSubpatterns)
     3155        *numSubpatterns = re->top_bracket;
     3156    return (pcre *)re;
    31833157}
    31843158
Note: See TracChangeset for help on using the changeset viewer.