Ignore:
Timestamp:
Nov 29, 2007, 3:04:06 AM (18 years ago)
Author:
[email protected]
Message:

2007-11-24 Eric Seidel <[email protected]>

Reviewed by Maciej.

Clean up jsRegExpExecute

  • pcre/pcre_compile.cpp: (returnError): (jsRegExpCompile):
  • pcre/pcre_exec.cpp: (jsRegExpExecute):
  • pcre/pcre_internal.h:
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/pcre/pcre_exec.cpp

    r27947 r28138  
    20262026
    20272027Arguments:
    2028   argument_re     points to the compiled expression
     2028  re              points to the compiled expression
    20292029  extra_data      points to extra data or is NULL
    20302030  subject         points to the subject string
     
    20412041*/
    20422042
    2043 int
    2044 jsRegExpExecute(const pcre *argument_re,
    2045   const UChar* subject, int length, int start_offset, int *offsets,
    2046   int offsetcount)
     2043int jsRegExpExecute(const JSRegExp* re,
     2044                    const UChar* subject, int length, int start_offset, int* offsets,
     2045                    int offsetcount)
    20472046{
    2048 int rc, resetcount, ocount;
    2049 int first_byte = -1;
    2050 int req_byte = -1;
    2051 int req_byte2 = -1;
    2052 BOOL using_temporary_offsets = false;
    2053 BOOL first_byte_caseless = false;
    2054 BOOL startline;
    2055 BOOL req_byte_caseless = false;
    2056 match_data match_block;
    2057 USPTR start_match = (USPTR)subject + start_offset;
    2058 USPTR end_subject;
    2059 USPTR req_byte_ptr = start_match - 1;
    2060 const uschar *start_code;
    2061 
    2062 const real_pcre *external_re = (const real_pcre *)argument_re;
    2063 const real_pcre *re = external_re;
    2064 
    2065 /* Plausibility checks */
    2066 
    2067 ASSERT(re);
    2068 ASSERT(subject);
    2069 ASSERT(offsetcount >= 0);
    2070 ASSERT(offsets || offsetcount == 0);
    2071 
    2072 /* Set up other data */
    2073 
    2074 startline = (re->options & PCRE_STARTLINE) != 0;
    2075 
    2076 /* The code starts after the real_pcre block and the capture name table. */
    2077 
    2078 start_code = (const uschar *)(external_re + 1);
    2079 
    2080 match_block.start_subject = (USPTR)subject;
    2081 match_block.end_subject = match_block.start_subject + length;
    2082 end_subject = match_block.end_subject;
    2083 
    2084 match_block.lcc = _pcre_default_tables + lcc_offset;
    2085 match_block.ctypes = _pcre_default_tables + ctypes_offset;
    2086 
    2087 match_block.multiline = (re->options & PCRE_MULTILINE) != 0;
    2088 match_block.caseless = (re->options & PCRE_CASELESS) != 0;
    2089 
    2090 /* If the expression has got more back references than the offsets supplied can
    2091 hold, we get a temporary chunk of working store to use during the matching.
    2092 Otherwise, we can use the vector supplied, rounding down its size to a multiple
    2093 of 3. */
    2094 
    2095 ocount = offsetcount - (offsetcount % 3);
    2096 
    2097 if (re->top_backref > 0 && re->top_backref >= ocount/3)
    2098   {
    2099   ocount = re->top_backref * 3 + 3;
    2100   match_block.offset_vector = new int[ocount];
    2101   if (match_block.offset_vector == NULL) return JSRegExpErrorNoMemory;
    2102   using_temporary_offsets = true;
    2103   DPRINTF(("Got memory to hold back references\n"));
    2104   }
    2105 else match_block.offset_vector = offsets;
    2106 
    2107 match_block.offset_end = ocount;
    2108 match_block.offset_max = (2*ocount)/3;
    2109 match_block.offset_overflow = false;
    2110 
    2111 /* Compute the minimum number of offsets that we need to reset each time. Doing
    2112 this makes a huge difference to execution time when there aren't many brackets
    2113 in the pattern. */
    2114 
    2115 resetcount = 2 + re->top_bracket * 2;
    2116 if (resetcount > offsetcount) resetcount = ocount;
    2117 
    2118 /* Reset the working variable associated with each extraction. These should
    2119 never be used unless previously set, but they get saved and restored, and so we
    2120 initialize them to avoid reading uninitialized locations. */
    2121 
    2122 if (match_block.offset_vector != NULL)
    2123   {
    2124   register int *iptr = match_block.offset_vector + ocount;
    2125   register int *iend = iptr - resetcount/2 + 1;
    2126   while (--iptr >= iend) *iptr = -1;
    2127   }
    2128 
    2129 /* Set up the first character to match, if available. The first_byte value is
    2130 never set for an anchored regular expression, but the anchoring may be forced
    2131 at run time, so we have to test for anchoring. The first char may be unset for
    2132 an unanchored pattern, of course. If there's no first char and the pattern was
    2133 studied, there may be a bitmap of possible first characters. */
    2134 
    2135   if ((re->options & PCRE_FIRSTSET) != 0)
    2136     {
    2137     first_byte = re->first_byte & 255;
    2138     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == true)
    2139       first_byte = match_block.lcc[first_byte];
     2047    ASSERT(re);
     2048    ASSERT(subject);
     2049    ASSERT(offsetcount >= 0);
     2050    ASSERT(offsets || offsetcount == 0);
     2051   
     2052    match_data match_block;
     2053    match_block.start_subject = (USPTR)subject;
     2054    match_block.end_subject = match_block.start_subject + length;
     2055    USPTR end_subject = match_block.end_subject;
     2056   
     2057    match_block.lcc = _pcre_default_tables + lcc_offset;
     2058    match_block.ctypes = _pcre_default_tables + ctypes_offset;
     2059   
     2060    match_block.multiline = (re->options & PCRE_MULTILINE) != 0;
     2061    match_block.caseless = (re->options & PCRE_CASELESS) != 0;
     2062   
     2063    /* If the expression has got more back references than the offsets supplied can
     2064     hold, we get a temporary chunk of working store to use during the matching.
     2065     Otherwise, we can use the vector supplied, rounding down its size to a multiple
     2066     of 3. */
     2067   
     2068    int ocount = offsetcount - (offsetcount % 3);
     2069   
     2070    BOOL using_temporary_offsets = false;
     2071    if (re->top_backref > 0 && re->top_backref >= ocount/3) {
     2072        ocount = re->top_backref * 3 + 3;
     2073        match_block.offset_vector = new int[ocount];
     2074        if (!match_block.offset_vector)
     2075            return JSRegExpErrorNoMemory;
     2076        using_temporary_offsets = true;
     2077    } else
     2078        match_block.offset_vector = offsets;
     2079   
     2080    match_block.offset_end = ocount;
     2081    match_block.offset_max = (2*ocount)/3;
     2082    match_block.offset_overflow = false;
     2083   
     2084    /* Compute the minimum number of offsets that we need to reset each time. Doing
     2085     this makes a huge difference to execution time when there aren't many brackets
     2086     in the pattern. */
     2087   
     2088    int resetcount = 2 + re->top_bracket * 2;
     2089    if (resetcount > offsetcount)
     2090        resetcount = ocount;
     2091   
     2092    /* Reset the working variable associated with each extraction. These should
     2093     never be used unless previously set, but they get saved and restored, and so we
     2094     initialize them to avoid reading uninitialized locations. */
     2095   
     2096    if (match_block.offset_vector) {
     2097        int* iptr = match_block.offset_vector + ocount;
     2098        int* iend = iptr - resetcount/2 + 1;
     2099        while (--iptr >= iend)
     2100            *iptr = -1;
    21402101    }
    2141 
    2142 /* For anchored or unanchored matches, there may be a "last known required
    2143 character" set. */
    2144 
    2145 if ((re->options & PCRE_REQCHSET) != 0)
    2146   {
    2147   req_byte = re->req_byte & 255;
    2148   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
    2149   req_byte2 = (_pcre_default_tables + fcc_offset)[req_byte];  /* case flipped */
    2150   }
    2151 
    2152 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
    2153 the loop runs just once. */
    2154 
    2155 do
    2156   {
    2157   USPTR save_end_subject = end_subject;
    2158 
    2159   /* Reset the maximum number of extractions we might see. */
    2160 
    2161   if (match_block.offset_vector != NULL)
    2162     {
    2163     register int *iptr = match_block.offset_vector;
    2164     register int *iend = iptr + resetcount;
    2165     while (iptr < iend) *iptr++ = -1;
     2102   
     2103    /* Set up the first character to match, if available. The first_byte value is
     2104     never set for an anchored regular expression, but the anchoring may be forced
     2105     at run time, so we have to test for anchoring. The first char may be unset for
     2106     an unanchored pattern, of course. If there's no first char and the pattern was
     2107     studied, there may be a bitmap of possible first characters. */
     2108   
     2109    BOOL first_byte_caseless = false;
     2110    int first_byte = -1;
     2111    if (re->options & PCRE_FIRSTSET) {
     2112        first_byte = re->first_byte & 255;
     2113        if ((first_byte_caseless = (re->first_byte & REQ_CASELESS)))
     2114            first_byte = match_block.lcc[first_byte];
    21662115    }
    2167 
    2168   /* Advance to a unique first char if possible. If firstline is true, the
    2169   start of the match is constrained to the first line of a multiline string.
    2170   Implement this by temporarily adjusting end_subject so that we stop scanning
    2171   at a newline. If the match fails at the newline, later code breaks this loop.
    2172   */
    2173 
    2174   /* Now test for a unique first byte */
    2175 
    2176   if (first_byte >= 0)
    2177     {
    2178     pcre_uchar first_char = first_byte;
    2179     if (first_byte_caseless)
    2180       while (start_match < end_subject)
    2181         {
    2182         int sm = *start_match;
    2183         if (sm > 127)
    2184           break;
    2185         if (match_block.lcc[sm] == first_char)
    2186           break;
    2187         start_match++;
     2116   
     2117    /* For anchored or unanchored matches, there may be a "last known required
     2118     character" set. */
     2119   
     2120    BOOL req_byte_caseless = false;
     2121    int req_byte = -1;
     2122    int req_byte2 = -1;
     2123    if (re->options & PCRE_REQCHSET) {
     2124        req_byte = re->req_byte & 255;
     2125        req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
     2126        req_byte2 = (_pcre_default_tables + fcc_offset)[req_byte];  /* case flipped */
     2127    }
     2128   
     2129    /* Loop for handling unanchored repeated matching attempts; for anchored regexs
     2130     the loop runs just once. */
     2131   
     2132    USPTR start_match = (USPTR)subject + start_offset;
     2133    USPTR req_byte_ptr = start_match - 1;
     2134   
     2135    do {
     2136        USPTR save_end_subject = end_subject;
     2137       
     2138        /* Reset the maximum number of extractions we might see. */
     2139       
     2140        if (match_block.offset_vector != NULL) {
     2141            int* iptr = match_block.offset_vector;
     2142            int* iend = iptr + resetcount;
     2143            while (iptr < iend)
     2144                *iptr++ = -1;
    21882145        }
    2189     else
    2190       while (start_match < end_subject && *start_match != first_char)
    2191         start_match++;
     2146       
     2147        /* Advance to a unique first char if possible. If firstline is true, the
     2148         start of the match is constrained to the first line of a multiline string.
     2149         Implement this by temporarily adjusting end_subject so that we stop scanning
     2150         at a newline. If the match fails at the newline, later code breaks this loop.
     2151         */
     2152       
     2153        /* Now test for a unique first byte */
     2154       
     2155        if (first_byte >= 0) {
     2156            pcre_uchar first_char = first_byte;
     2157            if (first_byte_caseless)
     2158                while (start_match < end_subject) {
     2159                    int sm = *start_match;
     2160                    if (sm > 127)
     2161                        break;
     2162                    if (match_block.lcc[sm] == first_char)
     2163                        break;
     2164                    start_match++;
     2165                }
     2166            else
     2167                while (start_match < end_subject && *start_match != first_char)
     2168                    start_match++;
     2169        }
     2170       
     2171        /* Or to just after \n for a multiline match if possible */
     2172       
     2173        else if (re->options & PCRE_STARTLINE) {
     2174            if (start_match > match_block.start_subject + start_offset) {
     2175                while (start_match < end_subject && !isNewline(start_match[-1]))
     2176                    start_match++;
     2177            }
     2178        }
     2179       
     2180        /* Restore fudged end_subject */
     2181       
     2182        end_subject = save_end_subject;
     2183       
     2184#ifdef DEBUG  /* Sigh. Some compilers never learn. */
     2185        printf(">>>> Match against: ");
     2186        pchars(start_match, end_subject - start_match, true, &match_block);
     2187        printf("\n");
     2188#endif
     2189       
     2190        /* If req_byte is set, we know that that character must appear in the subject
     2191         for the match to succeed. If the first character is set, req_byte must be
     2192         later in the subject; otherwise the test starts at the match point. This
     2193         optimization can save a huge amount of backtracking in patterns with nested
     2194         unlimited repeats that aren't going to match. Writing separate code for
     2195         cased/caseless versions makes it go faster, as does using an autoincrement
     2196         and backing off on a match.
     2197         
     2198         HOWEVER: when the subject string is very, very long, searching to its end can
     2199         take a long time, and give bad performance on quite ordinary patterns. This
     2200         showed up when somebody was matching /^C/ on a 32-megabyte string... so we
     2201         don't do this when the string is sufficiently long.
     2202         
     2203         ALSO: this processing is disabled when partial matching is requested.
     2204         */
     2205       
     2206        if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) {
     2207            USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
     2208           
     2209            /* We don't need to repeat the search if we haven't yet reached the
     2210             place we found it at last time. */
     2211           
     2212            if (p > req_byte_ptr) {
     2213                if (req_byte_caseless) {
     2214                    while (p < end_subject) {
     2215                        int pp = *p++;
     2216                        if (pp == req_byte || pp == req_byte2) {
     2217                            p--;
     2218                            break;
     2219                        }
     2220                    }
     2221                } else {
     2222                    while (p < end_subject) {
     2223                        if (*p++ == req_byte) {
     2224                            p--;
     2225                            break;
     2226                        }
     2227                    }
     2228                }
     2229               
     2230                /* If we can't find the required character, break the matching loop */
     2231               
     2232                if (p >= end_subject)
     2233                    break;
     2234               
     2235                /* If we have found the required character, save the point where we
     2236                 found it, so that we don't search again next time round the loop if
     2237                 the start hasn't passed this character yet. */
     2238               
     2239                req_byte_ptr = p;
     2240            }
     2241        }
     2242       
     2243        /* When a match occurs, substrings will be set for all internal extractions;
     2244         we just need to set up the whole thing as substring 0 before returning. If
     2245         there were too many extractions, set the return code to zero. In the case
     2246         where we had to get some local store to hold offsets for backreferences, copy
     2247         those back references that we can. In this case there need not be overflow
     2248         if certain parts of the pattern were not used. */
     2249       
     2250        match_block.match_call_count = 0;
     2251       
     2252       
     2253        /* The code starts after the JSRegExp block and the capture name table. */
     2254        const uschar* start_code = (const uschar*)(re + 1);
     2255       
     2256        int returnCode = match(start_match, start_code, 2, &match_block);
     2257       
     2258        /* When the result is no match, if the subject's first character was a
     2259         newline and the PCRE_FIRSTLINE option is set, break (which will return
     2260         PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
     2261         newline in the subject. Otherwise, advance the pointer to the next character
     2262         and continue - but the continuation will actually happen only when the
     2263         pattern is not anchored. */
     2264       
     2265        if (returnCode == MATCH_NOMATCH) {
     2266            start_match++;
     2267            while(start_match < end_subject && ISMIDCHAR(*start_match))
     2268                start_match++;
     2269            continue;
     2270        }
     2271       
     2272        if (returnCode != MATCH_MATCH) {
     2273            DPRINTF((">>>> error: returning %d\n", rc));
     2274            return returnCode;
     2275        }
     2276       
     2277        /* We have a match! Copy the offset information from temporary store if
     2278         necessary */
     2279       
     2280        if (using_temporary_offsets) {
     2281            if (offsetcount >= 4) {
     2282                memcpy(offsets + 2, match_block.offset_vector + 2, (offsetcount - 2) * sizeof(int));
     2283                DPRINTF(("Copied offsets from temporary memory\n"));
     2284            }
     2285            if (match_block.end_offset_top > offsetcount)
     2286                match_block.offset_overflow = true;
     2287           
     2288            DPRINTF(("Freeing temporary memory\n"));
     2289            delete [] match_block.offset_vector;
     2290        }
     2291       
     2292        returnCode = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
     2293       
     2294        if (offsetcount < 2)
     2295            returnCode = 0;
     2296        else {
     2297            offsets[0] = start_match - match_block.start_subject;
     2298            offsets[1] = match_block.end_match_ptr - match_block.start_subject;
     2299        }
     2300       
     2301        DPRINTF((">>>> returning %d\n", rc));
     2302        return returnCode;
     2303    } while (start_match <= end_subject);
     2304   
     2305    if (using_temporary_offsets) {
     2306        DPRINTF(("Freeing temporary memory\n"));
     2307        delete [] match_block.offset_vector;
    21922308    }
    2193 
    2194   /* Or to just after \n for a multiline match if possible */
    2195 
    2196   else if (startline)
    2197     {
    2198     if (start_match > match_block.start_subject + start_offset)
    2199       {
    2200       while (start_match < end_subject && !isNewline(start_match[-1]))
    2201         start_match++;
    2202       }
    2203     }
    2204 
    2205   /* Restore fudged end_subject */
    2206 
    2207   end_subject = save_end_subject;
    2208 
    2209 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
    2210   printf(">>>> Match against: ");
    2211   pchars(start_match, end_subject - start_match, true, &match_block);
    2212   printf("\n");
    2213 #endif
    2214 
    2215   /* If req_byte is set, we know that that character must appear in the subject
    2216   for the match to succeed. If the first character is set, req_byte must be
    2217   later in the subject; otherwise the test starts at the match point. This
    2218   optimization can save a huge amount of backtracking in patterns with nested
    2219   unlimited repeats that aren't going to match. Writing separate code for
    2220   cased/caseless versions makes it go faster, as does using an autoincrement
    2221   and backing off on a match.
    2222 
    2223   HOWEVER: when the subject string is very, very long, searching to its end can
    2224   take a long time, and give bad performance on quite ordinary patterns. This
    2225   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
    2226   don't do this when the string is sufficiently long.
    2227 
    2228   ALSO: this processing is disabled when partial matching is requested.
    2229   */
    2230 
    2231   if (req_byte >= 0 &&
    2232       end_subject - start_match < REQ_BYTE_MAX)
    2233     {
    2234     register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
    2235 
    2236     /* We don't need to repeat the search if we haven't yet reached the
    2237     place we found it at last time. */
    2238 
    2239     if (p > req_byte_ptr)
    2240       {
    2241       if (req_byte_caseless)
    2242         {
    2243         while (p < end_subject)
    2244           {
    2245           register int pp = *p++;
    2246           if (pp == req_byte || pp == req_byte2) { p--; break; }
    2247           }
    2248         }
    2249       else
    2250         {
    2251         while (p < end_subject)
    2252           {
    2253           if (*p++ == req_byte) { p--; break; }
    2254           }
    2255         }
    2256 
    2257       /* If we can't find the required character, break the matching loop */
    2258 
    2259       if (p >= end_subject) break;
    2260 
    2261       /* If we have found the required character, save the point where we
    2262       found it, so that we don't search again next time round the loop if
    2263       the start hasn't passed this character yet. */
    2264 
    2265       req_byte_ptr = p;
    2266       }
    2267     }
    2268 
    2269   /* When a match occurs, substrings will be set for all internal extractions;
    2270   we just need to set up the whole thing as substring 0 before returning. If
    2271   there were too many extractions, set the return code to zero. In the case
    2272   where we had to get some local store to hold offsets for backreferences, copy
    2273   those back references that we can. In this case there need not be overflow
    2274   if certain parts of the pattern were not used. */
    2275 
    2276   match_block.match_call_count = 0;
    2277 
    2278   rc = match(start_match, start_code, 2, &match_block);
    2279 
    2280   /* When the result is no match, if the subject's first character was a
    2281   newline and the PCRE_FIRSTLINE option is set, break (which will return
    2282   PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
    2283   newline in the subject. Otherwise, advance the pointer to the next character
    2284   and continue - but the continuation will actually happen only when the
    2285   pattern is not anchored. */
    2286 
    2287   if (rc == MATCH_NOMATCH)
    2288     {
    2289     start_match++;
    2290       while(start_match < end_subject && ISMIDCHAR(*start_match))
    2291         start_match++;
    2292     continue;
    2293     }
    2294 
    2295   if (rc != MATCH_MATCH)
    2296     {
    2297     DPRINTF((">>>> error: returning %d\n", rc));
    2298     return rc;
    2299     }
    2300 
    2301   /* We have a match! Copy the offset information from temporary store if
    2302   necessary */
    2303 
    2304   if (using_temporary_offsets)
    2305     {
    2306     if (offsetcount >= 4)
    2307       {
    2308       memcpy(offsets + 2, match_block.offset_vector + 2,
    2309         (offsetcount - 2) * sizeof(int));
    2310       DPRINTF(("Copied offsets from temporary memory\n"));
    2311       }
    2312     if (match_block.end_offset_top > offsetcount)
    2313       match_block.offset_overflow = true;
    2314 
    2315     DPRINTF(("Freeing temporary memory\n"));
    2316     delete [] match_block.offset_vector;
    2317     }
    2318 
    2319   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
    2320 
    2321   if (offsetcount < 2) rc = 0; else
    2322     {
    2323     offsets[0] = start_match - match_block.start_subject;
    2324     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
    2325     }
    2326 
    2327   DPRINTF((">>>> returning %d\n", rc));
    2328   return rc;
    2329   }
    2330 
    2331 /* This "while" is the end of the "do" above */
    2332 
    2333 while (start_match <= end_subject);
    2334 
    2335 if (using_temporary_offsets)
    2336   {
    2337   DPRINTF(("Freeing temporary memory\n"));
    2338   delete [] match_block.offset_vector;
    2339   }
    2340 
    2341   DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
    2342   return JSRegExpErrorNoMatch;
     2309   
     2310    DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
     2311    return JSRegExpErrorNoMatch;
    23432312}
Note: See TracChangeset for help on using the changeset viewer.