Changeset 28138 in webkit for trunk/JavaScriptCore/pcre/pcre_exec.cpp
- Timestamp:
- Nov 29, 2007, 3:04:06 AM (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/pcre/pcre_exec.cpp
r27947 r28138 2026 2026 2027 2027 Arguments: 2028 argument_repoints to the compiled expression2028 re points to the compiled expression 2029 2029 extra_data points to extra data or is NULL 2030 2030 subject points to the subject string … … 2041 2041 */ 2042 2042 2043 int 2044 jsRegExpExecute(const pcre *argument_re, 2045 const UChar* subject, int length, int start_offset, int *offsets, 2046 int offsetcount) 2043 int jsRegExpExecute(const JSRegExp* re, 2044 const UChar* subject, int length, int start_offset, int* offsets, 2045 int offsetcount) 2047 2046 { 2048 int rc, resetcount, ocount; 2049 int first_byte = -1; 2050 int req_byte = -1; 2051 int req_byte2 = -1; 2052 BOOL using_temporary_offsets = false; 2053 BOOL first_byte_caseless = false; 2054 BOOL startline; 2055 BOOL req_byte_caseless = false; 2056 match_data match_block; 2057 USPTR start_match = (USPTR)subject + start_offset; 2058 USPTR end_subject; 2059 USPTR req_byte_ptr = start_match - 1; 2060 const uschar *start_code; 2061 2062 const real_pcre *external_re = (const real_pcre *)argument_re; 2063 const real_pcre *re = external_re; 2064 2065 /* Plausibility checks */ 2066 2067 ASSERT(re); 2068 ASSERT(subject); 2069 ASSERT(offsetcount >= 0); 2070 ASSERT(offsets || offsetcount == 0); 2071 2072 /* Set up other data */ 2073 2074 startline = (re->options & PCRE_STARTLINE) != 0; 2075 2076 /* The code starts after the real_pcre block and the capture name table. */ 2077 2078 start_code = (const uschar *)(external_re + 1); 2079 2080 match_block.start_subject = (USPTR)subject; 2081 match_block.end_subject = match_block.start_subject + length; 2082 end_subject = match_block.end_subject; 2083 2084 match_block.lcc = _pcre_default_tables + lcc_offset; 2085 match_block.ctypes = _pcre_default_tables + ctypes_offset; 2086 2087 match_block.multiline = (re->options & PCRE_MULTILINE) != 0; 2088 match_block.caseless = (re->options & PCRE_CASELESS) != 0; 2089 2090 /* If the expression has got more back references than the offsets supplied can 2091 hold, we get a temporary chunk of working store to use during the matching. 2092 Otherwise, we can use the vector supplied, rounding down its size to a multiple 2093 of 3. */ 2094 2095 ocount = offsetcount - (offsetcount % 3); 2096 2097 if (re->top_backref > 0 && re->top_backref >= ocount/3) 2098 { 2099 ocount = re->top_backref * 3 + 3; 2100 match_block.offset_vector = new int[ocount]; 2101 if (match_block.offset_vector == NULL) return JSRegExpErrorNoMemory; 2102 using_temporary_offsets = true; 2103 DPRINTF(("Got memory to hold back references\n")); 2104 } 2105 else match_block.offset_vector = offsets; 2106 2107 match_block.offset_end = ocount; 2108 match_block.offset_max = (2*ocount)/3; 2109 match_block.offset_overflow = false; 2110 2111 /* Compute the minimum number of offsets that we need to reset each time. Doing 2112 this makes a huge difference to execution time when there aren't many brackets 2113 in the pattern. */ 2114 2115 resetcount = 2 + re->top_bracket * 2; 2116 if (resetcount > offsetcount) resetcount = ocount; 2117 2118 /* Reset the working variable associated with each extraction. These should 2119 never be used unless previously set, but they get saved and restored, and so we 2120 initialize them to avoid reading uninitialized locations. */ 2121 2122 if (match_block.offset_vector != NULL) 2123 { 2124 register int *iptr = match_block.offset_vector + ocount; 2125 register int *iend = iptr - resetcount/2 + 1; 2126 while (--iptr >= iend) *iptr = -1; 2127 } 2128 2129 /* Set up the first character to match, if available. The first_byte value is 2130 never set for an anchored regular expression, but the anchoring may be forced 2131 at run time, so we have to test for anchoring. The first char may be unset for 2132 an unanchored pattern, of course. If there's no first char and the pattern was 2133 studied, there may be a bitmap of possible first characters. */ 2134 2135 if ((re->options & PCRE_FIRSTSET) != 0) 2136 { 2137 first_byte = re->first_byte & 255; 2138 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == true) 2139 first_byte = match_block.lcc[first_byte]; 2047 ASSERT(re); 2048 ASSERT(subject); 2049 ASSERT(offsetcount >= 0); 2050 ASSERT(offsets || offsetcount == 0); 2051 2052 match_data match_block; 2053 match_block.start_subject = (USPTR)subject; 2054 match_block.end_subject = match_block.start_subject + length; 2055 USPTR end_subject = match_block.end_subject; 2056 2057 match_block.lcc = _pcre_default_tables + lcc_offset; 2058 match_block.ctypes = _pcre_default_tables + ctypes_offset; 2059 2060 match_block.multiline = (re->options & PCRE_MULTILINE) != 0; 2061 match_block.caseless = (re->options & PCRE_CASELESS) != 0; 2062 2063 /* If the expression has got more back references than the offsets supplied can 2064 hold, we get a temporary chunk of working store to use during the matching. 2065 Otherwise, we can use the vector supplied, rounding down its size to a multiple 2066 of 3. */ 2067 2068 int ocount = offsetcount - (offsetcount % 3); 2069 2070 BOOL using_temporary_offsets = false; 2071 if (re->top_backref > 0 && re->top_backref >= ocount/3) { 2072 ocount = re->top_backref * 3 + 3; 2073 match_block.offset_vector = new int[ocount]; 2074 if (!match_block.offset_vector) 2075 return JSRegExpErrorNoMemory; 2076 using_temporary_offsets = true; 2077 } else 2078 match_block.offset_vector = offsets; 2079 2080 match_block.offset_end = ocount; 2081 match_block.offset_max = (2*ocount)/3; 2082 match_block.offset_overflow = false; 2083 2084 /* Compute the minimum number of offsets that we need to reset each time. Doing 2085 this makes a huge difference to execution time when there aren't many brackets 2086 in the pattern. */ 2087 2088 int resetcount = 2 + re->top_bracket * 2; 2089 if (resetcount > offsetcount) 2090 resetcount = ocount; 2091 2092 /* Reset the working variable associated with each extraction. These should 2093 never be used unless previously set, but they get saved and restored, and so we 2094 initialize them to avoid reading uninitialized locations. */ 2095 2096 if (match_block.offset_vector) { 2097 int* iptr = match_block.offset_vector + ocount; 2098 int* iend = iptr - resetcount/2 + 1; 2099 while (--iptr >= iend) 2100 *iptr = -1; 2140 2101 } 2141 2142 /* For anchored or unanchored matches, there may be a "last known required 2143 character" set. */ 2144 2145 if ((re->options & PCRE_REQCHSET) != 0) 2146 { 2147 req_byte = re->req_byte & 255; 2148 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 2149 req_byte2 = (_pcre_default_tables + fcc_offset)[req_byte]; /* case flipped */ 2150 } 2151 2152 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 2153 the loop runs just once. */ 2154 2155 do 2156 { 2157 USPTR save_end_subject = end_subject; 2158 2159 /* Reset the maximum number of extractions we might see. */ 2160 2161 if (match_block.offset_vector != NULL) 2162 { 2163 register int *iptr = match_block.offset_vector; 2164 register int *iend = iptr + resetcount; 2165 while (iptr < iend) *iptr++ = -1; 2102 2103 /* Set up the first character to match, if available. The first_byte value is 2104 never set for an anchored regular expression, but the anchoring may be forced 2105 at run time, so we have to test for anchoring. The first char may be unset for 2106 an unanchored pattern, of course. If there's no first char and the pattern was 2107 studied, there may be a bitmap of possible first characters. */ 2108 2109 BOOL first_byte_caseless = false; 2110 int first_byte = -1; 2111 if (re->options & PCRE_FIRSTSET) { 2112 first_byte = re->first_byte & 255; 2113 if ((first_byte_caseless = (re->first_byte & REQ_CASELESS))) 2114 first_byte = match_block.lcc[first_byte]; 2166 2115 } 2167 2168 /* Advance to a unique first char if possible. If firstline is true, the 2169 start of the match is constrained to the first line of a multiline string. 2170 Implement this by temporarily adjusting end_subject so that we stop scanning 2171 at a newline. If the match fails at the newline, later code breaks this loop. 2172 */ 2173 2174 /* Now test for a unique first byte */ 2175 2176 if (first_byte >= 0) 2177 { 2178 pcre_uchar first_char = first_byte; 2179 if (first_byte_caseless) 2180 while (start_match < end_subject) 2181 { 2182 int sm = *start_match; 2183 if (sm > 127) 2184 break; 2185 if (match_block.lcc[sm] == first_char) 2186 break; 2187 start_match++; 2116 2117 /* For anchored or unanchored matches, there may be a "last known required 2118 character" set. */ 2119 2120 BOOL req_byte_caseless = false; 2121 int req_byte = -1; 2122 int req_byte2 = -1; 2123 if (re->options & PCRE_REQCHSET) { 2124 req_byte = re->req_byte & 255; 2125 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 2126 req_byte2 = (_pcre_default_tables + fcc_offset)[req_byte]; /* case flipped */ 2127 } 2128 2129 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 2130 the loop runs just once. */ 2131 2132 USPTR start_match = (USPTR)subject + start_offset; 2133 USPTR req_byte_ptr = start_match - 1; 2134 2135 do { 2136 USPTR save_end_subject = end_subject; 2137 2138 /* Reset the maximum number of extractions we might see. */ 2139 2140 if (match_block.offset_vector != NULL) { 2141 int* iptr = match_block.offset_vector; 2142 int* iend = iptr + resetcount; 2143 while (iptr < iend) 2144 *iptr++ = -1; 2188 2145 } 2189 else 2190 while (start_match < end_subject && *start_match != first_char) 2191 start_match++; 2146 2147 /* Advance to a unique first char if possible. If firstline is true, the 2148 start of the match is constrained to the first line of a multiline string. 2149 Implement this by temporarily adjusting end_subject so that we stop scanning 2150 at a newline. If the match fails at the newline, later code breaks this loop. 2151 */ 2152 2153 /* Now test for a unique first byte */ 2154 2155 if (first_byte >= 0) { 2156 pcre_uchar first_char = first_byte; 2157 if (first_byte_caseless) 2158 while (start_match < end_subject) { 2159 int sm = *start_match; 2160 if (sm > 127) 2161 break; 2162 if (match_block.lcc[sm] == first_char) 2163 break; 2164 start_match++; 2165 } 2166 else 2167 while (start_match < end_subject && *start_match != first_char) 2168 start_match++; 2169 } 2170 2171 /* Or to just after \n for a multiline match if possible */ 2172 2173 else if (re->options & PCRE_STARTLINE) { 2174 if (start_match > match_block.start_subject + start_offset) { 2175 while (start_match < end_subject && !isNewline(start_match[-1])) 2176 start_match++; 2177 } 2178 } 2179 2180 /* Restore fudged end_subject */ 2181 2182 end_subject = save_end_subject; 2183 2184 #ifdef DEBUG /* Sigh. Some compilers never learn. */ 2185 printf(">>>> Match against: "); 2186 pchars(start_match, end_subject - start_match, true, &match_block); 2187 printf("\n"); 2188 #endif 2189 2190 /* If req_byte is set, we know that that character must appear in the subject 2191 for the match to succeed. If the first character is set, req_byte must be 2192 later in the subject; otherwise the test starts at the match point. This 2193 optimization can save a huge amount of backtracking in patterns with nested 2194 unlimited repeats that aren't going to match. Writing separate code for 2195 cased/caseless versions makes it go faster, as does using an autoincrement 2196 and backing off on a match. 2197 2198 HOWEVER: when the subject string is very, very long, searching to its end can 2199 take a long time, and give bad performance on quite ordinary patterns. This 2200 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 2201 don't do this when the string is sufficiently long. 2202 2203 ALSO: this processing is disabled when partial matching is requested. 2204 */ 2205 2206 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) { 2207 USPTR p = start_match + ((first_byte >= 0)? 1 : 0); 2208 2209 /* We don't need to repeat the search if we haven't yet reached the 2210 place we found it at last time. */ 2211 2212 if (p > req_byte_ptr) { 2213 if (req_byte_caseless) { 2214 while (p < end_subject) { 2215 int pp = *p++; 2216 if (pp == req_byte || pp == req_byte2) { 2217 p--; 2218 break; 2219 } 2220 } 2221 } else { 2222 while (p < end_subject) { 2223 if (*p++ == req_byte) { 2224 p--; 2225 break; 2226 } 2227 } 2228 } 2229 2230 /* If we can't find the required character, break the matching loop */ 2231 2232 if (p >= end_subject) 2233 break; 2234 2235 /* If we have found the required character, save the point where we 2236 found it, so that we don't search again next time round the loop if 2237 the start hasn't passed this character yet. */ 2238 2239 req_byte_ptr = p; 2240 } 2241 } 2242 2243 /* When a match occurs, substrings will be set for all internal extractions; 2244 we just need to set up the whole thing as substring 0 before returning. If 2245 there were too many extractions, set the return code to zero. In the case 2246 where we had to get some local store to hold offsets for backreferences, copy 2247 those back references that we can. In this case there need not be overflow 2248 if certain parts of the pattern were not used. */ 2249 2250 match_block.match_call_count = 0; 2251 2252 2253 /* The code starts after the JSRegExp block and the capture name table. */ 2254 const uschar* start_code = (const uschar*)(re + 1); 2255 2256 int returnCode = match(start_match, start_code, 2, &match_block); 2257 2258 /* When the result is no match, if the subject's first character was a 2259 newline and the PCRE_FIRSTLINE option is set, break (which will return 2260 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first 2261 newline in the subject. Otherwise, advance the pointer to the next character 2262 and continue - but the continuation will actually happen only when the 2263 pattern is not anchored. */ 2264 2265 if (returnCode == MATCH_NOMATCH) { 2266 start_match++; 2267 while(start_match < end_subject && ISMIDCHAR(*start_match)) 2268 start_match++; 2269 continue; 2270 } 2271 2272 if (returnCode != MATCH_MATCH) { 2273 DPRINTF((">>>> error: returning %d\n", rc)); 2274 return returnCode; 2275 } 2276 2277 /* We have a match! Copy the offset information from temporary store if 2278 necessary */ 2279 2280 if (using_temporary_offsets) { 2281 if (offsetcount >= 4) { 2282 memcpy(offsets + 2, match_block.offset_vector + 2, (offsetcount - 2) * sizeof(int)); 2283 DPRINTF(("Copied offsets from temporary memory\n")); 2284 } 2285 if (match_block.end_offset_top > offsetcount) 2286 match_block.offset_overflow = true; 2287 2288 DPRINTF(("Freeing temporary memory\n")); 2289 delete [] match_block.offset_vector; 2290 } 2291 2292 returnCode = match_block.offset_overflow? 0 : match_block.end_offset_top/2; 2293 2294 if (offsetcount < 2) 2295 returnCode = 0; 2296 else { 2297 offsets[0] = start_match - match_block.start_subject; 2298 offsets[1] = match_block.end_match_ptr - match_block.start_subject; 2299 } 2300 2301 DPRINTF((">>>> returning %d\n", rc)); 2302 return returnCode; 2303 } while (start_match <= end_subject); 2304 2305 if (using_temporary_offsets) { 2306 DPRINTF(("Freeing temporary memory\n")); 2307 delete [] match_block.offset_vector; 2192 2308 } 2193 2194 /* Or to just after \n for a multiline match if possible */ 2195 2196 else if (startline) 2197 { 2198 if (start_match > match_block.start_subject + start_offset) 2199 { 2200 while (start_match < end_subject && !isNewline(start_match[-1])) 2201 start_match++; 2202 } 2203 } 2204 2205 /* Restore fudged end_subject */ 2206 2207 end_subject = save_end_subject; 2208 2209 #ifdef DEBUG /* Sigh. Some compilers never learn. */ 2210 printf(">>>> Match against: "); 2211 pchars(start_match, end_subject - start_match, true, &match_block); 2212 printf("\n"); 2213 #endif 2214 2215 /* If req_byte is set, we know that that character must appear in the subject 2216 for the match to succeed. If the first character is set, req_byte must be 2217 later in the subject; otherwise the test starts at the match point. This 2218 optimization can save a huge amount of backtracking in patterns with nested 2219 unlimited repeats that aren't going to match. Writing separate code for 2220 cased/caseless versions makes it go faster, as does using an autoincrement 2221 and backing off on a match. 2222 2223 HOWEVER: when the subject string is very, very long, searching to its end can 2224 take a long time, and give bad performance on quite ordinary patterns. This 2225 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 2226 don't do this when the string is sufficiently long. 2227 2228 ALSO: this processing is disabled when partial matching is requested. 2229 */ 2230 2231 if (req_byte >= 0 && 2232 end_subject - start_match < REQ_BYTE_MAX) 2233 { 2234 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); 2235 2236 /* We don't need to repeat the search if we haven't yet reached the 2237 place we found it at last time. */ 2238 2239 if (p > req_byte_ptr) 2240 { 2241 if (req_byte_caseless) 2242 { 2243 while (p < end_subject) 2244 { 2245 register int pp = *p++; 2246 if (pp == req_byte || pp == req_byte2) { p--; break; } 2247 } 2248 } 2249 else 2250 { 2251 while (p < end_subject) 2252 { 2253 if (*p++ == req_byte) { p--; break; } 2254 } 2255 } 2256 2257 /* If we can't find the required character, break the matching loop */ 2258 2259 if (p >= end_subject) break; 2260 2261 /* If we have found the required character, save the point where we 2262 found it, so that we don't search again next time round the loop if 2263 the start hasn't passed this character yet. */ 2264 2265 req_byte_ptr = p; 2266 } 2267 } 2268 2269 /* When a match occurs, substrings will be set for all internal extractions; 2270 we just need to set up the whole thing as substring 0 before returning. If 2271 there were too many extractions, set the return code to zero. In the case 2272 where we had to get some local store to hold offsets for backreferences, copy 2273 those back references that we can. In this case there need not be overflow 2274 if certain parts of the pattern were not used. */ 2275 2276 match_block.match_call_count = 0; 2277 2278 rc = match(start_match, start_code, 2, &match_block); 2279 2280 /* When the result is no match, if the subject's first character was a 2281 newline and the PCRE_FIRSTLINE option is set, break (which will return 2282 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first 2283 newline in the subject. Otherwise, advance the pointer to the next character 2284 and continue - but the continuation will actually happen only when the 2285 pattern is not anchored. */ 2286 2287 if (rc == MATCH_NOMATCH) 2288 { 2289 start_match++; 2290 while(start_match < end_subject && ISMIDCHAR(*start_match)) 2291 start_match++; 2292 continue; 2293 } 2294 2295 if (rc != MATCH_MATCH) 2296 { 2297 DPRINTF((">>>> error: returning %d\n", rc)); 2298 return rc; 2299 } 2300 2301 /* We have a match! Copy the offset information from temporary store if 2302 necessary */ 2303 2304 if (using_temporary_offsets) 2305 { 2306 if (offsetcount >= 4) 2307 { 2308 memcpy(offsets + 2, match_block.offset_vector + 2, 2309 (offsetcount - 2) * sizeof(int)); 2310 DPRINTF(("Copied offsets from temporary memory\n")); 2311 } 2312 if (match_block.end_offset_top > offsetcount) 2313 match_block.offset_overflow = true; 2314 2315 DPRINTF(("Freeing temporary memory\n")); 2316 delete [] match_block.offset_vector; 2317 } 2318 2319 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2; 2320 2321 if (offsetcount < 2) rc = 0; else 2322 { 2323 offsets[0] = start_match - match_block.start_subject; 2324 offsets[1] = match_block.end_match_ptr - match_block.start_subject; 2325 } 2326 2327 DPRINTF((">>>> returning %d\n", rc)); 2328 return rc; 2329 } 2330 2331 /* This "while" is the end of the "do" above */ 2332 2333 while (start_match <= end_subject); 2334 2335 if (using_temporary_offsets) 2336 { 2337 DPRINTF(("Freeing temporary memory\n")); 2338 delete [] match_block.offset_vector; 2339 } 2340 2341 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 2342 return JSRegExpErrorNoMatch; 2309 2310 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 2311 return JSRegExpErrorNoMatch; 2343 2312 }
Note:
See TracChangeset
for help on using the changeset viewer.