Changeset 27752 in webkit for trunk/JavaScriptCore/pcre/pcre_compile.cpp
- Timestamp: Nov 13, 2007, 9:25:26 AM (18 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/JavaScriptCore/pcre/pcre_compile.cpp
r27730 r27752 162 162 { 163 163 const pcre_uchar *ptr = *ptrptr + 1; 164 int c,i;164 int i; 165 165 166 166 /* If backslash is at the end of the pattern, it's an error. */ … … 171 171 } 172 172 173 c = *ptr;173 int c = *ptr; 174 174 175 175 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in … … 184 184 else 185 185 { 186 const pcre_uchar *oldptr;187 186 switch (c) 188 187 { 189 /* A number of Perl escapes are not handled by PCRE. We give an explicit 190 error. */ 191 192 /* The handling of escape sequences consisting of a string of digits 193 starting with one that is not zero is not straightforward. By experiment, 194 the way Perl works seems to be as follows: 195 196 Outside a character class, the digits are read as a decimal number. If the 197 number is less than 10, or if there are that many previous extracting 198 left brackets, then it is a back reference. Otherwise, up to three octal 199 digits are read to form an escaped byte. Thus \123 is likely to be octal 200 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal 201 value is greater than 377, the least significant 8 bits are taken. Inside a 202 character class, \ followed by a digit is always an octal number. */ 188 /* Escape sequences starting with a non-zero digit are backreferences, 189 unless there are insufficient brackets, in which case they are octal 190 escape sequences. Those sequences end on the first non-octal character 191 or when we overflow 0-255, whichever comes first. 
*/ 203 192 204 193 case '1': case '2': case '3': case '4': case '5': … … 207 196 if (!isclass) 208 197 { 209 oldptr = ptr;198 const pcre_uchar *oldptr = ptr; 210 199 c -= '0'; 211 while (ptr + 1 < patternEnd && isASCIIDigit(ptr[1]) )200 while (ptr + 1 < patternEnd && isASCIIDigit(ptr[1]) && c <= bracount) 212 201 c = c * 10 + *(++ptr) - '0'; 213 if (c < 10 || c <= bracount)202 if (c <= bracount) 214 203 { 215 204 c = -(ESC_REF + c); … … 219 208 } 220 209 221 /* Handle an octal number following \. If the first digit is 8 or 9, Perl 222 generates a binary zero byte and treats the digit as a following literal. 223 Thus we have to pull back the pointer by one. */ 210 /* Handle an octal number following \. If the first digit is 8 or 9, 211 this is not octal. */ 224 212 225 213 if ((c = *ptr) >= '8') 226 {227 ptr--;228 c = 0;229 214 break; 230 }231 215 232 216 /* \0 always starts an octal number, but we may drop through to here with a … … 235 219 case '0': 236 220 c -= '0'; 237 while (i++ < 2 && ptr + 1 < patternEnd && ptr[1] >= '0' && ptr[1] <= '7') 238 c = c * 8 + *(++ptr) - '0'; 239 c &= 255; /* Take least significant 8 bits */ 221 for (i = 1; i <= 2; ++i) 222 { 223 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7') 224 break; 225 int cc = c * 8 + ptr[i] - '0'; 226 if (cc > 255) 227 break; 228 c = cc; 229 } 230 ptr += i - 1; 240 231 break; 241 232 242 /* \x is complicated. \x{ddd} is a character number which can be greater243 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is244 treated as a data character. */245 246 233 case 'x': 247 if (ptr + 1 < patternEnd && ptr[1] == '{') 248 { 249 const pcre_uchar *pt = ptr + 2; 250 int count = 0; 251 252 c = 0; 253 while (pt < patternEnd && isASCIIHexDigit(*pt)) 254 { 255 register int cc = *pt++; 256 if (c == 0 && cc == '0') continue; /* Leading zeroes */ 257 count++; 258 259 if (cc >= 'a') cc -= 32; /* Convert to upper case */ 260 c = (c << 4) + cc - ((cc < 'A')? 
'0' : ('A' - 10)); 261 } 262 263 if (pt < patternEnd && *pt == '}') 264 { 265 if (c < 0 || count > 8) *errorcodeptr = ERR3; 266 else if (c >= 0xD800 && c <= 0xDFFF) *errorcodeptr = ERR3; // half of surrogate pair 267 else if (c >= 0xFDD0 && c <= 0xFDEF) *errorcodeptr = ERR3; // ? 268 else if (c == 0xFFFE) *errorcodeptr = ERR3; // not a character 269 else if (c == 0xFFFF) *errorcodeptr = ERR3; // not a character 270 else if (c > 0x10FFFF) *errorcodeptr = ERR3; // out of Unicode character range 271 ptr = pt; 234 c = 0; 235 for (i = 1; i <= 2; ++i) 236 { 237 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) 238 { 239 c = 'x'; 240 i = 1; 272 241 break; 273 242 } 274 275 /* If the sequence of hex digits does not end with '}', then we don't 276 recognize this construct; fall through to the normal \x handling. */ 277 } 278 279 /* Read just a single-byte hex-defined char */ 280 243 int cc = ptr[i]; 244 if (cc >= 'a') cc -= 32; /* Convert to upper case */ 245 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); 246 } 247 ptr += i - 1; 248 break; 249 250 case 'u': 281 251 c = 0; 282 while (i++ < 2 && ptr + 1 < patternEnd && isASCIIHexDigit(ptr[1])) 283 { 284 int cc; /* Some compilers don't like ++ */ 285 cc = *(++ptr); /* in initializers */ 286 if (cc >= 'a') cc -= 32; /* Convert to upper case */ 252 for (i = 1; i <= 4; ++i) 253 { 254 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) 255 { 256 c = 'u'; 257 i = 1; 258 break; 259 } 260 int cc = ptr[i]; 261 if (cc >= 'a') cc -= 32; /* Convert to upper case */ 287 262 c = c * 16 + cc - ((cc < 'A')? 
'0' : ('A' - 10)); 288 263 } 264 ptr += i - 1; 289 265 break; 290 291 case 'u': {292 const pcre_uchar *pt = ptr;293 c = 0;294 while (i++ < 4)295 {296 if (pt + 1 >= patternEnd || !isASCIIHexDigit(pt[1]))297 {298 pt = ptr;299 c = 'u';300 break;301 }302 else303 {304 int cc; /* Some compilers don't like ++ */305 cc = *(++pt); /* in initializers */306 if (cc >= 'a') cc -= 32; /* Convert to upper case */307 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));308 }309 }310 ptr = pt;311 break;312 }313 266 314 267 /* Other special escapes not starting with a digit are straightforward */ … … 934 887 BOOL negate_class; 935 888 BOOL should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */ 936 BOOL possessive_quantifier;937 889 BOOL is_quantifier; 938 890 int class_charcount; … … 1026 978 /* If the first character is '^', set the negation flag and skip it. */ 1027 979 1028 if ( (c = *(++ptr))== '^')980 if (ptr[1] == '^') 1029 981 { 1030 982 negate_class = true; 1031 c = *(++ptr);983 ++ptr; 1032 984 } 1033 985 else … … 1053 1005 memset(classbits, 0, 32 * sizeof(uschar)); 1054 1006 1055 /* Process characters until ] is reached. By writing this as a "do" it 1056 means that an initial ] is taken as a data character. The first pass 1007 /* Process characters until ] is reached. The first pass 1057 1008 through the regex checked the overall syntax, so we don't need to be very 1058 1009 strict here. At the start of the loop, c contains the first byte of the 1059 1010 character. */ 1060 1011 1061 do1012 while ((c = *(++ptr)) != ']') 1062 1013 { 1063 1014 if (c > 127) … … 1286 1237 } 1287 1238 1288 /* Loop until ']' reached; the check for end of string happens inside the1289 loop. This "while" is the end of the "do" above. */1290 1291 while ((c = *(++ptr)) != ']');1292 1293 1239 /* If class_charcount is 1, we saw precisely one character whose value is 1294 1240 less than 256. 
In non-UTF-8 mode we can always optimize. In UTF-8 mode, we … … 1431 1377 1432 1378 op_type = 0; /* Default single-char op codes */ 1433 possessive_quantifier = false; /* Default not possessive quantifier */1434 1379 1435 1380 /* Save start of previous item, in case we have to move it up to make space … … 1444 1389 repeat type to the non-default. */ 1445 1390 1446 if (ptr + 1 < patternEnd && ptr[1] == '+') 1447 { 1448 repeat_type = 0; /* Force greedy */ 1449 possessive_quantifier = true; 1450 ptr++; 1451 } 1452 else if (ptr + 1 < patternEnd && ptr[1] == '?') 1391 if (ptr + 1 < patternEnd && ptr[1] == '?') 1453 1392 { 1454 1393 repeat_type = 1; … … 1829 1768 *errorcodeptr = ERR11; 1830 1769 goto FAILED; 1831 }1832 1833 /* If the character following a repeat is '+', we wrap the entire repeated1834 item inside OP_ONCE brackets. This is just syntactic sugar, taken from1835 Sun's Java package. The repeated item starts at tempcode, not at previous,1836 which might be the first part of a string whose (former) last char we1837 repeated. However, we don't support '+' after a greediness '?'. */1838 1839 if (possessive_quantifier)1840 {1841 int len = code - tempcode;1842 memmove(tempcode + 1+LINK_SIZE, tempcode, len);1843 code += 1 + LINK_SIZE;1844 len += 1 + LINK_SIZE;1845 tempcode[0] = OP_ONCE;1846 *code++ = OP_KET;1847 PUTINC(code, 0, len);1848 PUT(tempcode, 1, len);1849 1770 } 1850 1771 … … 2736 2657 class_utf8 = false; 2737 2658 2738 /* Written as a "do" so that an initial ']' is taken as data */ 2739 2740 if (*ptr != 0) do 2741 { 2742 /* Outside \Q...\E, check for escapes */ 2659 for (; ptr < patternEnd && *ptr != ']'; ++ptr) 2660 { 2661 /* Check for escapes */ 2743 2662 2744 2663 if (*ptr == '\\') … … 2890 2809 } 2891 2810 } 2892 while (++ptr < patternEnd && *ptr != ']'); /* Concludes "do" above */2893 2811 2894 2812 if (ptr >= patternEnd) /* Missing terminating ']' */
Note: See TracChangeset for help on using the changeset viewer.