Changeset 197426 in webkit for trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp
- Timestamp:
- Mar 1, 2016, 4:39:01 PM (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp
r194496 r197426 1 1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved.2 * Copyright (C) 2009, 2013, 2016 Apple Inc. All rights reserved. 3 3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged 4 4 * … … 29 29 30 30 #include "Yarr.h" 31 #include "YarrCanonicalizeU CS2.h"31 #include "YarrCanonicalizeUnicode.h" 32 32 #include <wtf/BumpPointerAllocator.h> 33 33 #include <wtf/DataLog.h> … … 45 45 46 46 struct BackTrackInfoPatternCharacter { 47 uintptr_t begin; // Only needed for unicode patterns 47 48 uintptr_t matchAmount; 48 49 }; 49 50 struct BackTrackInfoCharacterClass { 51 uintptr_t begin; // Only needed for unicode patterns 50 52 uintptr_t matchAmount; 51 53 }; … … 168 170 class InputStream { 169 171 public: 170 InputStream(const CharType* input, unsigned start, unsigned length )172 InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs) 171 173 : input(input) 172 174 , pos(start) 173 175 , length(length) 176 , decodeSurrogatePairs(decodeSurrogatePairs) 174 177 { 175 178 } … … 205 208 unsigned p = pos - negativePositionOffest; 206 209 ASSERT(p < length); 207 return input[p]; 210 int result = input[p]; 211 if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length 212 && U16_IS_TRAIL(input[p + 1])) { 213 if (atEnd()) 214 return -1; 215 216 result = U16_GET_SUPPLEMENTARY(result, input[p + 1]); 217 next(); 218 } 219 return result; 220 } 221 222 int readSurrogatePairChecked(unsigned negativePositionOffest) 223 { 224 RELEASE_ASSERT(pos >= negativePositionOffest); 225 unsigned p = pos - negativePositionOffest; 226 ASSERT(p < length); 227 if (p + 1 >= length) 228 return -1; 229 230 int first = input[p]; 231 if (U16_IS_LEAD(first) && U16_IS_TRAIL(input[p + 1])) 232 return U16_GET_SUPPLEMENTARY(first, input[p + 1]); 233 234 return -1; 208 235 } 209 236 … … 211 238 { 212 239 ASSERT(from < length); 213 return input[from]; 240 int result = input[from]; 241 if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length 242 && U16_IS_TRAIL(input[from + 1])) { 243 244 result = U16_GET_SUPPLEMENTARY(result, input[from + 1]); 245 } 246 return result; 214 247 } 215 248 … … 282 315 unsigned pos; 283 316 unsigned length; 317 bool decodeSurrogatePairs; 284 318 }; 285 319 286 320 bool testCharacterClass(CharacterClass* characterClass, int ch) 287 321 { 288 if (ch & 0x FF80) {322 if (ch & 0x1FFF80) { 289 323 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) 290 324 if (ch == characterClass->m_matchesUnicode[i]) … … 310 344 } 311 345 346 bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset) 347 { 348 return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset); 349 } 350 312 351 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) 313 352 { … … 329 368 return false; 330 369 331 if (pattern->m_ignoreCase) { 332 for (unsigned i = 0; i < matchSize; ++i) { 333 int oldCh = input.reread(matchBegin + i); 334 int ch = input.readChecked(negativeInputOffset + matchSize - i); 335 336 if (oldCh == ch) 337 continue; 338 339 // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that 370 for (unsigned i = 0; i < matchSize; ++i) { 371 int oldCh = input.reread(matchBegin + i); 372 int ch; 373 if (!U_IS_BMP(oldCh)) { 374 ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i); 375 ++i; 376 } else 377 ch = input.readChecked(negativeInputOffset + matchSize - i); 378 379 if (oldCh == ch) 380 continue; 381 382 if (pattern->m_ignoreCase) { 383 // The definition for canonicalize (see ES 6.0, 15.10.2.8) means that 340 384 // unicode values are never allowed to match against ascii ones. 341 385 if (isASCII(oldCh) || isASCII(ch)) { 342 386 if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) 343 387 continue; 344 } else if (areCanonicallyEquivalent(oldCh, ch ))388 } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2)) 345 389 continue; 346 347 input.uncheckInput(matchSize); 348 return false; 349 } 350 } else { 351 for (unsigned i = 0; i < matchSize; ++i) { 352 if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) { 353 input.uncheckInput(matchSize); 354 return false; 355 } 356 } 390 } 391 392 input.uncheckInput(matchSize); 393 return false; 357 394 } 358 395 … … 397 434 if (backTrack->matchAmount) { 398 435 --backTrack->matchAmount; 399 input.uncheckInput(1); 436 if (unicode && !U_IS_BMP(term.atom.patternCharacter)) 437 input.uncheckInput(2); 438 else 439 input.uncheckInput(1); 400 440 return true; 401 441 } … … 408 448 return true; 409 449 } 410 input. uncheckInput(backTrack->matchAmount);450 input.setPos(backTrack->begin); 411 451 break; 412 452 } … … 447 487 { 448 488 ASSERT(term.type == ByteTerm::TypeCharacterClass); 449 BackTrackInfo PatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);489 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); 450 490 451 491 switch (term.atom.quantityType) { 452 492 case QuantifierFixedCount: { 493 if (unicode) { 494 backTrack->begin = input.getPos(); 495 unsigned matchAmount = 0; 496 for (matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 497 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) { 498 input.setPos(backTrack->begin); 499 return false; 500 } 501 } 502 503 return true; 504 } 505 453 506 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 454 507 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) … … 459 512 460 513 case QuantifierGreedy: { 514 backTrack->begin = input.getPos(); 461 515 unsigned matchAmount = 0; 462 516 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { … … 473 527 474 528 case QuantifierNonGreedy: 529 backTrack->begin = input.getPos(); 475 530 backTrack->matchAmount = 0; 476 531 return true; … … 484 539 { 485 540 ASSERT(term.type == ByteTerm::TypeCharacterClass); 486 BackTrackInfo PatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);541 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); 487 542 488 543 switch (term.atom.quantityType) { 489 544 case QuantifierFixedCount: 545 if (unicode) 546 input.setPos(backTrack->begin); 490 547 break; 491 548 492 549 case QuantifierGreedy: 493 550 if (backTrack->matchAmount) { 551 if (unicode) { 552 // Rematch one less match 553 input.setPos(backTrack->begin); 554 --backTrack->matchAmount; 555 for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) { 556 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { 557 input.uncheckInput(1); 558 break; 559 } 560 } 561 return true; 562 } 494 563 --backTrack->matchAmount; 495 564 input.uncheckInput(1); … … 504 573 return true; 505 574 } 506 input. uncheckInput(backTrack->matchAmount);575 input.setPos(backTrack->begin); 507 576 break; 508 577 } … … 774 843 return false; 775 844 776 // Successful match! Okay, what's next? - loop around and try to match mo ar!845 // Successful match! Okay, what's next? - loop around and try to match more! 777 846 context->term -= (term.atom.parenthesesWidth + 1); 778 847 return true; … … 1155 1224 case ByteTerm::TypePatternCharacterOnce: 1156 1225 case ByteTerm::TypePatternCharacterFixed: { 1226 if (unicode) { 1227 if (!U_IS_BMP(currentTerm().atom.patternCharacter)) { 1228 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1229 if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) { 1230 BACKTRACK(); 1231 } 1232 } 1233 MATCH_NEXT(); 1234 } 1235 } 1236 unsigned position = input.getPos(); // May need to back out reading a surrogate pair. 1237 1157 1238 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1158 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) 1239 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) { 1240 input.setPos(position); 1159 1241 BACKTRACK(); 1242 } 1160 1243 } 1161 1244 MATCH_NEXT(); … … 1177 1260 case ByteTerm::TypePatternCharacterNonGreedy: { 1178 1261 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1262 backTrack->begin = input.getPos(); 1179 1263 backTrack->matchAmount = 0; 1180 1264 MATCH_NEXT(); … … 1183 1267 case ByteTerm::TypePatternCasedCharacterOnce: 1184 1268 case ByteTerm::TypePatternCasedCharacterFixed: { 1269 if (unicode) { 1270 // Case insensitive matching of unicode charaters are handled as TypeCharacterClass 1271 ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter)); 1272 1273 unsigned position = input.getPos(); // May need to back out reading a surrogate pair. 1274 1275 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1276 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) { 1277 input.setPos(position); 1278 BACKTRACK(); 1279 } 1280 } 1281 MATCH_NEXT(); 1282 } 1283 1185 1284 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1186 1285 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) … … 1191 1290 case ByteTerm::TypePatternCasedCharacterGreedy: { 1192 1291 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1292 1293 // Case insensitive matching of unicode charaters are handled as TypeCharacterClass 1294 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); 1295 1193 1296 unsigned matchAmount = 0; 1194 1297 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { … … 1205 1308 case ByteTerm::TypePatternCasedCharacterNonGreedy: { 1206 1309 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1310 1311 // Case insensitive matching of unicode charaters are handled as TypeCharacterClass 1312 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); 1313 1207 1314 backTrack->matchAmount = 0; 1208 1315 MATCH_NEXT(); … … 1440 1547 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) 1441 1548 : pattern(pattern) 1549 , unicode(pattern->m_unicode) 1442 1550 , output(output) 1443 , input(input, start, length )1551 , input(input, start, length, pattern->m_unicode) 1444 1552 , allocatorPool(0) 1445 1553 , remainingMatchCount(matchLimit) … … 1449 1557 private: 1450 1558 BytecodePattern* pattern; 1559 bool unicode; 1451 1560 unsigned* output; 1452 1561 InputStream input; … … 1507 1616 } 1508 1617 1509 void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)1618 void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1510 1619 { 1511 1620 if (m_pattern.m_ignoreCase) { 1512 ASSERT(u_tolower(ch) <= 0xFFFF);1513 ASSERT(u_toupper(ch) <= 0xFFFF);1514 1515 UChar lo = u_tolower(ch);1516 UChar hi = u_toupper(ch);1621 ASSERT(u_tolower(ch) <= UCHAR_MAX_VALUE); 1622 ASSERT(u_toupper(ch) <= UCHAR_MAX_VALUE); 1623 1624 UChar32 lo = u_tolower(ch); 1625 UChar32 hi = u_toupper(ch); 1517 1626 1518 1627 if (lo != hi) {
Note:
See TracChangeset
for help on using the changeset viewer.