Ignore:
Timestamp:
Mar 16, 2020, 5:12:17 PM (5 years ago)
Author:
[email protected]
Message:

JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
https://bugs.webkit.org/show_bug.cgi?id=208998

Reviewed by Michael Saboff.

JSTests:

  • stress/unicode-identifiers-with-surrogate-pairs.js: Added.

(let.c.of.chars.eval.foo):
(throwsSyntaxError):
(let.c.of.continueChars.throwsSyntaxError.foo):

Source/JavaScriptCore:

This patch fixes a bug in the parser so that it now allows surrogate pairs when parsing identifiers.
It also makes a few other changes to the parser:

1) When looking for keywords we just need to check that the subsequent
character cannot be an identifier part or an escape start.

2) The only time we call parseIdentifierSlowCase is when we hit an
escape start or a surrogate pair, so we can optimize that to just
copy everything up to the slow character into our buffer.

3) We shouldn't allow for asking if a UChar is an identifier start/part.

  • KeywordLookupGenerator.py:

(Trie.printSubTreeAsC):
(Trie.printAsC):

  • parser/Lexer.cpp:

(JSC::isNonLatin1IdentStart):
(JSC::isIdentStart):
(JSC::isSingleCharacterIdentStart):
(JSC::cannotBeIdentStart):
(JSC::isIdentPart):
(JSC::isSingleCharacterIdentPart):
(JSC::cannotBeIdentPartOrEscapeStart):
(JSC::Lexer<LChar>::currentCodePoint const):
(JSC::Lexer<UChar>::currentCodePoint const):
(JSC::Lexer<LChar>::parseIdentifier):
(JSC::Lexer<UChar>::parseIdentifier):
(JSC::Lexer<CharacterType>::parseIdentifierSlowCase):
(JSC::Lexer<T>::lexWithoutClearingLineTerminator):
(JSC::Lexer<T>::scanRegExp):
(JSC::isIdentPartIncludingEscapeTemplate): Deleted.
(JSC::isIdentPartIncludingEscape): Deleted.

  • parser/Lexer.h:

(JSC::Lexer::setOffsetFromSourcePtr): Deleted.

  • parser/Parser.cpp:

(JSC::Parser<LexerType>::printUnexpectedTokenText):

  • parser/ParserTokens.h:

Source/WTF:

  • wtf/text/WTFString.cpp:

(WTF::String::fromCodePoint):

  • wtf/text/WTFString.h:

LayoutTests:

Fix broken test that asserted a non-ID_START codepoint was a start codepoint and
an ID_START codepoint was not a valid codepoint...

  • js/script-tests/unicode-escape-sequences.js:
  • js/unicode-escape-sequences-expected.txt:
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/JavaScriptCore/parser/Lexer.cpp

    r257681 r258531  
    733733}
    734734
    735 static NEVER_INLINE bool isNonLatin1IdentStart(UChar c)
     735static bool isNonLatin1IdentStart(UChar32 c)
    736736{
    737737    return u_hasBinaryProperty(c, UCHAR_ID_START);
    738738}
    739739
    740 static inline bool isIdentStart(LChar c)
    741 {
    742     return typesOfLatin1Characters[c] == CharacterIdentifierStart;
    743 }
    744 
    745 static inline bool isIdentStart(UChar32 c)
    746 {
    747     return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
     740template<typename CharacterType>
     741static ALWAYS_INLINE bool isIdentStart(CharacterType c)
     742{
     743    static_assert(std::is_same_v<CharacterType, LChar> || std::is_same_v<CharacterType, UChar32>, "Call isSingleCharacterIdentStart for UChars that don't need to check for surrogate pairs");
     744    if (!isLatin1(c))
     745        return isNonLatin1IdentStart(c);
     746    return typesOfLatin1Characters[static_cast<LChar>(c)] == CharacterIdentifierStart;
     747}
     748
     749static ALWAYS_INLINE bool isSingleCharacterIdentStart(UChar c)
     750{
     751    if (LIKELY(isLatin1(c)))
     752        return isIdentStart(static_cast<LChar>(c));
     753    return !U16_IS_SURROGATE(c) && isIdentStart(static_cast<UChar32>(c));
     754}
     755
     756static ALWAYS_INLINE bool cannotBeIdentStart(LChar c)
     757{
     758    return !isIdentStart(c) && c != '\\';
     759}
     760
     761static ALWAYS_INLINE bool cannotBeIdentStart(UChar c)
     762{
     763    if (LIKELY(isLatin1(c)))
     764        return cannotBeIdentStart(static_cast<LChar>(c));
     765    return Lexer<UChar>::isWhiteSpace(c) || Lexer<UChar>::isLineTerminator(c);
    748766}
    749767
     
    753771}
    754772
    755 static ALWAYS_INLINE bool isIdentPart(LChar c)
    756 {
     773template<typename CharacterType>
     774static ALWAYS_INLINE bool isIdentPart(CharacterType c)
     775{
     776    static_assert(std::is_same_v<CharacterType, LChar> || std::is_same_v<CharacterType, UChar32>, "Call isSingleCharacterIdentPart for UChars that don't need to check for surrogate pairs");
     777    if (!isLatin1(c))
     778        return isNonLatin1IdentPart(c);
     779
    757780    // Character types are divided into two groups depending on whether they can be part of an
    758781    // identifier or not. Those whose type value is less or equal than CharacterOtherIdentifierPart can be
    759782    // part of an identifier. (See the CharacterType definition for more details.)
    760     return typesOfLatin1Characters[c] <= CharacterOtherIdentifierPart;
    761 }
    762 
    763 static ALWAYS_INLINE bool isIdentPart(UChar32 c)
    764 {
    765     return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
    766 }
    767 
    768 static ALWAYS_INLINE bool isIdentPart(UChar c)
    769 {
    770     return isIdentPart(static_cast<UChar32>(c));
    771 }
    772 
    773 template<typename CharacterType> ALWAYS_INLINE bool isIdentPartIncludingEscapeTemplate(const CharacterType* code, const CharacterType* codeEnd)
    774 {
    775     if (isIdentPart(code[0]))
    776         return true;
    777 
    778     // Shortest sequence handled below is \u{0}, which is 5 characters.
    779     if (!(code[0] == '\\' && codeEnd - code >= 5 && code[1] == 'u'))
    780         return false;
    781 
    782     if (code[2] == '{') {
    783         UChar32 codePoint = 0;
    784         const CharacterType* pointer;
    785         for (pointer = &code[3]; pointer < codeEnd; ++pointer) {
    786             auto digit = *pointer;
    787             if (!isASCIIHexDigit(digit))
    788                 break;
    789             codePoint = (codePoint << 4) | toASCIIHexValue(digit);
    790             if (codePoint > UCHAR_MAX_VALUE)
    791                 return false;
    792         }
    793         return isIdentPart(codePoint) && pointer < codeEnd && *pointer == '}';
    794     }
    795 
    796     // Shortest sequence handled below is \uXXXX, which is 6 characters.
    797     if (codeEnd - code < 6)
    798         return false;
    799 
    800     auto character1 = code[2];
    801     auto character2 = code[3];
    802     auto character3 = code[4];
    803     auto character4 = code[5];
    804     return isASCIIHexDigit(character1) && isASCIIHexDigit(character2) && isASCIIHexDigit(character3) && isASCIIHexDigit(character4)
    805         && isIdentPart(Lexer<LChar>::convertUnicode(character1, character2, character3, character4));
    806 }
    807 
    808 static ALWAYS_INLINE bool isIdentPartIncludingEscape(const LChar* code, const LChar* codeEnd)
    809 {
    810     return isIdentPartIncludingEscapeTemplate(code, codeEnd);
    811 }
    812 
    813 static ALWAYS_INLINE bool isIdentPartIncludingEscape(const UChar* code, const UChar* codeEnd)
    814 {
    815     return isIdentPartIncludingEscapeTemplate(code, codeEnd);
     783    return typesOfLatin1Characters[static_cast<LChar>(c)] <= CharacterOtherIdentifierPart;
     784}
     785
     786static ALWAYS_INLINE bool isSingleCharacterIdentPart(UChar c)
     787{
     788    if (LIKELY(isLatin1(c)))
     789        return isIdentPart(static_cast<LChar>(c));
     790    return !U16_IS_SURROGATE(c) && isIdentPart(static_cast<UChar32>(c));
     791}
     792
     793static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(LChar c)
     794{
     795    return !isIdentPart(c) && c != '\\';
     796}
     797
     798// NOTE: This may give false negatives (for non-ASCII) but won't give false positives.
     799// This means it can be used to detect the end of a keyword (all keywords are ascii)
     800static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(UChar c)
     801{
     802    if (LIKELY(isLatin1(c)))
     803        return cannotBeIdentPartOrEscapeStart(static_cast<LChar>(c));
     804    return Lexer<UChar>::isWhiteSpace(c) || Lexer<UChar>::isLineTerminator(c);
     805}
     806
     807
     808template<>
     809ALWAYS_INLINE UChar32 Lexer<LChar>::currentCodePoint() const
     810{
     811    return m_current;
     812}
     813
     814template<>
     815ALWAYS_INLINE UChar32 Lexer<UChar>::currentCodePoint() const
     816{
     817    ASSERT_WITH_MESSAGE(!isIdentStart(static_cast<UChar32>(U_SENTINEL)), "error values shouldn't appear as a valid identifier start code point");
     818    if (!U16_IS_SURROGATE(m_current))
     819        return m_current;
     820
     821    UChar trail = peek(1);
     822    if (UNLIKELY(!U16_IS_LEAD(m_current) || !U16_IS_SURROGATE_TRAIL(trail)))
     823        return U_SENTINEL;
     824
     825    UChar32 codePoint = U16_GET_SUPPLEMENTARY(m_current, trail);
     826    return codePoint;
    816827}
    817828
     
    953964   
    954965    const LChar* identifierStart = currentSourcePtr();
    955     unsigned identifierLineStart = currentLineStartOffset();
     966    ASSERT(isIdentStart(m_current) || m_current == '\\');
     967    while (isIdentPart(m_current))
     968        shift();
    956969   
    957     while (isIdentPart(m_current))
    958         shift();
    959    
    960     if (UNLIKELY(m_current == '\\')) {
    961         setOffsetFromSourcePtr(identifierStart, identifierLineStart);
    962         return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
    963     }
     970    if (UNLIKELY(m_current == '\\'))
     971        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode, identifierStart);
    964972
    965973    const Identifier* ident = nullptr;
     
    10081016template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode)
    10091017{
     1018    ASSERT(!m_parsingBuiltinFunction);
    10101019    tokenData->escaped = false;
    10111020    const ptrdiff_t remaining = m_codeEnd - m_code;
     
    10171026        }
    10181027    }
     1028
     1029    const UChar* identifierStart = currentSourcePtr();
     1030    UChar orAllChars = 0;
     1031    ASSERT(isSingleCharacterIdentStart(m_current) || U16_IS_SURROGATE(m_current) || m_current == '\\');
     1032    while (isSingleCharacterIdentPart(m_current)) {
     1033        orAllChars |= m_current;
     1034        shift();
     1035    }
    10191036   
    1020     bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
    1021     bool isWellKnownSymbol = false;
    1022     if (isPrivateName) {
    1023         ASSERT(m_parsingBuiltinFunction);
    1024         shift();
    1025         if (m_current == '@') {
    1026             isWellKnownSymbol = true;
    1027             shift();
    1028         }
    1029     }
    1030 
    1031 
    1032     const UChar* identifierStart = currentSourcePtr();
    1033     int identifierLineStart = currentLineStartOffset();
    1034 
    1035     UChar orAllChars = 0;
    1036    
    1037     while (isIdentPart(m_current)) {
    1038         orAllChars |= m_current;
    1039         shift();
    1040     }
    1041    
    1042     if (UNLIKELY(m_current == '\\')) {
    1043         ASSERT(!isPrivateName);
    1044         setOffsetFromSourcePtr(identifierStart, identifierLineStart);
    1045         return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
    1046     }
    1047 
    1048     bool isAll8Bit = false;
    1049 
    1050     if (!(orAllChars & ~0xff))
    1051         isAll8Bit = true;
    1052 
     1037    if (UNLIKELY(U16_IS_SURROGATE(m_current) || m_current == '\\'))
     1038        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode, identifierStart);
     1039
     1040    bool isAll8Bit = !(orAllChars & ~0xff);
    10531041    const Identifier* ident = nullptr;
    10541042   
    1055     if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
     1043    if (shouldCreateIdentifier) {
    10561044        int identifierLength = currentSourcePtr() - identifierStart;
    1057         if (m_parsingBuiltinFunction && isPrivateName) {
    1058             if (isWellKnownSymbol)
    1059                 ident = &m_arena->makeIdentifier(m_vm, m_vm.propertyNames->builtinNames().lookUpWellKnownSymbol(identifierStart, identifierLength));
    1060             else
    1061                 ident = &m_arena->makeIdentifier(m_vm, m_vm.propertyNames->builtinNames().lookUpPrivateName(identifierStart, identifierLength));
    1062             if (!ident)
    1063                 return INVALID_PRIVATE_NAME_ERRORTOK;
    1064         } else {
    1065             if (isAll8Bit)
    1066                 ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
    1067             else
    1068                 ident = makeIdentifier(identifierStart, identifierLength);
    1069             if (m_parsingBuiltinFunction) {
    1070                 if (!isSafeBuiltinIdentifier(m_vm, ident)) {
    1071                     m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
    1072                     return ERRORTOK;
    1073                 }
    1074                 if (*ident == m_vm.propertyNames->undefinedKeyword)
    1075                     tokenData->ident = &m_vm.propertyNames->undefinedPrivateName;
    1076             }
    1077         }
     1045        if (isAll8Bit)
     1046            ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
     1047        else
     1048            ident = makeIdentifier(identifierStart, identifierLength);
    10781049        tokenData->ident = ident;
    10791050    } else
    10801051        tokenData->ident = nullptr;
    10811052   
    1082     if (UNLIKELY((remaining < maxTokenLength) && !lexerFlags.contains(LexerFlags::IgnoreReservedWords)) && !isPrivateName) {
     1053    if (UNLIKELY((remaining < maxTokenLength) && !lexerFlags.contains(LexerFlags::IgnoreReservedWords))) {
    10831054        ASSERT(shouldCreateIdentifier);
    10841055        if (remaining < maxTokenLength) {
     
    10961067}
    10971068
    1098 template<typename CharacterType> template<bool shouldCreateIdentifier> JSTokenType Lexer<CharacterType>::parseIdentifierSlowCase(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode)
    1099 {
    1100     tokenData->escaped = true;
    1101     auto identifierStart = currentSourcePtr();
    1102     bool bufferRequired = false;
    1103 
    1104     while (true) {
    1105         if (LIKELY(isIdentPart(m_current))) {
    1106             shift();
    1107             continue;
    1108         }
    1109         if (LIKELY(m_current != '\\'))
    1110             break;
    1111 
    1112         // \uXXXX unicode characters.
    1113         bufferRequired = true;
     1069template<typename CharacterType>
     1070template<bool shouldCreateIdentifier>
     1071JSTokenType Lexer<CharacterType>::parseIdentifierSlowCase(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode, const CharacterType* identifierStart)
     1072{
     1073    ASSERT(U16_IS_SURROGATE(m_current) || m_current == '\\');
     1074    ASSERT(m_buffer16.isEmpty());
     1075    ASSERT(!tokenData->escaped);
     1076
     1077    auto fillBuffer = [&] (bool isStart = false) {
     1078        // \uXXXX unicode characters or Surrogate pairs.
    11141079        if (identifierStart != currentSourcePtr())
    11151080            m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
    1116         shift();
    1117         if (UNLIKELY(m_current != 'u'))
    1118             return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
    1119         shift();
    1120         auto character = parseUnicodeEscape();
    1121         if (UNLIKELY(!character.isValid()))
    1122             return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
    1123         if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character.value()) : !isIdentStart(character.value())))
    1124             return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
    1125         if (shouldCreateIdentifier)
    1126             recordUnicodeCodePoint(character.value());
     1081
     1082        if (m_current == '\\') {
     1083            tokenData->escaped = true;
     1084            shift();
     1085            if (UNLIKELY(m_current != 'u'))
     1086                return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
     1087            shift();
     1088            auto character = parseUnicodeEscape();
     1089            if (UNLIKELY(!character.isValid()))
     1090                return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
     1091            if (UNLIKELY(isStart ? !isIdentStart(character.value()) : !isIdentPart(character.value())))
     1092                return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
     1093            if (shouldCreateIdentifier)
     1094                recordUnicodeCodePoint(character.value());
     1095            identifierStart = currentSourcePtr();
     1096            return IDENT;
     1097        }
     1098
     1099        ASSERT(U16_IS_SURROGATE(m_current));
     1100        if (UNLIKELY(!U16_IS_SURROGATE_LEAD(m_current)))
     1101            return INVALID_UNICODE_ENCODING_ERRORTOK;
     1102
     1103        UChar32 codePoint = currentCodePoint();
     1104        if (UNLIKELY(codePoint == U_SENTINEL))
     1105            return INVALID_UNICODE_ENCODING_ERRORTOK;
     1106        if (UNLIKELY(isStart ? !isNonLatin1IdentStart(codePoint) : !isNonLatin1IdentPart(codePoint)))
     1107            return INVALID_IDENTIFIER_UNICODE_ERRORTOK;
     1108        append16(m_code, 2);
     1109        shift();
     1110        shift();
    11271111        identifierStart = currentSourcePtr();
    1128     }
    1129 
    1130     int identifierLength;
     1112        return IDENT;
     1113    };
     1114
     1115    JSTokenType type = fillBuffer(identifierStart == currentSourcePtr());
     1116    if (UNLIKELY(type & ErrorTokenFlag))
     1117        return type;
     1118
     1119    while (true) {
     1120        if (LIKELY(isSingleCharacterIdentPart(m_current))) {
     1121            shift();
     1122            continue;
     1123        }
     1124        if (!U16_IS_SURROGATE(m_current) && m_current != '\\')
     1125            break;
     1126
     1127        type = fillBuffer();
     1128        if (UNLIKELY(type & ErrorTokenFlag))
     1129            return type;
     1130    }
     1131
    11311132    const Identifier* ident = nullptr;
    11321133    if (shouldCreateIdentifier) {
    1133         if (!bufferRequired) {
    1134             identifierLength = currentSourcePtr() - identifierStart;
    1135             ident = makeIdentifier(identifierStart, identifierLength);
    1136         } else {
    1137             if (identifierStart != currentSourcePtr())
    1138                 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
    1139             ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    1140         }
     1134        if (identifierStart != currentSourcePtr())
     1135            m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
     1136        ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    11411137
    11421138        tokenData->ident = ident;
     
    11531149        JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
    11541150        if ((token != RESERVED_IF_STRICT) || strictMode)
    1155             return bufferRequired ? UNEXPECTED_ESCAPE_ERRORTOK : token;
     1151            return UNEXPECTED_ESCAPE_ERRORTOK;
    11561152    }
    11571153
     
    19131909    if (LIKELY(isLatin1(m_current)))
    19141910        type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
    1915     else if (isNonLatin1IdentStart(m_current))
    1916         type = CharacterIdentifierStart;
    1917     else if (isLineTerminator(m_current))
    1918         type = CharacterLineTerminator;
    1919     else
    1920         type = CharacterInvalid;
     1911    else {
     1912        UChar32 codePoint;
     1913        U16_GET(m_code, 0, 0, m_codeEnd - m_code, codePoint);
     1914        if (isNonLatin1IdentStart(codePoint))
     1915            type = CharacterIdentifierStart;
     1916        else if (isLineTerminator(m_current))
     1917            type = CharacterLineTerminator;
     1918        else
     1919            type = CharacterInvalid;
     1920    }
    19211921
    19221922    switch (type) {
     
    22322232            token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
    22332233
    2234         if (UNLIKELY(isIdentStart(m_current))) {
     2234        if (LIKELY(cannotBeIdentStart(m_current))) {
     2235            m_buffer8.shrink(0);
     2236            break;
     2237        }
     2238
     2239        if (UNLIKELY(isIdentStart(currentCodePoint()))) {
    22352240            m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
    22362241            token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
     
    22632268            }
    22642269
    2265             if (UNLIKELY(isIdentStart(m_current))) {
     2270            if (LIKELY(cannotBeIdentStart(m_current))) {
     2271                if (LIKELY(token != BIGINT))
     2272                    token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
     2273                m_buffer8.shrink(0);
     2274                break;
     2275            }
     2276
     2277            if (UNLIKELY(isIdentStart(currentCodePoint()))) {
    22662278                m_lexErrorMessage = "No space between hexadecimal literal and identifier"_s;
    22672279                token = UNTERMINATED_HEX_NUMBER_ERRORTOK;
     
    22952307            }
    22962308
    2297             if (UNLIKELY(isIdentStart(m_current))) {
     2309            if (LIKELY(cannotBeIdentStart(m_current))) {
     2310                if (LIKELY(token != BIGINT))
     2311                    token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
     2312                m_buffer8.shrink(0);
     2313                break;
     2314            }
     2315
     2316            if (UNLIKELY(isIdentStart(currentCodePoint()))) {
    22982317                m_lexErrorMessage = "No space between binary literal and identifier"_s;
    22992318                token = UNTERMINATED_BINARY_NUMBER_ERRORTOK;
     
    23282347            }
    23292348
    2330             if (UNLIKELY(isIdentStart(m_current))) {
     2349            if (LIKELY(cannotBeIdentStart(m_current))) {
     2350                if (LIKELY(token != BIGINT))
     2351                    token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
     2352                m_buffer8.shrink(0);
     2353                break;
     2354            }
     2355
     2356            if (UNLIKELY(isIdentStart(currentCodePoint()))) {
    23312357                m_lexErrorMessage = "No space between octal literal and identifier"_s;
    23322358                token = UNTERMINATED_OCTAL_NUMBER_ERRORTOK;
     
    23952421        }
    23962422
    2397         if (UNLIKELY(isIdentStart(m_current))) {
     2423        if (LIKELY(cannotBeIdentStart(m_current))) {
     2424            m_buffer8.shrink(0);
     2425            break;
     2426        }
     2427
     2428        if (UNLIKELY(isIdentStart(currentCodePoint()))) {
    23982429            m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
    23992430            token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
     
    24172448        break;
    24182449        }
    2419     case CharacterIdentifierStart:
    2420         ASSERT(isIdentStart(m_current));
     2450    case CharacterIdentifierStart: {
     2451        if constexpr (ASSERT_ENABLED) {
     2452            UChar32 codePoint;
     2453            U16_GET(m_code, 0, 0, m_codeEnd - m_code, codePoint);
     2454            ASSERT(isIdentStart(codePoint));
     2455        }
    24212456        FALLTHROUGH;
     2457    }
    24222458    case CharacterBackSlash:
    24232459        parseIdent:
     
    25792615
    25802616    tokenData->pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
    2581 
    25822617    m_buffer16.shrink(0);
    2583     charactersOredTogether = 0;
    2584 
    2585     while (isIdentPart(m_current)) {
    2586         record16(m_current);
    2587         orCharacter<T>(charactersOredTogether, m_current);
    2588         shift();
    2589     }
    2590 
    2591     tokenData->flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
    2592     m_buffer16.shrink(0);
     2618
     2619    ASSERT(m_buffer8.isEmpty());
     2620    while (LIKELY(isLatin1(m_current)) && isIdentPart(static_cast<LChar>(m_current))) {
     2621        record8(static_cast<LChar>(m_current));
     2622        shift();
     2623    }
     2624
     2625    // Normally this would not be a lex error but dealing with surrogate pairs here is annoying and it's going to be an error anyway...
     2626    if (UNLIKELY(!isLatin1(m_current))) {
     2627        m_buffer8.shrink(0);
     2628        JSTokenType token = INVALID_IDENTIFIER_UNICODE_ERRORTOK;
     2629        fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
     2630        m_error = true;
     2631        String codePoint = String::fromCodePoint(currentCodePoint());
     2632        if (!codePoint)
     2633            codePoint = "`invalid unicode character`";
     2634        m_lexErrorMessage = makeString("Invalid non-latin character in RexExp literal's flags '", getToken(*tokenRecord), codePoint, "'");
     2635        return token;
     2636    }
     2637
     2638    tokenData->flags = makeIdentifier(m_buffer8.data(), m_buffer8.size());
     2639    m_buffer8.shrink(0);
    25932640
    25942641    // Since RegExp always ends with /, m_atLineStart always becomes false.
Note: See TracChangeset for help on using the changeset viewer.