Ignore:
Timestamp:
Apr 29, 2015, 9:33:12 AM (10 years ago)
Author:
Darin Adler
Message:

[ES6] Implement Unicode code point escapes
https://bugs.webkit.org/show_bug.cgi?id=144377

Reviewed by Antti Koivisto.

Source/JavaScriptCore:

  • parser/Lexer.cpp: Moved the UnicodeHexValue class in here from
the header. Made it a non-member class so it doesn't need to be part
of a template. Made it use UChar32 instead of int for the value to
make it clearer what goes into this class.
(JSC::ParsedUnicodeEscapeValue::isIncomplete): Added. Replaces the
old type() function.
(JSC::Lexer<CharacterType>::parseUnicodeEscape): Renamed from
parseFourDigitUnicodeHex and added support for code point escapes.
(JSC::isLatin1): Added an overload for UChar32.
(JSC::isIdentStart): Changed this to take UChar32; no caller tries
to call it with a UChar, so no need to overload for that type for now.
(JSC::isNonLatin1IdentPart): Changed argument type to UChar32 for clarity.
Also added FIXME about a subtle ES6 change that we might want to make later.
(JSC::isIdentPart): Changed this to take UChar32; no caller tries
to call it with a UChar, so no need to overload for that type for now.
(JSC::isIdentPartIncludingEscapeTemplate): Made this a template so that we
don't need to repeat the code twice. Added code to handle code point escapes.
(JSC::isIdentPartIncludingEscape): Call the template instead of having the
code in line.
(JSC::Lexer<CharacterType>::recordUnicodeCodePoint): Added.
(JSC::Lexer<CharacterType>::parseIdentifierSlowCase): Made small tweaks and
updated to call parseUnicodeEscape instead of parseFourDigitUnicodeHex.
(JSC::Lexer<CharacterType>::parseComplexEscape): Call parseUnicodeEscape
instead of parseFourDigitUnicodeHex. Move the code to handle "\u" before
the code that handles the escapes, since the code point escape code now
consumes characters while parsing rather than peeking ahead. Test case
covers this: Symptom would be that "\u{" would evaluate to "u" instead of
giving a syntax error.

  • parser/Lexer.h: Updated for above changes.
  • runtime/StringConstructor.cpp:

(JSC::stringFromCodePoint): Use ICU's UCHAR_MAX_VALUE instead of writing
out 0x10FFFF; clearer this way.

Source/WebCore:

Test: js/unicode-escape-sequences.html

  • css/CSSParser.cpp:

(WebCore::CSSParser::parseEscape): Use ICU's UCHAR_MAX_VALUE instead of writing
out 0x10FFFF; clearer this way. Also use our replacementCharacter instead of
writing out 0xFFFD.

  • html/parser/HTMLEntityParser.cpp:

(WebCore::isAlphaNumeric): Deleted.
(WebCore::HTMLEntityParser::legalEntityFor): Use ICU's UCHAR_MAX_VALUE and
U_IS_SURROGATE instead of writing the code out. Didn't use U_IS_UNICODE_CHAR
because that also includes U_IS_UNICODE_NONCHAR and thus would change behavior,
but maybe it's something we want to do in the future.
(WebCore::HTMLEntityParser::consumeNamedEntity): Use isASCIIAlphanumeric instead
of the function in this file that does the same thing less efficiently.

  • html/parser/InputStreamPreprocessor.h:

(WebCore::InputStreamPreprocessor::processNextInputCharacter): Use
replacementCharacter from CharacterNames.h instead of writing out 0xFFFD.

  • xml/parser/CharacterReferenceParserInlines.h:

(WebCore::consumeCharacterReference): Use ICU's UCHAR_MAX_VALUE instead of
defining our own local highestValidCharacter constant.

LayoutTests:

  • js/script-tests/unicode-escape-sequences.js: Added.
  • js/unicode-escape-sequences-expected.txt: Added.
  • js/unicode-escape-sequences.html: Added. Generated with make-script-test-wrappers.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/JavaScriptCore/parser/Lexer.cpp

    r183373 r183552  
    611611}
    612612
    613 template <typename T>
    614 typename Lexer<T>::UnicodeHexValue Lexer<T>::parseFourDigitUnicodeHex()
    615 {
    616     T char1 = peek(1);
    617     T char2 = peek(2);
    618     T char3 = peek(3);
    619 
    620     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
    621         return UnicodeHexValue((m_code + 4) >= m_codeEnd ? UnicodeHexValue::IncompleteHex : UnicodeHexValue::InvalidHex);
    622 
    623     int result = convertUnicode(m_current, char1, char2, char3);
     613struct ParsedUnicodeEscapeValue {
     614    ParsedUnicodeEscapeValue(UChar32 value)
     615        : m_value(value)
     616    {
     617        ASSERT(isValid());
     618    }
     619
     620    enum SpecialValueType { Incomplete = -2, Invalid = -1 };
     621    ParsedUnicodeEscapeValue(SpecialValueType type)
     622        : m_value(type)
     623    {
     624    }
     625
     626    bool isValid() const { return m_value >= 0; }
     627    bool isIncomplete() const { return m_value == Incomplete; }
     628
     629    UChar32 value() const
     630    {
     631        ASSERT(isValid());
     632        return m_value;
     633    }
     634
     635private:
     636    UChar32 m_value;
     637};
     638
     639template<typename CharacterType> ParsedUnicodeEscapeValue Lexer<CharacterType>::parseUnicodeEscape()
     640{
     641    if (m_current == '{') {
     642        shift();
     643        UChar32 codePoint = 0;
     644        do {
     645            if (!isASCIIHexDigit(m_current))
     646                return m_current ? ParsedUnicodeEscapeValue::Invalid : ParsedUnicodeEscapeValue::Incomplete;
     647            codePoint = (codePoint << 4) | toASCIIHexValue(m_current);
     648            if (codePoint > UCHAR_MAX_VALUE)
     649                return ParsedUnicodeEscapeValue::Invalid;
     650            shift();
     651        } while (m_current != '}');
     652        shift();
     653        return codePoint;
     654    }
     655
     656    auto character2 = peek(1);
     657    auto character3 = peek(2);
     658    auto character4 = peek(3);
     659    if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(character2) || !isASCIIHexDigit(character3) || !isASCIIHexDigit(character4)))
     660        return (m_code + 4) >= m_codeEnd ? ParsedUnicodeEscapeValue::Incomplete : ParsedUnicodeEscapeValue::Invalid;
     661    auto result = convertUnicode(m_current, character2, character3, character4);
    624662    shift();
    625663    shift();
    626664    shift();
    627665    shift();
    628     return UnicodeHexValue(result);
     666    return result;
    629667}
    630668
     
    666704}
    667705
     706static ALWAYS_INLINE bool isLatin1(UChar32 c)
     707{
     708    return !(c & ~0xFF);
     709}
     710
    668711static inline bool isIdentStart(LChar c)
    669712{
     
    671714}
    672715
    673 static inline bool isIdentStart(UChar c)
     716static inline bool isIdentStart(UChar32 c)
    674717{
    675718    return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
    676719}
    677720
    678 static NEVER_INLINE bool isNonLatin1IdentPart(int c)
    679 {
     721static NEVER_INLINE bool isNonLatin1IdentPart(UChar32 c)
     722{
     723    // FIXME: ES6 says this should be based on the Unicode property ID_Continue now instead.
    680724    return (U_GET_GC_MASK(c) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || c == 0x200C || c == 0x200D;
    681725}
     
    689733}
    690734
     735static ALWAYS_INLINE bool isIdentPart(UChar32 c)
     736{
     737    return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
     738}
     739
    691740static ALWAYS_INLINE bool isIdentPart(UChar c)
    692741{
    693     return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
    694 }
    695 
    696 template <typename T>
    697 bool isUnicodeEscapeIdentPart(const T* code)
    698 {
    699     T char1 = code[0];
    700     T char2 = code[1];
    701     T char3 = code[2];
    702     T char4 = code[3];
    703    
    704     if (!isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3) || !isASCIIHexDigit(char4))
     742    return isIdentPart(static_cast<UChar32>(c));
     743}
     744
     745template<typename CharacterType> ALWAYS_INLINE bool isIdentPartIncludingEscapeTemplate(const CharacterType* code, const CharacterType* codeEnd)
     746{
     747    if (isIdentPart(code[0]))
     748        return true;
     749
     750    // Shortest sequence handled below is \u{0}, which is 5 characters.
     751    if (!(code[0] == '\\' && codeEnd - code >= 5 && code[1] == 'u'))
    705752        return false;
    706    
    707     return isIdentPart(Lexer<T>::convertUnicode(char1, char2, char3, char4));
     753
     754    if (code[2] == '{') {
     755        UChar32 codePoint = 0;
     756        const CharacterType* pointer;
     757        for (pointer = &code[3]; pointer < codeEnd; ++pointer) {
     758            auto digit = *pointer;
     759            if (!isASCIIHexDigit(digit))
     760                break;
     761            codePoint = (codePoint << 4) | toASCIIHexValue(digit);
     762            if (codePoint > UCHAR_MAX_VALUE)
     763                return false;
     764        }
     765        return isIdentPart(codePoint) && pointer < codeEnd && *pointer == '}';
     766    }
     767
     768    // Shortest sequence handled below is \uXXXX, which is 6 characters.
     769    if (codeEnd - code < 6)
     770        return false;
     771
     772    auto character1 = code[2];
     773    auto character2 = code[3];
     774    auto character3 = code[4];
     775    auto character4 = code[5];
     776    return isASCIIHexDigit(character1) && isASCIIHexDigit(character2) && isASCIIHexDigit(character3) && isASCIIHexDigit(character4)
     777        && isIdentPart(Lexer<LChar>::convertUnicode(character1, character2, character3, character4));
    708778}
    709779
    710780static ALWAYS_INLINE bool isIdentPartIncludingEscape(const LChar* code, const LChar* codeEnd)
    711781{
    712     if (isIdentPart(*code))
    713         return true;
    714 
    715     return (*code == '\\' && ((codeEnd - code) >= 6) && code[1] == 'u' && isUnicodeEscapeIdentPart(code+2));
     782    return isIdentPartIncludingEscapeTemplate(code, codeEnd);
    716783}
    717784
    718785static ALWAYS_INLINE bool isIdentPartIncludingEscape(const UChar* code, const UChar* codeEnd)
    719786{
    720     if (isIdentPart(*code))
    721         return true;
    722    
    723     return (*code == '\\' && ((codeEnd - code) >= 6) && code[1] == 'u' && isUnicodeEscapeIdentPart(code+2));
     787    return isIdentPartIncludingEscapeTemplate(code, codeEnd);
    724788}
    725789
     
    800864}
    801865   
     866template<typename CharacterType> inline void Lexer<CharacterType>::recordUnicodeCodePoint(UChar32 codePoint)
     867{
     868    ASSERT(codePoint >= 0);
     869    ASSERT(codePoint <= UCHAR_MAX_VALUE);
     870    if (U_IS_BMP(codePoint))
     871        record16(codePoint);
     872    else {
     873        UChar codeUnits[2] = { U16_LEAD(codePoint), U16_TRAIL(codePoint) };
     874        append16(codeUnits, 2);
     875    }
     876}
     877
    802878#if !ASSERT_DISABLED
    803879bool isSafeBuiltinIdentifier(VM& vm, const Identifier* ident)
     
    808884     * be used as a safety net while implementing builtins.
    809885     */
     886    // FIXME: How can a debug-only assertion be a safety net?
    810887    if (*ident == vm.propertyNames->builtinNames().callPublicName())
    811888        return false;
     
    9611038}
    9621039
    963 template <typename T>
    964 template <bool shouldCreateIdentifier> JSTokenType Lexer<T>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
     1040template<typename CharacterType> template<bool shouldCreateIdentifier> JSTokenType Lexer<CharacterType>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
    9651041{
    9661042    const ptrdiff_t remaining = m_codeEnd - m_code;
    967     const T* identifierStart = currentSourcePtr();
     1043    auto identifierStart = currentSourcePtr();
    9681044    bool bufferRequired = false;
    9691045
     
    9841060            return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
    9851061        shift();
    986         UnicodeHexValue character = parseFourDigitUnicodeHex();
     1062        auto character = parseUnicodeEscape();
    9871063        if (UNLIKELY(!character.isValid()))
    988             return character.valueType() == UnicodeHexValue::IncompleteHex ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
    989         UChar ucharacter = static_cast<UChar>(character.value());
    990         if (UNLIKELY(m_buffer16.size() ? !isIdentPart(ucharacter) : !isIdentStart(ucharacter)))
     1064            return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
     1065        if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character.value()) : !isIdentStart(character.value())))
    9911066            return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
    9921067        if (shouldCreateIdentifier)
    993             record16(ucharacter);
     1068            recordUnicodeCodePoint(character.value());
    9941069        identifierStart = currentSourcePtr();
    9951070    }
    9961071
    9971072    int identifierLength;
    998     const Identifier* ident = 0;
     1073    const Identifier* ident = nullptr;
    9991074    if (shouldCreateIdentifier) {
    10001075        if (!bufferRequired) {
     
    10091084        tokenData->ident = ident;
    10101085    } else
    1011         tokenData->ident = 0;
     1086        tokenData->ident = nullptr;
    10121087
    10131088    if (LIKELY(!bufferRequired && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
     
    11261201    if (m_current == 'u') {
    11271202        shift();
    1128         UnicodeHexValue character = parseFourDigitUnicodeHex();
    1129         if (character.isValid()) {
    1130             if (shouldBuildStrings)
    1131                 record16(character.value());
    1132             return StringParsedSuccessfully;
    1133         }
    11341203
    11351204        if (escapeParseMode == EscapeParseMode::String && m_current == stringQuoteCharacter) {
     
    11391208        }
    11401209
     1210        auto character = parseUnicodeEscape();
     1211        if (character.isValid()) {
     1212            if (shouldBuildStrings)
     1213                recordUnicodeCodePoint(character.value());
     1214            return StringParsedSuccessfully;
     1215        }
     1216
    11411217        m_lexErrorMessage = ASCIILiteral("\\u can only be followed by a Unicode character sequence");
    1142         return character.valueType() == UnicodeHexValue::IncompleteHex ? StringUnterminated : StringCannotBeParsed;
     1218        return character.isIncomplete() ? StringUnterminated : StringCannotBeParsed;
    11431219    }
    11441220
Note: See TracChangeset for help on using the changeset viewer.