Context Navigation

← Previous Change
Next Change →

Lexer.cpp

Timestamp:

Apr 21, 2012, 1:03:13 PM (13 years ago)

Author:

Darin Adler

Message:

Change JavaScript lexer to use 0 instead of -1 for sentinel, eliminating the need to put characters into ints
https://bugs.webkit.org/show_bug.cgi?id=84523

Reviewed by Oliver Hunt.

Profiles showed that checks against -1 were costly, and I saw they could be eliminated.
Streamlined this code to use standard character types and 0 rather than -1. One benefit
of this is that there's no widening and narrowing. Another is that there are many cases
where we already have the correct behavior for 0, so can eliminate a branch that was
used to test for -1 before. Also eliminates typecasts in the code.

parser/Lexer.cpp:

(JSC::Lexer::invalidCharacterMessage): Updated use of String::format since m_current is now a
character type, not an int.
(JSC::Lexer::setCode): Use 0 rather than -1 when past the end.
(JSC::Lexer::shift): Ditto. Also spruced up the comment a bit.
(JSC::Lexer::atEnd): Added. New function that distinguishes an actual 0 character from the end
of the code. This can be used in places where we used to check for -1.
(JSC::Lexer::peek): Updated to use 0 instead of -1. Removed meaningless comment.
(JSC::Lexer::parseFourDigitUnicodeHex): Changed to use character types instead of int.
(JSC::Lexer::shiftLineTerminator): Removed now-unneeded type casts. Changed local variable that
had a data-member-style name.
(JSC::Lexer::parseIdentifier): Removed now-unneeded explicit checks for -1, since the isIdentPart
function already returns false for the 0 character. Updated types in a couple other places. Used
the atEnd function where needed.
(JSC::Lexer::parseIdentifierSlowCase): More of the same.
(JSC::characterRequiresParseStringSlowCase): Added overloaded helper function for parseString.
(JSC::Lexer::parseString): Ditto.
(JSC::Lexer::parseStringSlowCase): Ditto.
(JSC::Lexer::parseMultilineComment): Ditto.
(JSC::Lexer::lex): More of the same. Also changed code to set the startOffset directly in
the tokenInfo instead of putting it in a local variable first, saving some memory access.
(JSC::Lexer::scanRegExp): Ditto.
(JSC::Lexer::skipRegExp): Ditto.

parser/Lexer.h: Changed return type of the peek function and type of m_current from int to
the character type. Added atEnd function.
(JSC::Lexer::setOffset): Used 0 instead of -1 and removed an overzealous attempt to optimize.
(JSC::Lexer::lexExpectIdentifier): Used 0 instead of -1.

File:

: 1 edited

trunk/Source/JavaScriptCore/parser/Lexer.cpp (modified) (34 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/Source/JavaScriptCore/parser/Lexer.cpp

-              r114844
+              r114845
         return "Invalid character: '`'";
     default:
         return String::format("Invalid character '\\u%04u'", m_current).impl();
+        return String::format("Invalid character '\\u%04u'", static_cast<unsigned>(m_current)).impl();
+    }
+}
 …
         m_current = *m_code;
     else
         m_current = -1;
+        m_current = 0;
     ASSERT(currentOffset() == source.startOffset());
+}
 …
 ALWAYS_INLINE void Lexer<T>::shift()
+{
+    // Faster than an if-else sequence
+    ASSERT(m_current != -1);
+    m_current = -1;
+    m_code++;
+    // At one point timing showed that setting m_current to 0 unconditionally was faster than an if-else sequence.
+    m_current = 0;
+    ++m_code;
     if (LIKELY(m_code < m_codeEnd))
         m_current = *m_code;
 …
 template <typename T>
+ALWAYS_INLINE int Lexer<T>::peek(int offset)
+ALWAYS_INLINE bool Lexer<T>::atEnd() const
+{
+    ASSERT(!m_current || m_code < m_codeEnd);
+    return UNLIKELY(UNLIKELY(!m_current) && m_code == m_codeEnd);
+}
+template <typename T>
+ALWAYS_INLINE T Lexer<T>::peek(int offset) const
+{
     ASSERT(offset > 0 && offset < 5);
     const T* code = m_code + offset;
     return (code < m_codeEnd) ? *code : -1;
+    return (code < m_codeEnd) ? *code : 0;
+}
 …
 int Lexer<T>::parseFourDigitUnicodeHex()
+{
     int char1 = peek(1);
     int char2 = peek(2);
     int char3 = peek(3);
+    T char1 = peek(1);
+    T char2 = peek(2);
+    T char3 = peek(3);
     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
 …
 void Lexer<T>::shiftLineTerminator()
+{
     ASSERT(isLineTerminator(static_cast<T>(m_current)));
     int prev = m_current;
+    ASSERT(isLineTerminator(m_current));
+    T prev = m_current;
     shift();
 …
     const LChar* identifierStart = currentCharacter();
     while (m_current != -1 && isIdentPart(static_cast<LChar>(m_current)))
+    while (isIdentPart(m_current))
         shift();
 …
     UChar orAllChars = 0;
     while (m_current != -1 && isIdentPart(static_cast<UChar>(m_current))) {
+    while (isIdentPart(m_current)) {
         orAllChars |= m_current;
         shift();
 …
     while (true) {
         if (LIKELY(m_current != -1 && isIdentPart(static_cast<T>(m_current)))) {
+        if (LIKELY(isIdentPart(m_current))) {
             shift();
             continue;
 …
+}
+static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(LChar character)
+{
+    return character < 0xE;
+}
+static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character)
+{
+    return character < 0xE || character > 0xFF;
+}
 template <typename T>
 template <bool shouldBuildStrings> ALWAYS_INLINE bool Lexer<T>::parseString(JSTokenData* tokenData, bool strictMode)
 …
     int startingOffset = currentOffset();
     int startingLineNumber = lineNumber();
     int stringQuoteCharacter = m_current;
+    T stringQuoteCharacter = m_current;
     shift();
 …
     while (m_current != stringQuoteCharacter) {
         if (UNLIKELY((m_current == '\\'))) {
+        if (UNLIKELY(m_current == '\\')) {
             if (stringStart != currentCharacter() && shouldBuildStrings)
                 append8(stringStart, currentCharacter() - stringStart);
 …
                     return false;
+                }
                 int prev = m_current;
+                T prev = m_current;
                 shift();
                 if (shouldBuildStrings)
 …
+        }
         if (UNLIKELY(((m_current > 0xff) || (m_current < 0xe)))) {
+        if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) {
             setOffset(startingOffset);
             setLineNumber(startingLineNumber);
 …
 template <bool shouldBuildStrings> bool Lexer<T>::parseStringSlowCase(JSTokenData* tokenData, bool strictMode)
+{
     int stringQuoteCharacter = m_current;
+    T stringQuoteCharacter = m_current;
     shift();
 …
                     record16(escape);
                 shift();
             } else if (UNLIKELY(isLineTerminator(static_cast<T>(m_current))))
+            } else if (UNLIKELY(isLineTerminator(m_current)))
                 shiftLineTerminator();
             else if (m_current == 'x') {
 …
                     return false;
+                }
                 int prev = m_current;
+                T prev = m_current;
                 shift();
                 if (shouldBuildStrings)
 …
             } else if (!strictMode && isASCIIOctalDigit(m_current)) {
                 // Octal character sequences
                 int character1 = m_current;
+                T character1 = m_current;
                 shift();
                 if (isASCIIOctalDigit(m_current)) {
                     // Two octal characters
                     int character2 = m_current;
+                    T character2 = m_current;
                     shift();
                     if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
 …
                         record16(character1 - '0');
+                }
             } else if (m_current != -1) {
+            } else if (!atEnd()) {
                 if (shouldBuildStrings)
                     record16(m_current);
 …
+        }
         // Fast check for characters that require special handling.
         // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently
+        // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently
         // as possible, and lets through all common ASCII characters.
         if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
             // New-line or end of input is not allowed
             if (UNLIKELY(m_current == -1) || UNLIKELY(isLineTerminator(static_cast<T>(m_current)))) {
+            if (atEnd() || isLineTerminator(m_current)) {
                 m_lexErrorMessage = "Unexpected EOF";
                 return false;
 …
+        }
         if (UNLIKELY(m_current == -1))
+        if (atEnd())
             return false;
         if (isLineTerminator(static_cast<T>(m_current))) {
+        if (isLineTerminator(m_current)) {
             shiftLineTerminator();
             m_terminator = true;
 …
 start:
+    while (m_current != -1 && isWhiteSpace(static_cast<T>(m_current)))
+        shift();
+    int startOffset = currentOffset();
+    if (UNLIKELY(m_current == -1))
+    while (isWhiteSpace(m_current))
+        shift();
+    if (atEnd())
         return EOFTOK;
+    tokenInfo->startOffset = currentOffset();
     CharacterType type;
     if (LIKELY(isLatin1(static_cast<T>(m_current))))
+    if (LIKELY(isLatin1(m_current)))
         type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
     else if (isNonLatin1IdentStart(m_current))
         type = CharacterIdentifierStart;
     else if (isLineTerminator(static_cast<T>(m_current)))
+    else if (isLineTerminator(m_current))
         type = CharacterLineTerminator;
     else
 …
         // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
         if (UNLIKELY(m_current != -1 && isIdentStart(static_cast<T>(m_current)))) {
+        if (UNLIKELY(isIdentStart(m_current))) {
             m_lexErrorMessage = "At least one digit must occur after a decimal point";
             goto returnError;
 …
         break;
     case CharacterIdentifierStart:
         ASSERT(isIdentStart(static_cast<T>(m_current)));
+        ASSERT(isIdentStart(m_current));
         // Fall through into CharacterBackSlash.
     case CharacterBackSlash:
 …
         break;
     case CharacterLineTerminator:
         ASSERT(isLineTerminator(static_cast<T>(m_current)));
+        ASSERT(isLineTerminator(m_current));
         shiftLineTerminator();
         m_atLineStart = true;
 …
 inSingleLineComment:
     while (!isLineTerminator(static_cast<T>(m_current))) {
         if (UNLIKELY(m_current == -1))
+    while (!isLineTerminator(m_current)) {
+        if (atEnd())
             return EOFTOK;
         shift();
 …
 returnToken:
     tokenInfo->line = m_lineNumber;
-    tokenInfo->startOffset = startOffset;
     tokenInfo->endOffset = currentOffset();
     m_lastToken = token;
 …
     m_error = true;
     tokenInfo->line = m_lineNumber;
-    tokenInfo->startOffset = startOffset;
     tokenInfo->endOffset = currentOffset();
     return ERRORTOK;
 …
     while (true) {
+        int current = m_current;
+        if (isLineTerminator(static_cast<T>(current)) || current == -1) {
+        if (isLineTerminator(m_current) || atEnd()) {
             m_buffer16.resize(0);
             return false;
+        }
+        shift();
+        if (current == '/' && !lastWasEscape && !inBrackets)
+            break;
+        record16(current);
+        T prev = m_current;
+        shift();
+        if (prev == '/' && !lastWasEscape && !inBrackets)
+            break;
+        record16(prev);
         if (lastWasEscape) {
 …
+        }
         switch (current) {
+        switch (prev) {
         case '[':
             inBrackets = true;
 …
     m_buffer16.resize(0);
     while (m_current != -1 && isIdentPart(static_cast<T>(m_current))) {
+    while (isIdentPart(m_current)) {
         record16(m_current);
         shift();
 …
     while (true) {
+        int current = m_current;
+        if (isLineTerminator(static_cast<T>(current)) || current == -1)
+        if (isLineTerminator(m_current) || atEnd())
             return false;
+        shift();
+        if (current == '/' && !lastWasEscape && !inBrackets)
+        T prev = m_current;
+        shift();
+        if (prev == '/' && !lastWasEscape && !inBrackets)
             break;
 …
+        }
         switch (current) {
+        switch (prev) {
         case '[':
             inBrackets = true;
 …
+    }
     while (m_current != -1 && isIdentPart(static_cast<T>(m_current)))
+    while (isIdentPart(m_current))
         shift();

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 114845 in webkit for trunk/Source/JavaScriptCore/parser/Lexer.cpp

Legend:

trunk/Source/JavaScriptCore/parser/Lexer.cpp

Download in other formats: