Ignore:
Timestamp:
Jul 3, 2010, 1:30:24 PM (15 years ago)
Author:
[email protected]
Message:

Move BOM handling out of the lexer and parser
https://bugs.webkit.org/show_bug.cgi?id=41539

Reviewed by Geoffrey Garen.

JavaScriptCore:

Doing the BOM stripping in the lexer meant that we could
end up having to strip the BOMs from a source multiple times.
To deal with this we now require all strings provided by
a SourceProvider to already have had the BOMs stripped.
This also simplifies some of the lexer logic.

  • parser/Lexer.cpp:

(JSC::Lexer::setCode):
(JSC::Lexer::sourceCode):

  • parser/SourceProvider.h:

(JSC::SourceProvider::SourceProvider):
(JSC::UStringSourceProvider::create):
(JSC::UStringSourceProvider::getRange):
(JSC::UStringSourceProvider::UStringSourceProvider):

  • wtf/text/StringImpl.h:

(WebCore::StringImpl::copyStringWithoutBOMs):

WebCore:

Update WebCore to ensure that SourceProviders don't
produce strings with BOMs in them.

  • bindings/js/ScriptSourceProvider.h:

(WebCore::ScriptSourceProvider::ScriptSourceProvider):

  • bindings/js/StringSourceProvider.h:

(WebCore::StringSourceProvider::StringSourceProvider):

  • loader/CachedScript.cpp:

(WebCore::CachedScript::CachedScript):
(WebCore::CachedScript::script):

  • loader/CachedScript.h:

(WebCore::CachedScript::):

CachedScript now stores decoded data with the BOMs stripped,
and caches the presence of BOMs across memory purges.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/parser/Lexer.cpp

    r62416 r62449  
    46 46 namespace JSC {
    47 47
    48 static const UChar byteOrderMark = 0xFEFF;
    49 48
    50 49 enum CharacterTypes {
     
    257 256     m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
    258 257
    259     // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters.
    260     // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details.
    261     if (source.provider()->hasBOMs()) {
    262         for (const UChar* p = m_codeStart; p < m_codeEnd; ++p) {
    263             if (UNLIKELY(*p == byteOrderMark)) {
    264                 copyCodeWithoutBOMs();
    265                 break;
    266             }
    267         }
    268     }
    269 
    270 258     if (LIKELY(m_code < m_codeEnd))
    271 259         m_current = *m_code;
     
    273 261         m_current = -1;
    274 262     ASSERT(currentOffset() == source.startOffset());
    275 }
    276 
    277 void Lexer::copyCodeWithoutBOMs()
    278 {
    279     // Note: In this case, the character offset data for debugging will be incorrect.
    280     // If it's important to correctly debug code with extraneous BOMs, then the caller
    281     // should strip the BOMs when creating the SourceProvider object and do its own
    282     // mapping of offsets within the stripped text to original text offset.
    283 
    284     m_codeWithoutBOMs.reserveCapacity(m_codeEnd - m_code);
    285     for (const UChar* p = m_code; p < m_codeEnd; ++p) {
    286         UChar c = *p;
    287         if (c != byteOrderMark)
    288             m_codeWithoutBOMs.append(c);
    289     }
    290     ptrdiff_t startDelta = m_codeStart - m_code;
    291     m_code = m_codeWithoutBOMs.data();
    292     m_codeStart = m_code + startDelta;
    293     m_codeEnd = m_codeWithoutBOMs.data() + m_codeWithoutBOMs.size();
    294 263 }
    295 264
     
    1181 1150 SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
    1182 1151 {
    1183     if (m_codeWithoutBOMs.isEmpty())
    1184         return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
    1185 
    1186     const UChar* data = m_source->provider()->data();
    1187    
    1188     ASSERT(openBrace < closeBrace);
    1189     int i;
    1190     for (i = m_source->startOffset(); i < openBrace; ++i) {
    1191         if (data[i] == byteOrderMark) {
    1192             openBrace++;
    1193             closeBrace++;
    1194         }
    1195     }
    1196     for (; i < closeBrace; ++i) {
    1197         if (data[i] == byteOrderMark)
    1198             closeBrace++;
    1199     }
    1200 
    1201     ASSERT(openBrace < closeBrace);
    1202 
    1203 1152     return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
    1204 1153 }
Note: See TracChangeset for help on using the changeset viewer.