Changeset 65302 in webkit for trunk/JavaScriptCore/runtime/UString.cpp
- Timestamp:
- Aug 12, 2010, 11:42:16 PM (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/runtime/UString.cpp
r65295 r65302 258 258 } 259 259 260 // FIXME: If tolerateTrailingJunk is true, then we want to tolerate junk 261 // after the number, even if it contains invalid UTF-16 sequences. So we 262 // shouldn't use the UTF8String function, which returns null when it 263 // encounters invalid UTF-16. Further, we have no need to convert the 264 // non-ASCII characters to UTF-8, so the UTF8String does quite a bit of 265 // unnecessary work. 266 267 // FIXME: The space skipping code below skips only ASCII spaces, but callers 268 // need to skip all StrWhiteSpace. The isStrWhiteSpace function does the 269 // right thing but requires UChar, not char, for its argument. 270 260 271 const UChar* data = this->characters(); 261 272 const UChar* end = data + size; … … 584 595 } 585 596 586 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. 587 static inline void putUTF8Triple(char*& buffer, UChar ch) 588 { 589 ASSERT(ch >= 0x0800); 590 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 591 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 592 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 593 } 594 595 CString UString::utf8(bool strict) const 596 { 597 unsigned length = this->length(); 598 const UChar* characters = this->characters(); 599 600 // Allocate a buffer big enough to hold all the characters 601 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 602 // Optimization ideas, if we find this function is hot: 603 // * We could speculatively create a CStringBuffer to contain 'length' 604 // characters, and resize if necessary (i.e. if the buffer contains 605 // non-ascii characters). (Alternatively, scan the buffer first for 606 // ascii characters, so we know this will be sufficient). 607 // * We could allocate a CStringBuffer with an appropriate size to 608 // have a good chance of being able to write the string into the 609 // buffer without reallocing (say, 1.5 x length). 
610 Vector<char, 1024> bufferVector(length * 3); 611 612 char* buffer = bufferVector.data(); 613 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict); 614 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion 615 616 if (result == sourceIllegal) // Only produced from strict conversion. 597 CString UString::UTF8String(bool strict) const 598 { 599 // Allocate a buffer big enough to hold all the characters. 600 const unsigned len = length(); 601 Vector<char, 1024> buffer(len * 3); 602 603 // Convert to runs of 8-bit characters. 604 char* p = buffer.data(); 605 const UChar* d = reinterpret_cast<const UChar*>(&characters()[0]); 606 ConversionResult result = convertUTF16ToUTF8(&d, d + len, &p, p + buffer.size(), strict); 607 if (result != conversionOK) 617 608 return CString(); 618 609 619 // If a high surrogate is left unconverted, treat it the same way as an unpaired high surrogate 620 // would have been handled in the middle of a string with non-strict conversion - which is to say, 621 // simply encode it to UTF-8. 622 if (result == sourceExhausted) { 623 // This should be one unpaired high surrogate. 624 ASSERT((characters + 1) == (this->characters() + length)); 625 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); 626 // There should be room left, since one UChar hasn't been converted. 627 ASSERT((buffer + 3) <= (buffer + bufferVector.size())); 628 putUTF8Triple(buffer, *characters); 629 } 630 631 return CString(bufferVector.data(), buffer - bufferVector.data()); 610 return CString(buffer.data(), p - buffer.data()); 632 611 } 633 612
Note:
See TracChangeset
for help on using the changeset viewer.