Changeset 27746 in webkit for trunk/JavaScriptCore


Ignore:
Timestamp:
Nov 12, 2007, 11:12:55 PM (18 years ago)
Author:
[email protected]
Message:

Reviewed by Darin.

http://bugs.webkit.org/show_bug.cgi?id=15953
Add UTF-8 encoding/decoding to WTF

  • kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
  • kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient parameter. Callers are not interested in getting decoding results in strict mode, so this allows for bailing out as soon as an error is seen.
  • kjs/function.cpp: (KJS::encode): Updated for new UString::UTF8String() signature.
  • API/JSStringRef.cpp: (JSStringCreateWithCharacters): Disambiguate UChar. (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
  • bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().
  • wtf/unicode/UTF8.cpp: Added. (WTF::Unicode::inlineUTF8SequenceLengthNonASCII): (WTF::Unicode::inlineUTF8SequenceLength): (WTF::Unicode::UTF8SequenceLength): (WTF::Unicode::decodeUTF8Sequence): (WTF::Unicode::): (WTF::Unicode::ConvertUTF16ToUTF8): (WTF::Unicode::isLegalUTF8): (WTF::Unicode::ConvertUTF8ToUTF16):
  • wtf/unicode/UTF8.h: Added. (WTF::Unicode::): Some code moved from ustring.h, some adapted from unicode.org sources.
Location:
trunk/JavaScriptCore
Files:
2 added
11 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/API/JSStringRef.cpp

    r27730 r27746  
    3737#include <kjs/ustring.h>
    3838#include <kjs/value.h>
     39#include <wtf/unicode/UTF8.h>
    3940
    4041using namespace KJS;
     42using namespace WTF::Unicode;
    4143
    4244JSStringRef JSStringCreateWithCharacters(const JSChar* chars, size_t numChars)
    4345{
    4446    JSLock lock;
    45     return toRef(UString(reinterpret_cast<const UChar*>(chars), static_cast<int>(numChars)).rep()->ref());
     47    return toRef(UString(reinterpret_cast<const KJS::UChar*>(chars), static_cast<int>(numChars)).rep()->ref());
    4648}
    4749
     
    4951{
    5052    JSLock lock;
    51     // FIXME: <rdar://problem/4949018>
    52     return toRef(UString(string).rep()->ref());
     53
     54    size_t length = strlen(string);
     55    Vector< ::UChar, 1024> buffer(length);
     56    ::UChar* p = buffer.data();
     57    ConvertUTF8ToUTF16(&string, string + length, &p, p + length, false);
     58
     59    return toRef(UString(reinterpret_cast<KJS::UChar*>(buffer.data()), p - buffer.data()).rep()->ref());
    5360}
    5461
  • trunk/JavaScriptCore/ChangeLog

    r27745 r27746  
     12007-11-12  Alexey Proskuryakov  <[email protected]>
     2
     3        Reviewed by Darin.
     4
     5        http://bugs.webkit.org/show_bug.cgi?id=15953
     6        Add UTF-8 encoding/decoding to WTF
     7
     8        * kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
     9        * kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient
     10        parameter. Callers are not interested in getting decoding results in strict mode, so
     11        this allows for bailing out as soon as an error is seen.
     12
     13        * kjs/function.cpp:
     14        (KJS::encode): Updated for new UString::UTF8String() signature.
     15
     16        * API/JSStringRef.cpp:
     17        (JSStringCreateWithCharacters): Disambiguate UChar.
     18        (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
     19        * bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().
     20
     21        * wtf/unicode/UTF8.cpp: Added.
     22        (WTF::Unicode::inlineUTF8SequenceLengthNonASCII):
     23        (WTF::Unicode::inlineUTF8SequenceLength):
     24        (WTF::Unicode::UTF8SequenceLength):
     25        (WTF::Unicode::decodeUTF8Sequence):
     26        (WTF::Unicode::):
     27        (WTF::Unicode::ConvertUTF16ToUTF8):
     28        (WTF::Unicode::isLegalUTF8):
     29        (WTF::Unicode::ConvertUTF8ToUTF16):
     30        * wtf/unicode/UTF8.h: Added.
     31        (WTF::Unicode::):
     32        Some code moved from ustring.h, some adapted from unicode.org sources.
     33
     34        * JavaScriptCore.exp:
     35        * JavaScriptCore.pri:
     36        * JavaScriptCore.vcproj/WTF/WTF.vcproj:
     37        * JavaScriptCore.xcodeproj/project.pbxproj:
     38        * JavaScriptCoreSources.bkl:
     39        Added UTF8.{h,cpp}
     40
    1412007-11-12  Josh Aas  <[email protected]>
    242
  • trunk/JavaScriptCore/JavaScriptCore.exp

    r27711 r27746  
    260260__ZNK3KJS7JSValue7toFloatEPNS_9ExecStateE
    261261__ZNK3KJS7JSValue9toIntegerEPNS_9ExecStateE
    262 __ZNK3KJS7UString10UTF8StringEv
     262__ZNK3KJS7UString10UTF8StringEb
    263263__ZNK3KJS7UString14toStrictUInt32EPb
    264264__ZNK3KJS7UString5asciiEv
  • trunk/JavaScriptCore/JavaScriptCore.pri

    r27686 r27746  
    3434    wtf/HashTable.cpp \
    3535    wtf/FastMalloc.cpp \
     36    wtf/unicode/UTF8.cpp \
    3637    bindings/NP_jsobject.cpp \
    3738    bindings/npruntime.cpp \
  • trunk/JavaScriptCore/JavaScriptCore.vcproj/WTF/WTF.vcproj

    r26787 r27746  
    312312                        >
    313313                </File>
     314                <File
     315                        RelativePath="..\..\wtf\unicode\UTF8.h"
     316                        >
     317                </File>
     318                <File
     319                        RelativePath="..\..\wtf\unicode\UTF8.cpp"
     320                        >
     321                </File>
    314322        </Files>
    315323        <Globals>
  • trunk/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj

    r27687 r27746  
    236236                E195679609E7CF1200B89D13 /* UnicodeIcu.h in Headers */ = {isa = PBXBuildFile; fileRef = E195678F09E7CF1200B89D13 /* UnicodeIcu.h */; settings = {ATTRIBUTES = (Private, ); }; };
    237237                E195679809E7CF1200B89D13 /* Unicode.h in Headers */ = {isa = PBXBuildFile; fileRef = E195679409E7CF1200B89D13 /* Unicode.h */; settings = {ATTRIBUTES = (Private, ); }; };
     238                E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E1EF79A80CE97BA60088D500 /* UTF8.cpp */; };
     239                E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */ = {isa = PBXBuildFile; fileRef = E1EF79A90CE97BA60088D500 /* UTF8.h */; };
    238240/* End PBXBuildFile section */
    239241
     
    590592                E195678F09E7CF1200B89D13 /* UnicodeIcu.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UnicodeIcu.h; sourceTree = "<group>"; };
    591593                E195679409E7CF1200B89D13 /* Unicode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Unicode.h; sourceTree = "<group>"; };
     594                E1EF79A80CE97BA60088D500 /* UTF8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UTF8.cpp; sourceTree = "<group>"; };
     595                E1EF79A90CE97BA60088D500 /* UTF8.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UTF8.h; sourceTree = "<group>"; };
    592596                F5BB2BC5030F772101FCFE1D /* completion.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = completion.h; sourceTree = "<group>"; tabWidth = 8; };
    593597                F5C290E60284F98E018635CA /* JavaScriptCorePrefix.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; name = JavaScriptCorePrefix.h; path = ../JavaScriptCorePrefix.h; sourceTree = "<group>"; tabWidth = 8; };
     
    10881092                                E195678E09E7CF1200B89D13 /* icu */,
    10891093                                E195679409E7CF1200B89D13 /* Unicode.h */,
     1094                                E1EF79A90CE97BA60088D500 /* UTF8.h */,
     1095                                E1EF79A80CE97BA60088D500 /* UTF8.cpp */,
    10901096                        );
    10911097                        path = unicode;
     
    12541260                                932F5B5C0822A1C700736975 /* ustring.h in Headers */,
    12551261                                14ABB36F099C076400E2A24F /* value.h in Headers */,
     1262                                E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */,
    12561263                        );
    12571264                        runOnlyForDeploymentPostprocessing = 0;
     
    13691376                        isa = PBXProject;
    13701377                        buildConfigurationList = 149C277108902AFE008A9EFC /* Build configuration list for PBXProject "JavaScriptCore" */;
    1371                         compatibilityVersion = "Xcode 2.4";
    13721378                        hasScannedForEncodings = 1;
    13731379                        mainGroup = 0867D691FE84028FC02AAC07 /* JavaScriptCore */;
     
    15421548                                932F5BBA0822A1C700736975 /* runtime_object.cpp in Sources */,
    15431549                                932F5BC50822A1C700736975 /* runtime_root.cpp in Sources */,
     1550                                E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */,
    15441551                        );
    15451552                        runOnlyForDeploymentPostprocessing = 0;
  • trunk/JavaScriptCore/JavaScriptCoreSources.bkl

    r27686 r27746  
    114114        wtf/HashTable.cpp
    115115        wtf/TCSystemAlloc.cpp
     116        wtf/unicode/UTF8.cpp
    116117    </set>
    117118
  • trunk/JavaScriptCore/bindings/c/c_utility.cpp

    r27022 r27746  
    3939#include "runtime_root.h"
    4040#include "Platform.h"
    41 #if USE(ICU_UNICODE)
    42 #include <unicode/ucnv.h>
    43 #endif
    4441#include <wtf/Assertions.h>
     42#include <wtf/unicode/UTF8.h>
     43
     44using namespace WTF::Unicode;
    4545
    4646namespace KJS { namespace Bindings {
     
    5353
    5454// Requires free() of returned UTF16Chars.
    55 void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length)
     55void convertUTF8ToUTF16(const NPUTF8* UTF8Chars, int UTF8Length, NPUTF16** UTF16Chars, unsigned int* UTF16Length)
    5656{
    57 #if USE(ICU_UNICODE)
    5857    ASSERT(UTF8Chars || UTF8Length == 0);
    5958    ASSERT(UTF16Chars);
     
    6160    if (UTF8Length == -1)
    6261        UTF8Length = static_cast<int>(strlen(UTF8Chars));
    63        
    64     // UTF16Length maximum length is the length of the UTF8 string, plus one to include terminator
    65     // Without the plus one, it will convert ok, but a warning is generated from the converter as
    66     // there is not enough room for a terminating character.
    67     *UTF16Length = UTF8Length + 1;
    68        
    69     *UTF16Chars = 0;
    70     UErrorCode status = U_ZERO_ERROR;
    71     UConverter* conv = ucnv_open("utf8", &status);
    72     if (U_SUCCESS(status)) {
    73         *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length));
    74         ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status);
    75         *UTF16Length = ucnv_toUChars(conv, (::UChar*)*UTF16Chars, *UTF16Length, UTF8Chars, UTF8Length, &status);
    76         ucnv_close(conv);
    77     }
     62
     63    *UTF16Length = UTF8Length;
     64    *UTF16Chars = static_cast<NPUTF16*>(malloc(sizeof(NPUTF16) * (*UTF16Length)));
    7865   
     66    const char* sourcestart = UTF8Chars;
     67    const char* sourceend = sourcestart + UTF8Length;
     68
     69    ::UChar* targetstart = reinterpret_cast< ::UChar*>(*UTF16Chars);
     70    ::UChar* targetend = targetstart + UTF8Length;
     71   
     72    ConversionResult result = ConvertUTF8ToUTF16(&sourcestart, sourceend, &targetstart, targetend, true);
     73   
     74    *UTF16Length = targetstart - *UTF16Chars;
     75
    7976    // Check to see if the conversion was successful
    8077    // Some plugins return invalid UTF-8 in NPVariantType_String, see <http://bugs.webkit.org/show_bug.cgi?id=5163>
    8178    // There is no "bad data" for latin1. It is unlikely that the plugin was really sending text in this encoding,
    8279    // but it should have used UTF-8, and now we are simply avoiding a crash.
    83     if (!U_SUCCESS(status)) {
     80    if (result != conversionOK) {
    8481        *UTF16Length = UTF8Length;
    8582       
    8683        if (!*UTF16Chars)   // If the memory wasn't allocated, allocate it.
    87             *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length));
     84            *UTF16Chars = (NPUTF16*)malloc(sizeof(NPUTF16) * (*UTF16Length));
    8885 
    8986        for (unsigned i = 0; i < *UTF16Length; i++)
    9087            (*UTF16Chars)[i] = UTF8Chars[i] & 0xFF;
    9188    }
    92 #else
    93     ASSERT(!"Implement me!");   
    94 #endif
    9589}
    9690
  • trunk/JavaScriptCore/kjs/function.cpp

    r27448 r27746  
    4343#include <wtf/Assertions.h>
    4444#include <wtf/MathExtras.h>
    45 #include <wtf/unicode/Unicode.h>
     45#include <wtf/unicode/UTF8.h>
    4646
    4747using namespace WTF;
     
    515515{
    516516  UString r = "", s, str = args[0]->toString(exec);
    517   bool wasGoodUTF16;
    518   CString cstr = str.UTF8String(&wasGoodUTF16);
    519   if (!wasGoodUTF16)
     517  CString cstr = str.UTF8String(true);
     518  if (!cstr.c_str())
    520519    return throwError(exec, URIError, "String contained an illegal UTF-16 sequence.");
    521520  const char* p = cstr.c_str();
  • trunk/JavaScriptCore/kjs/ustring.cpp

    r27406 r27746  
    12721272}
    12731273
    1274 inline int inlineUTF8SequenceLengthNonASCII(char b0)
    1275 {
    1276   if ((b0 & 0xC0) != 0xC0)
    1277     return 0;
    1278   if ((b0 & 0xE0) == 0xC0)
    1279     return 2;
    1280   if ((b0 & 0xF0) == 0xE0)
    1281     return 3;
    1282   if ((b0 & 0xF8) == 0xF0)
    1283     return 4;
    1284   return 0;
    1285 }
    1286 
    1287 int UTF8SequenceLengthNonASCII(char b0)
    1288 {
    1289   return inlineUTF8SequenceLengthNonASCII(b0);
    1290 }
    1291 
    1292 inline int inlineUTF8SequenceLength(char b0)
    1293 {
    1294   return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
    1295 }
    1296 
    1297 // Given a first byte, gives the length of the UTF-8 sequence it begins.
    1298 // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
    1299 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
    1300 int UTF8SequenceLength(char b0)
    1301 {
    1302   return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
    1303 }
    1304 
    1305 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
    1306 // Only allows Unicode characters (U-00000000 to U-0010FFFF).
    1307 // Returns -1 if the sequence is not valid (including presence of extra bytes).
    1308 int decodeUTF8Sequence(const char *sequence)
    1309 {
    1310   // Handle 0-byte sequences (never valid).
    1311   const unsigned char b0 = sequence[0];
    1312   const int length = inlineUTF8SequenceLength(b0);
    1313   if (length == 0)
    1314     return -1;
    1315 
    1316   // Handle 1-byte sequences (plain ASCII).
    1317   const unsigned char b1 = sequence[1];
    1318   if (length == 1) {
    1319     if (b1)
    1320       return -1;
    1321     return b0;
    1322   }
    1323 
    1324   // Handle 2-byte sequences.
    1325   if ((b1 & 0xC0) != 0x80)
    1326     return -1;
    1327   const unsigned char b2 = sequence[2];
    1328   if (length == 2) {
    1329     if (b2)
    1330       return -1;
    1331     const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
    1332     if (c < 0x80)
    1333       return -1;
    1334     return c;
    1335   }
    1336 
    1337   // Handle 3-byte sequences.
    1338   if ((b2 & 0xC0) != 0x80)
    1339     return -1;
    1340   const unsigned char b3 = sequence[3];
    1341   if (length == 3) {
    1342     if (b3)
    1343       return -1;
    1344     const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
    1345     if (c < 0x800)
    1346       return -1;
    1347     // UTF-16 surrogates should never appear in UTF-8 data.
    1348     if (c >= 0xD800 && c <= 0xDFFF)
    1349       return -1;
    1350     return c;
    1351   }
    1352 
    1353   // Handle 4-byte sequences.
    1354   if ((b3 & 0xC0) != 0x80)
    1355     return -1;
    1356   const unsigned char b4 = sequence[4];
    1357   if (length == 4) {
    1358     if (b4)
    1359       return -1;
    1360     const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
    1361     if (c < 0x10000 || c > 0x10FFFF)
    1362       return -1;
    1363     return c;
    1364   }
    1365 
    1366   return -1;
    1367 }
    1368 
    1369 CString UString::UTF8String(bool* utf16WasGood) const
    1370 {
    1371   if (utf16WasGood)
    1372     *utf16WasGood = true;
    1373 
     1274CString UString::UTF8String(bool strict) const
     1275{
    13741276  // Allocate a buffer big enough to hold all the characters.
    13751277  const int length = size();
     
    13771279
    13781280  // Convert to runs of 8-bit characters.
    1379   char *p = buffer.begin();
    1380   const UChar *d = data();
    1381   for (int i = 0; i != length; ++i) {
    1382     unsigned short c = d[i].unicode();
    1383     if (c < 0x80) {
    1384       *p++ = (char)c;
    1385     } else if (c < 0x800) {
    1386       *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
    1387       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
    1388     } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
    1389       unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
    1390       *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
    1391       *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
    1392       *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
    1393       *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
    1394       ++i;
    1395     } else {
    1396       if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF)
    1397         *utf16WasGood = false;
    1398       *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
    1399       *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
    1400       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
    1401     }
    1402   }
    1403 
    1404   // Return the result as a C string.
    1405   CString result(buffer.data(), p - buffer.data());
    1406 
    1407   return result;
    1408 }
    1409 
    1410 CString UString::UTF8String() const
    1411 {
    1412     return UTF8String(0);
     1281  char* p = buffer.data();
     1282  const ::UChar* d = &data()->uc;
     1283  ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
     1284  if (result != conversionOK)
     1285    return CString();
     1286
     1287  return CString(buffer.data(), p - buffer.data());
    14131288}
    14141289
  • trunk/JavaScriptCore/kjs/ustring.h

    r27406 r27746  
    266266    /**
    267267     * @return The string converted to the 8-bit string type CString().
     268     * This method is not Unicode safe and shouldn't be used unless the string
     269     * is known to be ASCII.
    268270     */
    269271    CString cstring() const;
     
    279281    /**
    280282     * Convert the string to UTF-8, assuming it is UTF-16 encoded.
    281      * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
    282      * strings that are invalid because they have characters in the range
    283      * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
    284      * be otherwise valid.
    285      */
    286     CString UTF8String() const;
    287     CString UTF8String(bool* utf16WasGood) const;
     283     * In non-strict mode, this function is tolerant of badly formed UTF-16; it
     284     * can create UTF-8 strings that are invalid because they have characters in
     285     * the range U+D800-U+DFFF, U+FFFE, or U+FFFF, but the UTF-8 string is
     286     * guaranteed to be otherwise valid.
     287     * In strict mode, error is returned as null CString.
     288     */
     289    CString UTF8String(bool strict = false) const;
    288290
    289291    /**
     
    427429 
    428430  int compare(const UString &, const UString &);
    429 
    430   // Given a first byte, gives the length of the UTF-8 sequence it begins.
    431   // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
    432   // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
    433   int UTF8SequenceLength(char);
    434 
    435   // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
    436   // Only allows Unicode characters (U-00000000 to U-0010FFFF).
    437   // Returns -1 if the sequence is not valid (including presence of extra bytes).
    438   int decodeUTF8Sequence(const char *);
    439431
    440432inline UString::UString()
Note: See TracChangeset for help on using the changeset viewer.