Changeset 27746 in webkit for trunk/JavaScriptCore/kjs


Timestamp: Nov 12, 2007, 11:12:55 PM
Author: [email protected]
Message:

Reviewed by Darin.

http://bugs.webkit.org/show_bug.cgi?id=15953
Add UTF-8 encoding/decoding to WTF

  • kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
  • kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient parameter. Callers that request strict mode have no use for a partially decoded result, so this allows bailing out as soon as an error is seen (see the sketch after this list).
  • kjs/function.cpp: (KJS::encode): Updated for new UString::UTF8String() signature.
  • API/JSStringRef.cpp: (JSStringCreateWithCharacters): Disambiguate UChar. (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
  • bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().
  • wtf/unicode/UTF8.cpp: Added. (WTF::Unicode::inlineUTF8SequenceLengthNonASCII): (WTF::Unicode::inlineUTF8SequenceLength): (WTF::Unicode::UTF8SequenceLength): (WTF::Unicode::decodeUTF8Sequence): (WTF::Unicode::): (WTF::Unicode::ConvertUTF16ToUTF8): (WTF::Unicode::isLegalUTF8): (WTF::Unicode::ConvertUTF8ToUTF16):
  • wtf/unicode/UTF8.h: Added. (WTF::Unicode::): Some code moved from ustring.h, some adapted from unicode.org sources.
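
For context, here is a minimal sketch of the new UTF8String() contract described above (the wrapper function is hypothetical and not part of the patch):

    // Sketch only: strict vs. lenient conversion under the new signature.
    #include "ustring.h"

    using namespace KJS;

    static void demonstrate(const UString& s)
    {
        // Strict mode: malformed UTF-16 (e.g. an unpaired surrogate) makes the
        // whole conversion fail, reported as a null CString.
        CString strict = s.UTF8String(true);
        if (!strict.c_str()) {
            // Handle the error, as encode() in kjs/function.cpp now does.
        }

        // Lenient mode (the default): conversion always produces output, but it
        // may contain invalid sequences standing in for unpaired surrogates.
        CString lenient = s.UTF8String();
    }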
Location: trunk/JavaScriptCore/kjs
Files: 3 edited

  • trunk/JavaScriptCore/kjs/function.cpp

    r27448 → r27746

    @@ -43,5 +43,5 @@
     #include <wtf/Assertions.h>
     #include <wtf/MathExtras.h>
    -#include <wtf/unicode/Unicode.h>
    +#include <wtf/unicode/UTF8.h>
     
     using namespace WTF;
    @@ -515,7 +515,6 @@
     {
       UString r = "", s, str = args[0]->toString(exec);
    -  bool wasGoodUTF16;
    -  CString cstr = str.UTF8String(&wasGoodUTF16);
    -  if (!wasGoodUTF16)
    +  CString cstr = str.UTF8String(true);
    +  if (!cstr.c_str())
         return throwError(exec, URIError, "String contained an illegal UTF-16 sequence.");
       const char* p = cstr.c_str();
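
The new encode() relies on strict-mode failure being observable on the returned CString itself; a default-constructed CString carries a null c_str(). A hedged sketch of that pattern (the helper name is hypothetical):

    // Error signaling via a null CString, which is what the new call site checks.
    static bool utf8ForURI(const UString& str, CString& out)
    {
        CString cstr = str.UTF8String(true); // strict: fail on bad UTF-16
        if (!cstr.c_str())
            return false; // caller throws URIError, as encode() does
        out = cstr;
        return true;
    }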
  • trunk/JavaScriptCore/kjs/ustring.cpp

    r27406 → r27746

    @@ -1272,143 +1272,18 @@
     }
     
    -inline int inlineUTF8SequenceLengthNonASCII(char b0)
    -{
    -  if ((b0 & 0xC0) != 0xC0)
    -    return 0;
    -  if ((b0 & 0xE0) == 0xC0)
    -    return 2;
    -  if ((b0 & 0xF0) == 0xE0)
    -    return 3;
    -  if ((b0 & 0xF8) == 0xF0)
    -    return 4;
    -  return 0;
    -}
    -
    -int UTF8SequenceLengthNonASCII(char b0)
    -{
    -  return inlineUTF8SequenceLengthNonASCII(b0);
    -}
    -
    -inline int inlineUTF8SequenceLength(char b0)
    -{
    -  return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
    -}
    -
    -// Given a first byte, gives the length of the UTF-8 sequence it begins.
    -// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
    -// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
    -int UTF8SequenceLength(char b0)
    -{
    -  return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
    -}
    -
    -// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
    -// Only allows Unicode characters (U-00000000 to U-0010FFFF).
    -// Returns -1 if the sequence is not valid (including presence of extra bytes).
    -int decodeUTF8Sequence(const char *sequence)
    -{
    -  // Handle 0-byte sequences (never valid).
    -  const unsigned char b0 = sequence[0];
    -  const int length = inlineUTF8SequenceLength(b0);
    -  if (length == 0)
    -    return -1;
    -
    -  // Handle 1-byte sequences (plain ASCII).
    -  const unsigned char b1 = sequence[1];
    -  if (length == 1) {
    -    if (b1)
    -      return -1;
    -    return b0;
    -  }
    -
    -  // Handle 2-byte sequences.
    -  if ((b1 & 0xC0) != 0x80)
    -    return -1;
    -  const unsigned char b2 = sequence[2];
    -  if (length == 2) {
    -    if (b2)
    -      return -1;
    -    const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
    -    if (c < 0x80)
    -      return -1;
    -    return c;
    -  }
    -
    -  // Handle 3-byte sequences.
    -  if ((b2 & 0xC0) != 0x80)
    -    return -1;
    -  const unsigned char b3 = sequence[3];
    -  if (length == 3) {
    -    if (b3)
    -      return -1;
    -    const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
    -    if (c < 0x800)
    -      return -1;
    -    // UTF-16 surrogates should never appear in UTF-8 data.
    -    if (c >= 0xD800 && c <= 0xDFFF)
    -      return -1;
    -    return c;
    -  }
    -
    -  // Handle 4-byte sequences.
    -  if ((b3 & 0xC0) != 0x80)
    -    return -1;
    -  const unsigned char b4 = sequence[4];
    -  if (length == 4) {
    -    if (b4)
    -      return -1;
    -    const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
    -    if (c < 0x10000 || c > 0x10FFFF)
    -      return -1;
    -    return c;
    -  }
    -
    -  return -1;
    -}
    -
    -CString UString::UTF8String(bool* utf16WasGood) const
    -{
    -  if (utf16WasGood)
    -    *utf16WasGood = true;
    -
    +CString UString::UTF8String(bool strict) const
    +{
       // Allocate a buffer big enough to hold all the characters.
       const int length = size();
       Vector<char, 1024> buffer(length * 3);
     
       // Convert to runs of 8-bit characters.
    -  char *p = buffer.begin();
    -  const UChar *d = data();
    -  for (int i = 0; i != length; ++i) {
    -    unsigned short c = d[i].unicode();
    -    if (c < 0x80) {
    -      *p++ = (char)c;
    -    } else if (c < 0x800) {
    -      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
    -      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
    -    } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
    -      unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
    -      *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
    -      *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      ++i;
    -    } else {
    -      if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF)
    -        *utf16WasGood = false;
    -      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
    -      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
    -    }
    -  }
    -
    -  // Return the result as a C string.
    -  CString result(buffer.data(), p - buffer.data());
    -
    -  return result;
    -}
    -
    -CString UString::UTF8String() const
    -{
    -    return UTF8String(0);
    +  char* p = buffer.data();
    +  const ::UChar* d = &data()->uc;
    +  ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
    +  if (result != conversionOK)
    +    return CString();
    +
    +  return CString(buffer.data(), p - buffer.data());
     }
     
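
Two details of the new implementation are worth spelling out. The buffer is sized at three bytes per UTF-16 code unit, which is a true worst case: a BMP code unit expands to at most three UTF-8 bytes, and a surrogate pair consumes two code units while emitting only four bytes. Also, ConvertUTF16ToUTF8() advances the source and target pointers through its in/out arguments, so the number of bytes written falls out of pointer arithmetic. A sketch of driving the converter the same way (parameter names and the using-directives are assumptions inferred from the call shape in this diff):

    #include <wtf/Vector.h>
    #include <wtf/unicode/UTF8.h>

    using namespace WTF;
    using namespace WTF::Unicode;

    // Convert `length` UTF-16 code units to UTF-8; false means invalid input.
    static bool toUTF8(const ::UChar* data, int length, Vector<char>& out)
    {
        Vector<char, 1024> buffer(length * 3); // worst case: 3 bytes per code unit
        const ::UChar* source = data;
        char* target = buffer.data();
        ConversionResult result = ConvertUTF16ToUTF8(&source, source + length,
                                                     &target, target + buffer.size(),
                                                     true /* strict */);
        if (result != conversionOK)
            return false;
        out.append(buffer.data(), target - buffer.data());
        return true;
    }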
  • trunk/JavaScriptCore/kjs/ustring.h

    r27406 → r27746

    @@ -266,4 +266,6 @@
         /**
          * @return The string converted to the 8-bit string type CString().
    +     * This method is not Unicode safe and shouldn't be used unless the string
    +     * is known to be ASCII.
          */
         CString cstring() const;
    @@ -279,11 +281,11 @@
         /**
          * Convert the string to UTF-8, assuming it is UTF-16 encoded.
    -     * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
    -     * strings that are invalid because they have characters in the range
    -     * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
    -     * be otherwise valid.
    -     */
    -    CString UTF8String() const;
    -    CString UTF8String(bool* utf16WasGood) const;
    +     * In non-strict mode, this function is tolerant of badly formed UTF-16, it
    +     * can create UTF-8 strings that are invalid because they have characters in
    +     * the range U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is
    +     * guaranteed to be otherwise valid.
    +     * In strict mode, error is returned as null CString.
    +     */
    +    CString UTF8String(bool strict = false) const;
     
         /**
    @@ -427,14 +429,4 @@
     
       int compare(const UString &, const UString &);
    -
    -  // Given a first byte, gives the length of the UTF-8 sequence it begins.
    -  // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
    -  // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
    -  int UTF8SequenceLength(char);
    -
    -  // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
    -  // Only allows Unicode characters (U-00000000 to U-0010FFFF).
    -  // Returns -1 if the sequence is not valid (including presence of extra bytes).
    -  int decodeUTF8Sequence(const char *);
     
     inline UString::UString()
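
The declarations removed here now live in wtf/unicode/UTF8.h, as the ChangeLog notes. A quick worked check of the decoder's documented behavior (a sketch; the WTF::Unicode qualification follows the ChangeLog above):

    #include <wtf/unicode/UTF8.h>
    #include <assert.h>

    int main()
    {
        using namespace WTF::Unicode;
        // U+20AC EURO SIGN is the three-byte sequence E2 82 AC.
        assert(UTF8SequenceLength('\xE2') == 3);
        assert(decodeUTF8Sequence("\xE2\x82\xAC") == 0x20AC);
        // A lone surrogate encoded as UTF-8 (ED A0 80) is rejected.
        assert(decodeUTF8Sequence("\xED\xA0\x80") == -1);
        return 0;
    }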