Timestamp:
Nov 12, 2007, 11:12:55 PM
Author:
[email protected]
Message:

Reviewed by Darin.

http://bugs.webkit.org/show_bug.cgi?id=15953
Add UTF-8 encoding/decoding to WTF

  • kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
  • kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient parameter. Callers are not interested in getting decoding results in strict mode, so this allows for bailing out as soon as an error is seen (see the usage sketch after this list).
  • kjs/function.cpp: (KJS::encode): Updated for new UString::UTF8String() signature.
  • API/JSStringRef.cpp: (JSStringCreateWithCharacters): Disambiguate UChar. (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
  • bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().
  • wtf/unicode/UTF8.cpp: Added.
    (WTF::Unicode::inlineUTF8SequenceLengthNonASCII):
    (WTF::Unicode::inlineUTF8SequenceLength):
    (WTF::Unicode::UTF8SequenceLength):
    (WTF::Unicode::decodeUTF8Sequence):
    (WTF::Unicode::):
    (WTF::Unicode::ConvertUTF16ToUTF8):
    (WTF::Unicode::isLegalUTF8):
    (WTF::Unicode::ConvertUTF8ToUTF16):
  • wtf/unicode/UTF8.h: Added. (WTF::Unicode::): Some code moved from ustring.h, some adapted from unicode.org sources.
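To make the new signature concrete, here is a minimal caller-side sketch, not part of the patch. It assumes the KJS::UString and KJS::CString types from this tree, and that a failed strict conversion comes back as a null CString, as in the new UTF8String() body in the diff below; the helper name is hypothetical.

    // Hypothetical helper, for illustration only; not a function in the tree.
    static KJS::CString toUTF8OrNull(const KJS::UString& s)
    {
        // strict == true: bail out at the first malformed UTF-16 unit
        // (e.g. an unpaired surrogate) instead of decoding past it.
        KJS::CString utf8 = s.UTF8String(true);
        if (!utf8.c_str()) {
            // Conversion failed; a caller such as KJS::encode() can report a
            // URIError here rather than checking a utf16WasGood out-parameter
            // after the fact.
        }
        return utf8;
    }

In lenient mode (strict == false) the conversion is expected to proceed past malformed units, substituting the replacement character, as the unicode.org ConvertUTF code this patch adapts does.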
File:
1 edited

  • trunk/JavaScriptCore/kjs/ustring.cpp

--- trunk/JavaScriptCore/kjs/ustring.cpp (r27406)
+++ trunk/JavaScriptCore/kjs/ustring.cpp (r27746)
 }

-inline int inlineUTF8SequenceLengthNonASCII(char b0)
-{
-  if ((b0 & 0xC0) != 0xC0)
-    return 0;
-  if ((b0 & 0xE0) == 0xC0)
-    return 2;
-  if ((b0 & 0xF0) == 0xE0)
-    return 3;
-  if ((b0 & 0xF8) == 0xF0)
-    return 4;
-  return 0;
-}
-
-int UTF8SequenceLengthNonASCII(char b0)
-{
-  return inlineUTF8SequenceLengthNonASCII(b0);
-}
-
-inline int inlineUTF8SequenceLength(char b0)
-{
-  return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
-}
-
-// Given a first byte, gives the length of the UTF-8 sequence it begins.
-// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
-// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
-int UTF8SequenceLength(char b0)
-{
-  return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
-}
-
-// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
-// Only allows Unicode characters (U-00000000 to U-0010FFFF).
-// Returns -1 if the sequence is not valid (including presence of extra bytes).
-int decodeUTF8Sequence(const char *sequence)
-{
-  // Handle 0-byte sequences (never valid).
-  const unsigned char b0 = sequence[0];
-  const int length = inlineUTF8SequenceLength(b0);
-  if (length == 0)
-    return -1;
-
-  // Handle 1-byte sequences (plain ASCII).
-  const unsigned char b1 = sequence[1];
-  if (length == 1) {
-    if (b1)
-      return -1;
-    return b0;
-  }
-
-  // Handle 2-byte sequences.
-  if ((b1 & 0xC0) != 0x80)
-    return -1;
-  const unsigned char b2 = sequence[2];
-  if (length == 2) {
-    if (b2)
-      return -1;
-    const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
-    if (c < 0x80)
-      return -1;
-    return c;
-  }
-
-  // Handle 3-byte sequences.
-  if ((b2 & 0xC0) != 0x80)
-    return -1;
-  const unsigned char b3 = sequence[3];
-  if (length == 3) {
-    if (b3)
-      return -1;
-    const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
-    if (c < 0x800)
-      return -1;
-    // UTF-16 surrogates should never appear in UTF-8 data.
-    if (c >= 0xD800 && c <= 0xDFFF)
-      return -1;
-    return c;
-  }
-
-  // Handle 4-byte sequences.
-  if ((b3 & 0xC0) != 0x80)
-    return -1;
-  const unsigned char b4 = sequence[4];
-  if (length == 4) {
-    if (b4)
-      return -1;
-    const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
-    if (c < 0x10000 || c > 0x10FFFF)
-      return -1;
-    return c;
-  }
-
-  return -1;
-}
-
-CString UString::UTF8String(bool* utf16WasGood) const
-{
-  if (utf16WasGood)
-    *utf16WasGood = true;
-
+CString UString::UTF8String(bool strict) const
+{
   // Allocate a buffer big enough to hold all the characters.
   const int length = size();
   Vector<char, 1024> buffer(length * 3);

   // Convert to runs of 8-bit characters.
-  char *p = buffer.begin();
-  const UChar *d = data();
-  for (int i = 0; i != length; ++i) {
-    unsigned short c = d[i].unicode();
-    if (c < 0x80) {
-      *p++ = (char)c;
-    } else if (c < 0x800) {
-      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
-      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
-    } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
-      unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
-      *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
-      *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
-      *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
-      *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
-      ++i;
-    } else {
-      if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF)
-        *utf16WasGood = false;
-      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
-      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
-      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
-    }
-  }
-
-  // Return the result as a C string.
-  CString result(buffer.data(), p - buffer.data());
-
-  return result;
-}
-
-CString UString::UTF8String() const
-{
-    return UTF8String(0);
+  char* p = buffer.data();
+  const ::UChar* d = &data()->uc;
+  ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
+  if (result != conversionOK)
+    return CString();
+
+  return CString(buffer.data(), p - buffer.data());
 }

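As a sanity check on the bit patterns in the diff above, here is a standalone sketch (not part of the changeset) that re-derives the first-byte length classification from inlineUTF8SequenceLengthNonASCII() and the surrogate-pair arithmetic from the removed UTF8String() loop. utf8SequenceLength() below is a local re-implementation for illustration, not the WTF function.

    #include <cassert>

    // Mirrors the removed inlineUTF8SequenceLength() and
    // inlineUTF8SequenceLengthNonASCII(): the high bits of the first byte
    // encode the length of the UTF-8 sequence it begins.
    static int utf8SequenceLength(unsigned char b0)
    {
        if ((b0 & 0x80) == 0)
            return 1; // 0xxxxxxx: plain ASCII
        if ((b0 & 0xC0) != 0xC0)
            return 0; // 10xxxxxx: a continuation byte, not a sequence start
        if ((b0 & 0xE0) == 0xC0)
            return 2; // 110xxxxx
        if ((b0 & 0xF0) == 0xE0)
            return 3; // 1110xxxx
        if ((b0 & 0xF8) == 0xF0)
            return 4; // 11110xxx
        return 0;     // 11111xxx: never a legal start byte
    }

    int main()
    {
        assert(utf8SequenceLength(0x41) == 1); // 'A'
        assert(utf8SequenceLength(0xE2) == 3); // U+20AC "€" encodes as E2 82 AC
        assert(utf8SequenceLength(0x80) == 0); // bare continuation byte

        // The surrogate math from the removed UTF8String() loop:
        // U+1D11E (musical G clef) is stored in UTF-16 as the pair D834 DD1E.
        unsigned high = 0xD834, low = 0xDD1E;
        unsigned sc = 0x10000 + (((high & 0x3FF) << 10) | (low & 0x3FF));
        assert(sc == 0x1D11E);
        return 0;
    }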