Changeset 27746 in webkit for trunk/JavaScriptCore/kjs/ustring.cpp
- Timestamp:
- Nov 12, 2007, 11:12:55 PM (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/kjs/ustring.cpp
// Given a non-ASCII first byte, gives the length (2-4) of the UTF-8 sequence
// it begins. Returns 0 for bytes that are not legal lead bytes (continuation
// bytes 10xxxxxx, and 0xF8-0xFF which would start sequences longer than 4).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    if ((b0 & 0xC0) != 0xC0)
        return 0; // 10xxxxxx: a continuation byte cannot start a sequence
    if ((b0 & 0xE0) == 0xC0)
        return 2; // 110xxxxx
    if ((b0 & 0xF0) == 0xE0)
        return 3; // 1110xxxx
    if ((b0 & 0xF8) == 0xF0)
        return 4; // 11110xxx
    return 0; // 0xF8-0xFF never start a legal sequence
}

// Out-of-line wrapper for callers in other translation units.
int UTF8SequenceLengthNonASCII(char b0)
{
    return inlineUTF8SequenceLengthNonASCII(b0);
}

inline int inlineUTF8SequenceLength(char b0)
{
    // Call the inline helper here (the original called the out-of-line
    // UTF8SequenceLengthNonASCII, defeating the point of an inline version).
    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
}

// Given a first byte, gives the length of the UTF-8 sequence it begins.
// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
// Only allows sequences of up to 4 bytes, since that works for all Unicode
// characters (U-00000000 to U-0010FFFF).
int UTF8SequenceLength(char b0)
{
    // Delegate instead of duplicating inlineUTF8SequenceLength's expression.
    return inlineUTF8SequenceLength(b0);
}

// Takes a null-terminated C-style string with a UTF-8 sequence in it and
// converts it to a character.
// Only allows Unicode characters (U-00000000 to U-0010FFFF).
// Returns -1 if the sequence is not valid (including presence of extra bytes).
int decodeUTF8Sequence(const char* sequence)
{
    // Handle 0-byte sequences (never valid).
    const unsigned char b0 = sequence[0];
    const int length = inlineUTF8SequenceLength(b0);
    if (length == 0)
        return -1;

    // Handle 1-byte sequences (plain ASCII).
    const unsigned char b1 = sequence[1];
    if (length == 1) {
        if (b1)
            return -1; // extra bytes after the sequence are an error
        return b0;
    }

    // Handle 2-byte sequences.
    if ((b1 & 0xC0) != 0x80)
        return -1; // not a continuation byte
    const unsigned char b2 = sequence[2];
    if (length == 2) {
        if (b2)
            return -1;
        const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
        if (c < 0x80)
            return -1; // overlong encoding of an ASCII character
        return c;
    }

    // Handle 3-byte sequences.
    if ((b2 & 0xC0) != 0x80)
        return -1;
    const unsigned char b3 = sequence[3];
    if (length == 3) {
        if (b3)
            return -1;
        const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
        if (c < 0x800)
            return -1; // overlong
        // UTF-16 surrogates should never appear in UTF-8 data.
        if (c >= 0xD800 && c <= 0xDFFF)
            return -1;
        return c;
    }

    // Handle 4-byte sequences.
    if ((b3 & 0xC0) != 0x80)
        return -1;
    const unsigned char b4 = sequence[4];
    if (length == 4) {
        if (b4)
            return -1;
        const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
        if (c < 0x10000 || c > 0x10FFFF)
            return -1; // overlong, or beyond the Unicode range
        return c;
    }

    return -1;
}
1379 char *p = buffer.begin(); 1380 const UChar *d = data(); 1381 for (int i = 0; i != length; ++i) { 1382 unsigned short c = d[i].unicode(); 1383 if (c < 0x80) { 1384 *p++ = (char)c; 1385 } else if (c < 0x800) { 1386 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 1387 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 1388 } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) { 1389 unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF)); 1390 *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8 1391 *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set 1392 *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 1393 *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set 1394 ++i; 1395 } else { 1396 if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF) 1397 *utf16WasGood = false; 1398 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 1399 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 1400 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 1401 } 1402 } 1403 1404 // Return the result as a C string. 1405 CString result(buffer.data(), p - buffer.data()); 1406 1407 return result; 1408 } 1409 1410 CString UString::UTF8String() const 1411 { 1412 return UTF8String(0); 1281 char* p = buffer.data(); 1282 const ::UChar* d = &data()->uc; 1283 ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict); 1284 if (result != conversionOK) 1285 return CString(); 1286 1287 return CString(buffer.data(), p - buffer.data()); 1413 1288 } 1414 1289
Note:
See TracChangeset
for help on using the changeset viewer.