Changeset 27746 in webkit for trunk/JavaScriptCore/kjs
- Timestamp: Nov 12, 2007, 11:12:55 PM (18 years ago)
- Location: trunk/JavaScriptCore/kjs
- Files: 3 edited
trunk/JavaScriptCore/kjs/function.cpp (r27448 → r27746)

 #include <wtf/Assertions.h>
 #include <wtf/MathExtras.h>
-#include <wtf/unicode/Unicode.h>
+#include <wtf/unicode/UTF8.h>
 
 using namespace WTF;
…
 {
     UString r = "", s, str = args[0]->toString(exec);
-    bool wasGoodUTF16;
-    CString cstr = str.UTF8String(&wasGoodUTF16);
-    if (!wasGoodUTF16)
+    CString cstr = str.UTF8String(true);
+    if (!cstr.c_str())
         return throwError(exec, URIError, "String contained an illegal UTF-16 sequence.");
     const char* p = cstr.c_str();
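This hunk changes how conversion failure is reported: the old API filled in a bool out-parameter (wasGoodUTF16), while the new strict call signals failure through the return value itself, a CString whose c_str() is null. A minimal self-contained sketch of the new pattern, with MiniCString as a hypothetical stand-in for KJS's CString:

    #include <cstdio>

    // Hypothetical stand-in for KJS's CString; a null c_str() means "no value".
    struct MiniCString {
        const char* buffer;
        const char* c_str() const { return buffer; }
    };

    // Strict conversion sketch: return a null MiniCString on bad input,
    // mirroring what UString::UTF8String(true) does after this change.
    MiniCString convertStrict(bool inputWasValidUTF16)
    {
        if (!inputWasValidUTF16)
            return MiniCString{ nullptr }; // failure: null result, no out-parameter
        return MiniCString{ "converted bytes" };
    }

    int main()
    {
        MiniCString cstr = convertStrict(false);
        if (!cstr.c_str())
            std::puts("would throw URIError here, as the hunk above does");
        return 0;
    }

Folding the error into the return value means a caller cannot forget to check the flag without also ignoring the result.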
trunk/JavaScriptCore/kjs/ustring.cpp (r27406 → r27746)

 }
 
-inline int inlineUTF8SequenceLengthNonASCII(char b0)
-{
-    if ((b0 & 0xC0) != 0xC0)
-        return 0;
-    if ((b0 & 0xE0) == 0xC0)
-        return 2;
-    if ((b0 & 0xF0) == 0xE0)
-        return 3;
-    if ((b0 & 0xF8) == 0xF0)
-        return 4;
-    return 0;
-}
-
-int UTF8SequenceLengthNonASCII(char b0)
-{
-    return inlineUTF8SequenceLengthNonASCII(b0);
-}
-
-inline int inlineUTF8SequenceLength(char b0)
-{
-    return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
-}
-
-// Given a first byte, gives the length of the UTF-8 sequence it begins.
-// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
-// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
-int UTF8SequenceLength(char b0)
-{
-    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
-}
-
-// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
-// Only allows Unicode characters (U-00000000 to U-0010FFFF).
-// Returns -1 if the sequence is not valid (including presence of extra bytes).
-int decodeUTF8Sequence(const char *sequence)
-{
-    // Handle 0-byte sequences (never valid).
-    const unsigned char b0 = sequence[0];
-    const int length = inlineUTF8SequenceLength(b0);
-    if (length == 0)
-        return -1;
-
-    // Handle 1-byte sequences (plain ASCII).
-    const unsigned char b1 = sequence[1];
-    if (length == 1) {
-        if (b1)
-            return -1;
-        return b0;
-    }
-
-    // Handle 2-byte sequences.
-    if ((b1 & 0xC0) != 0x80)
-        return -1;
-    const unsigned char b2 = sequence[2];
-    if (length == 2) {
-        if (b2)
-            return -1;
-        const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
-        if (c < 0x80)
-            return -1;
-        return c;
-    }
-
-    // Handle 3-byte sequences.
-    if ((b2 & 0xC0) != 0x80)
-        return -1;
-    const unsigned char b3 = sequence[3];
-    if (length == 3) {
-        if (b3)
-            return -1;
-        const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
-        if (c < 0x800)
-            return -1;
-        // UTF-16 surrogates should never appear in UTF-8 data.
-        if (c >= 0xD800 && c <= 0xDFFF)
-            return -1;
-        return c;
-    }
-
-    // Handle 4-byte sequences.
-    if ((b3 & 0xC0) != 0x80)
-        return -1;
-    const unsigned char b4 = sequence[4];
-    if (length == 4) {
-        if (b4)
-            return -1;
-        const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
-        if (c < 0x10000 || c > 0x10FFFF)
-            return -1;
-        return c;
-    }
-
-    return -1;
-}
-
-CString UString::UTF8String(bool* utf16WasGood) const
-{
-    if (utf16WasGood)
-        *utf16WasGood = true;
-
+CString UString::UTF8String(bool strict) const
+{
     // Allocate a buffer big enough to hold all the characters.
     const int length = size();
…
 
     // Convert to runs of 8-bit characters.
-    char *p = buffer.begin();
-    const UChar *d = data();
-    for (int i = 0; i != length; ++i) {
-        unsigned short c = d[i].unicode();
-        if (c < 0x80) {
-            *p++ = (char)c;
-        } else if (c < 0x800) {
-            *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
-            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
-        } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
-            unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
-            *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
-            *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
-            *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
-            *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
-            ++i;
-        } else {
-            if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF)
-                *utf16WasGood = false;
-            *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
-            *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
-            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
-        }
-    }
-
-    // Return the result as a C string.
-    CString result(buffer.data(), p - buffer.data());
-
-    return result;
-}
-
-CString UString::UTF8String() const
-{
-    return UTF8String(0);
+    char* p = buffer.data();
+    const ::UChar* d = &data()->uc;
+    ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
+    if (result != conversionOK)
+        return CString();
+
+    return CString(buffer.data(), p - buffer.data());
 }
 
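The deleted loop is the hand-rolled UTF-16 to UTF-8 encoder that ConvertUTF16ToUTF8 replaces. Its core step recombines a surrogate pair's two 10-bit payloads into a supplementary code point before emitting the four-byte UTF-8 form. A self-contained sketch of that arithmetic (encodeSurrogatePair is a hypothetical helper, not part of the tree):

    #include <cassert>
    #include <cstdio>

    // Combine a UTF-16 surrogate pair into one code point, then emit UTF-8.
    // hi must be in [0xD800, 0xDBFF] and lo in [0xDC00, 0xDFFF].
    static char* encodeSurrogatePair(unsigned hi, unsigned lo, char* p)
    {
        assert(hi >= 0xD800 && hi <= 0xDBFF && lo >= 0xDC00 && lo <= 0xDFFF);
        unsigned c = 0x10000 + (((hi & 0x3FF) << 10) | (lo & 0x3FF));
        *p++ = (char)(0xF0 | (c >> 18));          // four-byte lead
        *p++ = (char)(0x80 | ((c >> 12) & 0x3F)); // continuation
        *p++ = (char)(0x80 | ((c >> 6) & 0x3F));  // continuation
        *p++ = (char)(0x80 | (c & 0x3F));         // continuation
        return p;
    }

    int main()
    {
        char buf[4];
        encodeSurrogatePair(0xD83D, 0xDE00, buf); // the pair for U+1F600
        std::printf("%02X %02X %02X %02X\n",
                    (unsigned)(unsigned char)buf[0], (unsigned)(unsigned char)buf[1],
                    (unsigned)(unsigned char)buf[2], (unsigned)(unsigned char)buf[3]);
        return 0; // prints F0 9F 98 80
    }

Worth noting: the removed loop guarded its lookahead with i < length, which is always true inside the loop, so d[i+1] could be read one slot past the end when a string ended in a lone lead surrogate. Delegating to the shared conversion routine removes that class of bug along with the duplicated logic.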
trunk/JavaScriptCore/kjs/ustring.h (r27406 → r27746)

 /**
  * @return The string converted to the 8-bit string type CString().
+ * This method is not Unicode safe and shouldn't be used unless the string
+ * is known to be ASCII.
  */
 CString cstring() const;
…
 /**
  * Convert the string to UTF-8, assuming it is UTF-16 encoded.
- * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
- * strings that are invalid because they have characters in the range
- * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
- * be otherwise valid.
- */
-CString UTF8String() const;
-CString UTF8String(bool* utf16WasGood) const;
+ * In non-strict mode, this function is tolerant of badly formed UTF-16, it
+ * can create UTF-8 strings that are invalid because they have characters in
+ * the range U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is
+ * guaranteed to be otherwise valid.
+ * In strict mode, error is returned as null CString.
+ */
+CString UTF8String(bool strict = false) const;
 
 /**
…
 
 int compare(const UString &, const UString &);
-
-// Given a first byte, gives the length of the UTF-8 sequence it begins.
-// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
-// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
-int UTF8SequenceLength(char);
-
-// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
-// Only allows Unicode characters (U-00000000 to U-0010FFFF).
-// Returns -1 if the sequence is not valid (including presence of extra bytes).
-int decodeUTF8Sequence(const char *);
 
 inline UString::UString()
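With the two declarations collapsed into one defaulted-argument signature, call sites now opt into validation explicitly. A usage sketch against the new header (toUTF8WithFallback is a hypothetical helper illustrating one way a caller might combine the two modes):

    #include "ustring.h"

    using namespace KJS;

    // Prefer strict validation; fall back to the lenient conversion, which
    // never fails but may emit the invalid sequences described in the comment.
    static CString toUTF8WithFallback(const UString& s)
    {
        CString strictResult = s.UTF8String(true);
        if (strictResult.c_str())   // non-null: the UTF-16 was well formed
            return strictResult;
        return s.UTF8String();      // lenient default (strict = false)
    }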