Changeset 27746 in webkit for trunk/JavaScriptCore
- Timestamp:
- Nov 12, 2007, 11:12:55 PM (18 years ago)
- Location:
- trunk/JavaScriptCore
- Files:
-
- 2 added
- 11 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/API/JSStringRef.cpp
r27730 r27746 37 37 #include <kjs/ustring.h> 38 38 #include <kjs/value.h> 39 #include <wtf/unicode/UTF8.h> 39 40 40 41 using namespace KJS; 42 using namespace WTF::Unicode; 41 43 42 44 JSStringRef JSStringCreateWithCharacters(const JSChar* chars, size_t numChars) 43 45 { 44 46 JSLock lock; 45 return toRef(UString(reinterpret_cast<const UChar*>(chars), static_cast<int>(numChars)).rep()->ref()); 47 return toRef(UString(reinterpret_cast<const KJS::UChar*>(chars), static_cast<int>(numChars)).rep()->ref()); 46 48 } 47 49 … … 49 51 { 50 52 JSLock lock; 51 // FIXME: <rdar://problem/4949018> 52 return toRef(UString(string).rep()->ref()); 53 54 size_t length = strlen(string); 55 Vector< ::UChar, 1024> buffer(length); 56 ::UChar* p = buffer.data(); 57 ConvertUTF8ToUTF16(&string, string + length, &p, p + length, false); 58 59 return toRef(UString(reinterpret_cast<KJS::UChar*>(buffer.data()), p - buffer.data()).rep()->ref()); 53 60 } 54 61 -
trunk/JavaScriptCore/ChangeLog
r27745 r27746 1 2007-11-12 Alexey Proskuryakov <[email protected]> 2 3 Reviewed by Darin. 4 5 http://bugs.webkit.org/show_bug.cgi?id=15953 6 Add UTF-8 encoding/decoding to WTF 7 8 * kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode. 9 * kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient 10 parameter. Callers are not interested in getting decoding results in strict mode, so 11 this allows for bailing out as soon as an error is seen. 12 13 * kjs/function.cpp: 14 (KJS::encode): Updated for new UString::UTF8String() signature. 15 16 * API/JSStringRef.cpp: 17 (JSStringCreateWithCharacters): Disambiguate UChar. 18 (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string! 19 * bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16(). 20 21 * wtf/unicode/UTF8.cpp: Added. 22 (WTF::Unicode::inlineUTF8SequenceLengthNonASCII): 23 (WTF::Unicode::inlineUTF8SequenceLength): 24 (WTF::Unicode::UTF8SequenceLength): 25 (WTF::Unicode::decodeUTF8Sequence): 26 (WTF::Unicode::): 27 (WTF::Unicode::ConvertUTF16ToUTF8): 28 (WTF::Unicode::isLegalUTF8): 29 (WTF::Unicode::ConvertUTF8ToUTF16): 30 * wtf/unicode/UTF8.h: Added. 31 (WTF::Unicode::): 32 Some code moved from ustring.h, some adapted from unicode.org sources. 33 34 * JavaScriptCore.exp: 35 * JavaScriptCore.pri: 36 * JavaScriptCore.vcproj/WTF/WTF.vcproj: 37 * JavaScriptCore.xcodeproj/project.pbxproj: 38 * JavaScriptCoreSources.bkl: 39 Added UTF8.{h,cpp} 40 1 41 2007-11-12 Josh Aas <[email protected]> 2 42 -
trunk/JavaScriptCore/JavaScriptCore.exp
r27711 r27746 260 260 __ZNK3KJS7JSValue7toFloatEPNS_9ExecStateE 261 261 __ZNK3KJS7JSValue9toIntegerEPNS_9ExecStateE 262 __ZNK3KJS7UString10UTF8StringEv 262 __ZNK3KJS7UString10UTF8StringEb 263 263 __ZNK3KJS7UString14toStrictUInt32EPb 264 264 __ZNK3KJS7UString5asciiEv -
trunk/JavaScriptCore/JavaScriptCore.pri
r27686 r27746 34 34 wtf/HashTable.cpp \ 35 35 wtf/FastMalloc.cpp \ 36 wtf/unicode/UTF8.cpp \ 36 37 bindings/NP_jsobject.cpp \ 37 38 bindings/npruntime.cpp \ -
trunk/JavaScriptCore/JavaScriptCore.vcproj/WTF/WTF.vcproj
r26787 r27746 312 312 > 313 313 </File> 314 <File 315 RelativePath="..\..\wtf\unicode\UTF8.h" 316 > 317 </File> 318 <File 319 RelativePath="..\..\wtf\unicode\UTF8.cpp" 320 > 321 </File> 314 322 </Files> 315 323 <Globals> -
trunk/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj
r27687 r27746 236 236 E195679609E7CF1200B89D13 /* UnicodeIcu.h in Headers */ = {isa = PBXBuildFile; fileRef = E195678F09E7CF1200B89D13 /* UnicodeIcu.h */; settings = {ATTRIBUTES = (Private, ); }; }; 237 237 E195679809E7CF1200B89D13 /* Unicode.h in Headers */ = {isa = PBXBuildFile; fileRef = E195679409E7CF1200B89D13 /* Unicode.h */; settings = {ATTRIBUTES = (Private, ); }; }; 238 E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E1EF79A80CE97BA60088D500 /* UTF8.cpp */; }; 239 E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */ = {isa = PBXBuildFile; fileRef = E1EF79A90CE97BA60088D500 /* UTF8.h */; }; 238 240 /* End PBXBuildFile section */ 239 241 … … 590 592 E195678F09E7CF1200B89D13 /* UnicodeIcu.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UnicodeIcu.h; sourceTree = "<group>"; }; 591 593 E195679409E7CF1200B89D13 /* Unicode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Unicode.h; sourceTree = "<group>"; }; 594 E1EF79A80CE97BA60088D500 /* UTF8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UTF8.cpp; sourceTree = "<group>"; }; 595 E1EF79A90CE97BA60088D500 /* UTF8.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UTF8.h; sourceTree = "<group>"; }; 592 596 F5BB2BC5030F772101FCFE1D /* completion.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = completion.h; sourceTree = "<group>"; tabWidth = 8; }; 593 597 F5C290E60284F98E018635CA /* JavaScriptCorePrefix.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; name = JavaScriptCorePrefix.h; path = ../JavaScriptCorePrefix.h; sourceTree = "<group>"; tabWidth = 8; }; … … 1088 1092 E195678E09E7CF1200B89D13 /* icu */, 1089 1093 E195679409E7CF1200B89D13 /* Unicode.h */, 1094 E1EF79A90CE97BA60088D500 /* 
UTF8.h */, 1095 E1EF79A80CE97BA60088D500 /* UTF8.cpp */, 1090 1096 ); 1091 1097 path = unicode; … … 1254 1260 932F5B5C0822A1C700736975 /* ustring.h in Headers */, 1255 1261 14ABB36F099C076400E2A24F /* value.h in Headers */, 1262 E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */, 1256 1263 ); 1257 1264 runOnlyForDeploymentPostprocessing = 0; … … 1369 1376 isa = PBXProject; 1370 1377 buildConfigurationList = 149C277108902AFE008A9EFC /* Build configuration list for PBXProject "JavaScriptCore" */; 1371 compatibilityVersion = "Xcode 2.4";1372 1378 hasScannedForEncodings = 1; 1373 1379 mainGroup = 0867D691FE84028FC02AAC07 /* JavaScriptCore */; … … 1542 1548 932F5BBA0822A1C700736975 /* runtime_object.cpp in Sources */, 1543 1549 932F5BC50822A1C700736975 /* runtime_root.cpp in Sources */, 1550 E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */, 1544 1551 ); 1545 1552 runOnlyForDeploymentPostprocessing = 0; -
trunk/JavaScriptCore/JavaScriptCoreSources.bkl
r27686 r27746 114 114 wtf/HashTable.cpp 115 115 wtf/TCSystemAlloc.cpp 116 wtf/unicode/UTF8.cpp 116 117 </set> 117 118 -
trunk/JavaScriptCore/bindings/c/c_utility.cpp
r27022 r27746 39 39 #include "runtime_root.h" 40 40 #include "Platform.h" 41 #if USE(ICU_UNICODE)42 #include <unicode/ucnv.h>43 #endif44 41 #include <wtf/Assertions.h> 42 #include <wtf/unicode/UTF8.h> 43 44 using namespace WTF::Unicode; 45 45 46 46 namespace KJS { namespace Bindings { … … 53 53 54 54 // Requires free() of returned UTF16Chars. 55 void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length)55 void convertUTF8ToUTF16(const NPUTF8* UTF8Chars, int UTF8Length, NPUTF16** UTF16Chars, unsigned int* UTF16Length) 56 56 { 57 #if USE(ICU_UNICODE)58 57 ASSERT(UTF8Chars || UTF8Length == 0); 59 58 ASSERT(UTF16Chars); … … 61 60 if (UTF8Length == -1) 62 61 UTF8Length = static_cast<int>(strlen(UTF8Chars)); 63 64 // UTF16Length maximum length is the length of the UTF8 string, plus one to include terminator 65 // Without the plus one, it will convert ok, but a warning is generated from the converter as 66 // there is not enough room for a terminating character. 
67 *UTF16Length = UTF8Length + 1; 68 69 *UTF16Chars = 0; 70 UErrorCode status = U_ZERO_ERROR; 71 UConverter* conv = ucnv_open("utf8", &status); 72 if (U_SUCCESS(status)) { 73 *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length)); 74 ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status); 75 *UTF16Length = ucnv_toUChars(conv, (::UChar*)*UTF16Chars, *UTF16Length, UTF8Chars, UTF8Length, &status); 76 ucnv_close(conv); 77 } 62 63 *UTF16Length = UTF8Length; 64 *UTF16Chars = static_cast<NPUTF16*>(malloc(sizeof(NPUTF16) * (*UTF16Length))); 78 65 66 const char* sourcestart = UTF8Chars; 67 const char* sourceend = sourcestart + UTF8Length; 68 69 ::UChar* targetstart = reinterpret_cast< ::UChar*>(*UTF16Chars); 70 ::UChar* targetend = targetstart + UTF8Length; 71 72 ConversionResult result = ConvertUTF8ToUTF16(&sourcestart, sourceend, &targetstart, targetend, true); 73 74 *UTF16Length = targetstart - *UTF16Chars; 75 79 76 // Check to see if the conversion was successful 80 77 // Some plugins return invalid UTF-8 in NPVariantType_String, see <http://bugs.webkit.org/show_bug.cgi?id=5163> 81 78 // There is no "bad data" for latin1. It is unlikely that the plugin was really sending text in this encoding, 82 79 // but it should have used UTF-8, and now we are simply avoiding a crash. 83 if ( !U_SUCCESS(status)) {80 if (result != conversionOK) { 84 81 *UTF16Length = UTF8Length; 85 82 86 83 if (!*UTF16Chars) // If the memory wasn't allocated, allocate it. 87 *UTF16Chars = (NPUTF16 84 *UTF16Chars = (NPUTF16*)malloc(sizeof(NPUTF16) * (*UTF16Length)); 88 85 89 86 for (unsigned i = 0; i < *UTF16Length; i++) 90 87 (*UTF16Chars)[i] = UTF8Chars[i] & 0xFF; 91 88 } 92 #else93 ASSERT(!"Implement me!");94 #endif95 89 } 96 90 -
trunk/JavaScriptCore/kjs/function.cpp
r27448 r27746 43 43 #include <wtf/Assertions.h> 44 44 #include <wtf/MathExtras.h> 45 #include <wtf/unicode/Unicode.h> 45 #include <wtf/unicode/UTF8.h> 46 46 47 47 using namespace WTF; … … 515 515 { 516 516 UString r = "", s, str = args[0]->toString(exec); 517 bool wasGoodUTF16; 518 CString cstr = str.UTF8String(&wasGoodUTF16); 519 if (!wasGoodUTF16) 517 CString cstr = str.UTF8String(true); 518 if (!cstr.c_str()) 520 519 return throwError(exec, URIError, "String contained an illegal UTF-16 sequence."); 521 520 const char* p = cstr.c_str(); -
trunk/JavaScriptCore/kjs/ustring.cpp
r27406 r27746 1272 1272 } 1273 1273 1274 inline int inlineUTF8SequenceLengthNonASCII(char b0) 1275 { 1276 if ((b0 & 0xC0) != 0xC0) 1277 return 0; 1278 if ((b0 & 0xE0) == 0xC0) 1279 return 2; 1280 if ((b0 & 0xF0) == 0xE0) 1281 return 3; 1282 if ((b0 & 0xF8) == 0xF0) 1283 return 4; 1284 return 0; 1285 } 1286 1287 int UTF8SequenceLengthNonASCII(char b0) 1288 { 1289 return inlineUTF8SequenceLengthNonASCII(b0); 1290 } 1291 1292 inline int inlineUTF8SequenceLength(char b0) 1293 { 1294 return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0); 1295 } 1296 1297 // Given a first byte, gives the length of the UTF-8 sequence it begins. 1298 // Returns 0 for bytes that are not legal starts of UTF-8 sequences. 1299 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). 1300 int UTF8SequenceLength(char b0) 1301 { 1302 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); 1303 } 1304 1305 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. 1306 // Only allows Unicode characters (U-00000000 to U-0010FFFF). 1307 // Returns -1 if the sequence is not valid (including presence of extra bytes). 1308 int decodeUTF8Sequence(const char *sequence) 1309 { 1310 // Handle 0-byte sequences (never valid). 1311 const unsigned char b0 = sequence[0]; 1312 const int length = inlineUTF8SequenceLength(b0); 1313 if (length == 0) 1314 return -1; 1315 1316 // Handle 1-byte sequences (plain ASCII). 1317 const unsigned char b1 = sequence[1]; 1318 if (length == 1) { 1319 if (b1) 1320 return -1; 1321 return b0; 1322 } 1323 1324 // Handle 2-byte sequences. 1325 if ((b1 & 0xC0) != 0x80) 1326 return -1; 1327 const unsigned char b2 = sequence[2]; 1328 if (length == 2) { 1329 if (b2) 1330 return -1; 1331 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); 1332 if (c < 0x80) 1333 return -1; 1334 return c; 1335 } 1336 1337 // Handle 3-byte sequences. 
1338 if ((b2 & 0xC0) != 0x80) 1339 return -1; 1340 const unsigned char b3 = sequence[3]; 1341 if (length == 3) { 1342 if (b3) 1343 return -1; 1344 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); 1345 if (c < 0x800) 1346 return -1; 1347 // UTF-16 surrogates should never appear in UTF-8 data. 1348 if (c >= 0xD800 && c <= 0xDFFF) 1349 return -1; 1350 return c; 1351 } 1352 1353 // Handle 4-byte sequences. 1354 if ((b3 & 0xC0) != 0x80) 1355 return -1; 1356 const unsigned char b4 = sequence[4]; 1357 if (length == 4) { 1358 if (b4) 1359 return -1; 1360 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); 1361 if (c < 0x10000 || c > 0x10FFFF) 1362 return -1; 1363 return c; 1364 } 1365 1366 return -1; 1367 } 1368 1369 CString UString::UTF8String(bool* utf16WasGood) const 1370 { 1371 if (utf16WasGood) 1372 *utf16WasGood = true; 1373 1274 CString UString::UTF8String(bool strict) const 1275 { 1374 1276 // Allocate a buffer big enough to hold all the characters. 1375 1277 const int length = size(); … … 1377 1279 1378 1280 // Convert to runs of 8-bit characters. 
1379 char *p = buffer.begin(); 1380 const UChar *d = data(); 1381 for (int i = 0; i != length; ++i) { 1382 unsigned short c = d[i].unicode(); 1383 if (c < 0x80) { 1384 *p++ = (char)c; 1385 } else if (c < 0x800) { 1386 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 1387 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 1388 } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) { 1389 unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF)); 1390 *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8 1391 *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set 1392 *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 1393 *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set 1394 ++i; 1395 } else { 1396 if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF) 1397 *utf16WasGood = false; 1398 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 1399 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 1400 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 1401 } 1402 } 1403 1404 // Return the result as a C string. 1405 CString result(buffer.data(), p - buffer.data()); 1406 1407 return result; 1408 } 1409 1410 CString UString::UTF8String() const 1411 { 1412 return UTF8String(0); 1281 char* p = buffer.data(); 1282 const ::UChar* d = &data()->uc; 1283 ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict); 1284 if (result != conversionOK) 1285 return CString(); 1286 1287 return CString(buffer.data(), p - buffer.data()); 1413 1288 } 1414 1289 -
trunk/JavaScriptCore/kjs/ustring.h
r27406 r27746 266 266 /** 267 267 * @return The string converted to the 8-bit string type CString(). 268 * This method is not Unicode safe and shouldn't be used unless the string 269 * is known to be ASCII. 268 270 */ 269 271 CString cstring() const; … … 279 281 /** 280 282 * Convert the string to UTF-8, assuming it is UTF-16 encoded. 281 * Since this function is tolerant of badly formed UTF-16, it can create UTF-8 282 * strings that are invalid because they have characters in the range 283 * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to 284 * be otherwise valid. 285 */ 286 CString UTF8String() const; 287 CString UTF8String(bool* utf16WasGood) const; 283 * In non-strict mode, this function is tolerant of badly formed UTF-16, it 284 * can create UTF-8 strings that are invalid because they have characters in 285 * the range U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is 286 * guaranteed to be otherwise valid. 287 * In strict mode, error is returned as null CString. 288 */ 289 CString UTF8String(bool strict = false) const; 288 290 289 291 /** … … 427 429 428 430 int compare(const UString &, const UString &); 429 430 // Given a first byte, gives the length of the UTF-8 sequence it begins. 431 // Returns 0 for bytes that are not legal starts of UTF-8 sequences. 432 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). 433 int UTF8SequenceLength(char); 434 435 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. 436 // Only allows Unicode characters (U-00000000 to U-0010FFFF). 437 // Returns -1 if the sequence is not valid (including presence of extra bytes). 438 int decodeUTF8Sequence(const char *); 439 431 440 432 inline UString::UString()
Note:
See TracChangeset
for help on using the changeset viewer.