Context Navigation

← Previous Change
Next Change →

Changeset 27746 in webkit for trunk/JavaScriptCore

Timestamp:

Nov 12, 2007, 11:12:55 PM (18 years ago)

Author:

[email protected]

Message:

Reviewed by Darin.

http://bugs.webkit.org/show_bug.cgi?id=15953
Add UTF-8 encoding/decoding to WTF

kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient parameter. Callers are not interested in getting decoding results in strict mode, so this allows for bailing out as soon as an error is seen.

kjs/function.cpp: (KJS::encode): Updated for new UString::UTF8String() signature.

API/JSStringRef.cpp: (JSStringCreateWithCharacters): Disambiguate UChar. (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().

wtf/unicode/UTF8.cpp: Added. (WTF::Unicode::inlineUTF8SequenceLengthNonASCII): (WTF::Unicode::inlineUTF8SequenceLength): (WTF::Unicode::UTF8SequenceLength): (WTF::Unicode::decodeUTF8Sequence): (WTF::Unicode::): (WTF::Unicode::ConvertUTF16ToUTF8): (WTF::Unicode::isLegalUTF8): (WTF::Unicode::ConvertUTF8ToUTF16):
wtf/unicode/UTF8.h: Added. (WTF::Unicode::): Some code moved from ustring.h, some adapted from unicode.org sources.

JavaScriptCore.exp:
JavaScriptCore.pri:
JavaScriptCore.vcproj/WTF/WTF.vcproj:
JavaScriptCore.xcodeproj/project.pbxproj:
JavaScriptCoreSources.bkl: Added UTF8.{h,cpp}

Location:

trunk/JavaScriptCore

Files:

: 2 added
: 11 edited

API/JSStringRef.cpp (modified) (2 diffs)
ChangeLog (modified) (1 diff)
JavaScriptCore.exp (modified) (1 diff)
JavaScriptCore.pri (modified) (1 diff)
JavaScriptCore.vcproj/WTF/WTF.vcproj (modified) (1 diff)
JavaScriptCore.xcodeproj/project.pbxproj (modified) (6 diffs)
JavaScriptCoreSources.bkl (modified) (1 diff)
bindings/c/c_utility.cpp (modified) (3 diffs)
kjs/function.cpp (modified) (2 diffs)
kjs/ustring.cpp (modified) (2 diffs)
kjs/ustring.h (modified) (3 diffs)
wtf/unicode/UTF8.cpp (added)
wtf/unicode/UTF8.h (added)

Legend:

: Unmodified
: Added
: Removed

trunk/JavaScriptCore/API/JSStringRef.cpp

-              r27730
+              r27746
 #include <kjs/ustring.h>
 #include <kjs/value.h>
+#include <wtf/unicode/UTF8.h>
 using namespace KJS;
+using namespace WTF::Unicode;
 JSStringRef JSStringCreateWithCharacters(const JSChar* chars, size_t numChars)
+{
     JSLock lock;
     return toRef(UString(reinterpret_cast<const UChar*>(chars), static_cast<int>(numChars)).rep()->ref());
+    return toRef(UString(reinterpret_cast<const KJS::UChar*>(chars), static_cast<int>(numChars)).rep()->ref());
+}
 …
+{
     JSLock lock;
+    // FIXME: <rdar://problem/4949018>
+    return toRef(UString(string).rep()->ref());
+    size_t length = strlen(string);
+    Vector< ::UChar, 1024> buffer(length);
+    ::UChar* p = buffer.data();
+    ConvertUTF8ToUTF16(&string, string + length, &p, p + length, false);
+    return toRef(UString(reinterpret_cast<KJS::UChar*>(buffer.data()), p - buffer.data()).rep()->ref());
+}

trunk/JavaScriptCore/ChangeLog

-              r27745
+              r27746
+2007-11-12  Alexey Proskuryakov  <[email protected]>
+        Reviewed by Darin.
+        http://bugs.webkit.org/show_bug.cgi?id=15953
+        Add UTF-8 encoding/decoding to WTF
+        * kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
+        * kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient
+        parameter. Callers are not interested in getting decoding results in strict mode, so
+        this allows for bailing out as soon as an error is seen.
+        * kjs/function.cpp:
+        (KJS::encode): Updated for new UString::UTF8String() signature.
+        * API/JSStringRef.cpp:
+        (JSStringCreateWithCharacters): Disambiguate UChar.
+        (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
+        * bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().
+        * wtf/unicode/UTF8.cpp: Added.
+        (WTF::Unicode::inlineUTF8SequenceLengthNonASCII):
+        (WTF::Unicode::inlineUTF8SequenceLength):
+        (WTF::Unicode::UTF8SequenceLength):
+        (WTF::Unicode::decodeUTF8Sequence):
+        (WTF::Unicode::):
+        (WTF::Unicode::ConvertUTF16ToUTF8):
+        (WTF::Unicode::isLegalUTF8):
+        (WTF::Unicode::ConvertUTF8ToUTF16):
+        * wtf/unicode/UTF8.h: Added.
+        (WTF::Unicode::):
+        Some code moved from ustring.h, some adapted from unicode.org sources.
+        * JavaScriptCore.exp:
+        * JavaScriptCore.pri:
+        * JavaScriptCore.vcproj/WTF/WTF.vcproj:
+        * JavaScriptCore.xcodeproj/project.pbxproj:
+        * JavaScriptCoreSources.bkl:
+        Added UTF8.{h,cpp}
 2007-11-12  Josh Aas  <[email protected]>

trunk/JavaScriptCore/JavaScriptCore.exp

r27711	r27746
260	260	__ZNK3KJS7JSValue7toFloatEPNS_9ExecStateE
261	261	__ZNK3KJS7JSValue9toIntegerEPNS_9ExecStateE
262		__ZNK3KJS7UString10UTF8StringEv
	262	__ZNK3KJS7UString10UTF8StringEb
263	263	__ZNK3KJS7UString14toStrictUInt32EPb
264	264	__ZNK3KJS7UString5asciiEv

trunk/JavaScriptCore/JavaScriptCore.pri

r27686	r27746
34	34	wtf/HashTable.cpp \
35	35	wtf/FastMalloc.cpp \
	36	wtf/unicode/UTF8.cpp \
36	37	bindings/NP_jsobject.cpp \
37	38	bindings/npruntime.cpp \

trunk/JavaScriptCore/JavaScriptCore.vcproj/WTF/WTF.vcproj

-              r26787
+              r27746
+                        >
                 </File>
+                <File
+                        RelativePath="..\..\wtf\unicode\UTF8.h"
+                        >
+                </File>
+                <File
+                        RelativePath="..\..\wtf\unicode\UTF8.cpp"
+                        >
+                </File>
         </Files>
         <Globals>

trunk/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj

-              r27687
+              r27746
                 E195679609E7CF1200B89D13 /* UnicodeIcu.h in Headers */ = {isa = PBXBuildFile; fileRef = E195678F09E7CF1200B89D13 /* UnicodeIcu.h */; settings = {ATTRIBUTES = (Private, ); }; };
                 E195679809E7CF1200B89D13 /* Unicode.h in Headers */ = {isa = PBXBuildFile; fileRef = E195679409E7CF1200B89D13 /* Unicode.h */; settings = {ATTRIBUTES = (Private, ); }; };
+                E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E1EF79A80CE97BA60088D500 /* UTF8.cpp */; };
+                E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */ = {isa = PBXBuildFile; fileRef = E1EF79A90CE97BA60088D500 /* UTF8.h */; };
 /* End PBXBuildFile section */
 …
                 E195678F09E7CF1200B89D13 /* UnicodeIcu.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UnicodeIcu.h; sourceTree = "<group>"; };
                 E195679409E7CF1200B89D13 /* Unicode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Unicode.h; sourceTree = "<group>"; };
+                E1EF79A80CE97BA60088D500 /* UTF8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UTF8.cpp; sourceTree = "<group>"; };
+                E1EF79A90CE97BA60088D500 /* UTF8.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UTF8.h; sourceTree = "<group>"; };
                 F5BB2BC5030F772101FCFE1D /* completion.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = completion.h; sourceTree = "<group>"; tabWidth = 8; };
                 F5C290E60284F98E018635CA /* JavaScriptCorePrefix.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; name = JavaScriptCorePrefix.h; path = ../JavaScriptCorePrefix.h; sourceTree = "<group>"; tabWidth = 8; };
 …
                                 E195678E09E7CF1200B89D13 /* icu */,
                                 E195679409E7CF1200B89D13 /* Unicode.h */,
+                                E1EF79A90CE97BA60088D500 /* UTF8.h */,
+                                E1EF79A80CE97BA60088D500 /* UTF8.cpp */,
                         );
                         path = unicode;
 …
 F5B5C0822A1C700736975 /* ustring.h in Headers */,
 ABB36F099C076400E2A24F /* value.h in Headers */,
+                                E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
 …
                         isa = PBXProject;
                         buildConfigurationList = 149C277108902AFE008A9EFC /* Build configuration list for PBXProject "JavaScriptCore" */;
-                        compatibilityVersion = "Xcode 2.4";
                         hasScannedForEncodings = 1;
                         mainGroup = 0867D691FE84028FC02AAC07 /* JavaScriptCore */;
 …
 F5BBA0822A1C700736975 /* runtime_object.cpp in Sources */,
 F5BC50822A1C700736975 /* runtime_root.cpp in Sources */,
+                                E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;

trunk/JavaScriptCore/JavaScriptCoreSources.bkl

r27686	r27746
114	114	wtf/HashTable.cpp
115	115	wtf/TCSystemAlloc.cpp
	116	wtf/unicode/UTF8.cpp
116	117	</set>
117	118

trunk/JavaScriptCore/bindings/c/c_utility.cpp

-              r27022
+              r27746
 #include "runtime_root.h"
 #include "Platform.h"
-#if USE(ICU_UNICODE)
-#include <unicode/ucnv.h>
-#endif
 #include <wtf/Assertions.h>
+#include <wtf/unicode/UTF8.h>
+using namespace WTF::Unicode;
 namespace KJS { namespace Bindings {
 …
 // Requires free() of returned UTF16Chars.
 void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length)
+void convertUTF8ToUTF16(const NPUTF8* UTF8Chars, int UTF8Length, NPUTF16** UTF16Chars, unsigned int* UTF16Length)
+{
-#if USE(ICU_UNICODE)
     ASSERT(UTF8Chars || UTF8Length == 0);
     ASSERT(UTF16Chars);
 …
     if (UTF8Length == -1)
         UTF8Length = static_cast<int>(strlen(UTF8Chars));
+    // UTF16Length maximum length is the length of the UTF8 string, plus one to include terminator
+    // Without the plus one, it will convert ok, but a warning is generated from the converter as
+    // there is not enough room for a terminating character.
+    *UTF16Length = UTF8Length + 1;
+    *UTF16Chars = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    UConverter* conv = ucnv_open("utf8", &status);
+    if (U_SUCCESS(status)) {
+        *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length));
+        ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status);
+        *UTF16Length = ucnv_toUChars(conv, (::UChar*)*UTF16Chars, *UTF16Length, UTF8Chars, UTF8Length, &status);
+        ucnv_close(conv);
+    }
+    *UTF16Length = UTF8Length;
+    *UTF16Chars = static_cast<NPUTF16*>(malloc(sizeof(NPUTF16) * (*UTF16Length)));
+    const char* sourcestart = UTF8Chars;
+    const char* sourceend = sourcestart + UTF8Length;
+    ::UChar* targetstart = reinterpret_cast< ::UChar*>(*UTF16Chars);
+    ::UChar* targetend = targetstart + UTF8Length;
+    ConversionResult result = ConvertUTF8ToUTF16(&sourcestart, sourceend, &targetstart, targetend, true);
+    *UTF16Length = targetstart - *UTF16Chars;
     // Check to see if the conversion was successful
     // Some plugins return invalid UTF-8 in NPVariantType_String, see <http://bugs.webkit.org/show_bug.cgi?id=5163>
     // There is no "bad data" for latin1. It is unlikely that the plugin was really sending text in this encoding,
     // but it should have used UTF-8, and now we are simply avoiding a crash.
     if (!U_SUCCESS(status)) {
+    if (result != conversionOK) {
         *UTF16Length = UTF8Length;
         if (!*UTF16Chars)   // If the memory wasn't allocated, allocate it.
             *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length));
+            *UTF16Chars = (NPUTF16*)malloc(sizeof(NPUTF16) * (*UTF16Length));
         for (unsigned i = 0; i < *UTF16Length; i++)
             (*UTF16Chars)[i] = UTF8Chars[i] & 0xFF;
+    }
-#else
-    ASSERT(!"Implement me!");
-#endif
+}

trunk/JavaScriptCore/kjs/function.cpp

-              r27448
+              r27746
 #include <wtf/Assertions.h>
 #include <wtf/MathExtras.h>
 #include <wtf/unicode/Unicode.h>
+#include <wtf/unicode/UTF8.h>
 using namespace WTF;
 …
+{
   UString r = "", s, str = args[0]->toString(exec);
+  bool wasGoodUTF16;
+  CString cstr = str.UTF8String(&wasGoodUTF16);
+  if (!wasGoodUTF16)
+  CString cstr = str.UTF8String(true);
+  if (!cstr.c_str())
     return throwError(exec, URIError, "String contained an illegal UTF-16 sequence.");
   const char* p = cstr.c_str();

trunk/JavaScriptCore/kjs/ustring.cpp

-              r27406
+              r27746
+}
+inline int inlineUTF8SequenceLengthNonASCII(char b0) // Sequence length (2, 3 or 4) implied by lead byte b0; 0 if b0 cannot start a multi-byte sequence.
+{
+  if ((b0 & 0xC0) != 0xC0) // top two bits are not both set: not a multi-byte lead byte (e.g. a 10xxxxxx continuation byte)
+    return 0;
+  if ((b0 & 0xE0) == 0xC0) // 110xxxxx
+    return 2;
+  if ((b0 & 0xF0) == 0xE0) // 1110xxxx
+    return 3;
+  if ((b0 & 0xF8) == 0xF0) // 11110xxx
+    return 4;
+  return 0; // 11111xxx: no sequence length is assigned to these bytes
+}
+int UTF8SequenceLengthNonASCII(char b0) // Out-of-line wrapper that forwards to inlineUTF8SequenceLengthNonASCII().
+{
+  return inlineUTF8SequenceLengthNonASCII(b0);
+}
+inline int inlineUTF8SequenceLength(char b0) // Length of the sequence started by b0: 1 for ASCII, otherwise whatever the non-ASCII helper reports (0 when invalid).
+{
+  return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0); // high bit clear means one-byte ASCII; note the non-ASCII path calls the out-of-line wrapper, not the inline helper
+}
+// Given the first byte of a UTF-8 sequence, returns the length in bytes of the sequence it begins.
+// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
+// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
+int UTF8SequenceLength(char b0)
+{
+  return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); // high bit clear means a one-byte ASCII sequence
+}
+// Takes a null-terminated C-style string holding exactly one UTF-8 sequence and converts it to a character (code point).
+// Only allows Unicode characters (U-00000000 to U-0010FFFF); overlong encodings and UTF-16 surrogates are rejected.
+// Returns -1 if the sequence is not valid (including presence of extra bytes after the character).
+int decodeUTF8Sequence(const char *sequence)
+{
+  // Reject lead bytes that cannot begin a sequence (reported as length 0).
+  const unsigned char b0 = sequence[0];
+  const int length = inlineUTF8SequenceLength(b0);
+  if (length == 0)
+    return -1;
+  // Handle 1-byte sequences (plain ASCII).
+  const unsigned char b1 = sequence[1]; // NOTE(review): for "" (b0 == 0, so length == 1) this reads one byte past the terminator
+  if (length == 1) {
+    if (b1) // any byte after the character makes the string invalid
+      return -1;
+    return b0;
+  }
+  // Handle 2-byte sequences.
+  if ((b1 & 0xC0) != 0x80) // continuation bytes must be 10xxxxxx; a premature terminator (0) also fails here
+    return -1;
+  const unsigned char b2 = sequence[2];
+  if (length == 2) {
+    if (b2) // extra bytes after the character
+      return -1;
+    const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+    if (c < 0x80) // overlong: the value fits in a single byte
+      return -1;
+    return c;
+  }
+  // Handle 3-byte sequences.
+  if ((b2 & 0xC0) != 0x80) // second continuation byte must be 10xxxxxx
+    return -1;
+  const unsigned char b3 = sequence[3];
+  if (length == 3) {
+    if (b3) // extra bytes after the character
+      return -1;
+    const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+    if (c < 0x800) // overlong: the value fits in two bytes or fewer
+      return -1;
+    // UTF-16 surrogates should never appear in UTF-8 data.
+    if (c >= 0xD800 && c <= 0xDFFF)
+      return -1;
+    return c;
+  }
+  // Handle 4-byte sequences.
+  if ((b3 & 0xC0) != 0x80) // third continuation byte must be 10xxxxxx
+    return -1;
+  const unsigned char b4 = sequence[4];
+  if (length == 4) {
+    if (b4) // extra bytes after the character
+      return -1;
+    const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+    if (c < 0x10000 || c > 0x10FFFF) // overlong, or beyond the last allowed character
+      return -1;
+    return c;
+  }
+  return -1; // fallback; the length helpers above only yield 0, 1, 2, 3 or 4, all handled earlier
+}
+CString UString::UTF8String(bool* utf16WasGood) const
+{
+  if (utf16WasGood)
+    *utf16WasGood = true;
+CString UString::UTF8String(bool strict) const
+{
   // Allocate a buffer big enough to hold all the characters.
   const int length = size();
 …
   // Convert to runs of 8-bit characters.
+  char *p = buffer.begin();
+  const UChar *d = data();
+  for (int i = 0; i != length; ++i) {
+    unsigned short c = d[i].unicode();
+    if (c < 0x80) {
+      *p++ = (char)c;
+    } else if (c < 0x800) {
+      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
+      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+    } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
+      unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
+      *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
+      *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
+      *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
+      *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
+      ++i;
+    } else {
+      if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF)
+        *utf16WasGood = false;
+      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
+      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
+      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+    }
+  }
+  // Return the result as a C string.
+  CString result(buffer.data(), p - buffer.data());
+  return result;
+}
+CString UString::UTF8String() const
+{
+    return UTF8String(0);
+  char* p = buffer.data();
+  const ::UChar* d = &data()->uc;
+  ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
+  if (result != conversionOK)
+    return CString();
+  return CString(buffer.data(), p - buffer.data());
+}

trunk/JavaScriptCore/kjs/ustring.h

-              r27406
+              r27746
     /**
      * @return The string converted to the 8-bit string type CString().
+     * This method is not Unicode safe and shouldn't be used unless the string
+     * is known to be ASCII.
      */
     CString cstring() const;
 …
     /**
      * Convert the string to UTF-8, assuming it is UTF-16 encoded.
      * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
      * strings that are invalid because they have characters in the range
      * U+D800-U+DFFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
      * be otherwise valid.
      */
     CString UTF8String() const;
     CString UTF8String(bool* utf16WasGood) const;
+     * In non-strict mode, this function is tolerant of badly formed UTF-16, so it
+     * can create UTF-8 strings that are invalid because they have characters in
+     * the range U+D800-U+DFFF, U+FFFE, or U+FFFF, but the UTF-8 string is
+     * guaranteed to be otherwise valid.
+     * In strict mode, an error is returned as a null CString.
+     */
+    CString UTF8String(bool strict = false) const;
     /**
 …
   int compare(const UString &, const UString &);
-  // Given a first byte, gives the length of the UTF-8 sequence it begins.
-  // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
-  // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
-  int UTF8SequenceLength(char);
-  // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
-  // Only allows Unicode characters (U-00000000 to U-0010FFFF).
-  // Returns -1 if the sequence is not valid (including presence of extra bytes).
-  int decodeUTF8Sequence(const char *);
 inline UString::UString()

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 27746 in webkit for trunk/JavaScriptCore

Legend:

Download in other formats: