Changeset 27746 in webkit for trunk/JavaScriptCore/kjs


Timestamp: Nov 12, 2007, 11:12:55 PM
Author: [email protected]
Message:

Reviewed by Darin.

http://bugs.webkit.org/show_bug.cgi?id=15953
Add UTF-8 encoding/decoding to WTF

  • kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
  • kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient parameter. Callers that request strict mode have no use for a partially decoded result, so this allows bailing out as soon as an error is seen (see the sketch after this list).
  • kjs/function.cpp: (KJS::encode): Updated for new UString::UTF8String() signature.
  • API/JSStringRef.cpp: (JSStringCreateWithCharacters): Disambiguate UChar. (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
  • bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().
  • wtf/unicode/UTF8.cpp: Added. (WTF::Unicode::inlineUTF8SequenceLengthNonASCII): (WTF::Unicode::inlineUTF8SequenceLength): (WTF::Unicode::UTF8SequenceLength): (WTF::Unicode::decodeUTF8Sequence): (WTF::Unicode::): (WTF::Unicode::ConvertUTF16ToUTF8): (WTF::Unicode::isLegalUTF8): (WTF::Unicode::ConvertUTF8ToUTF16):
  • wtf/unicode/UTF8.h: Added. (WTF::Unicode::): Some code moved from ustring.h, some adapted from unicode.org sources.
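
For context, here is a minimal sketch of the new UTF8String() contract described above (the wrapper function is hypothetical and not part of the patch):

    // Sketch only: strict vs. lenient conversion under the new signature.
    #include "ustring.h"

    using namespace KJS;

    static void demonstrate(const UString& s)
    {
        // Strict mode: malformed UTF-16 (e.g. an unpaired surrogate) makes the
        // whole conversion fail, reported as a null CString.
        CString strict = s.UTF8String(true);
        if (!strict.c_str()) {
            // Handle the error, as encode() in kjs/function.cpp now does.
        }

        // Lenient mode (the default): conversion always produces output, but it
        // may contain invalid sequences standing in for unpaired surrogates.
        CString lenient = s.UTF8String();
    }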
Location: trunk/JavaScriptCore/kjs
Files: 3 edited

  • trunk/JavaScriptCore/kjs/function.cpp

    r27448 → r27746

    @@ -43,5 +43,5 @@
     #include <wtf/Assertions.h>
     #include <wtf/MathExtras.h>
    -#include <wtf/unicode/Unicode.h>
    +#include <wtf/unicode/UTF8.h>
     
     using namespace WTF;
    @@ -515,7 +515,6 @@
     {
       UString r = "", s, str = args[0]->toString(exec);
    -  bool wasGoodUTF16;
    -  CString cstr = str.UTF8String(&wasGoodUTF16);
    -  if (!wasGoodUTF16)
    +  CString cstr = str.UTF8String(true);
    +  if (!cstr.c_str())
         return throwError(exec, URIError, "String contained an illegal UTF-16 sequence.");
       const char* p = cstr.c_str();
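
The new encode() relies on strict-mode failure being observable on the returned CString itself; a default-constructed CString carries a null c_str(). A hedged sketch of that pattern (the helper name is hypothetical):

    // Error signaling via a null CString, which is what the new call site checks.
    static bool utf8ForURI(const UString& str, CString& out)
    {
        CString cstr = str.UTF8String(true); // strict: fail on bad UTF-16
        if (!cstr.c_str())
            return false; // caller throws URIError, as encode() does
        out = cstr;
        return true;
    }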
  • trunk/JavaScriptCore/kjs/ustring.cpp

    r27406 → r27746

    @@ -1272,143 +1272,18 @@
     }
     
    -inline int inlineUTF8SequenceLengthNonASCII(char b0)
    -{
    -  if ((b0 & 0xC0) != 0xC0)
    -    return 0;
    -  if ((b0 & 0xE0) == 0xC0)
    -    return 2;
    -  if ((b0 & 0xF0) == 0xE0)
    -    return 3;
    -  if ((b0 & 0xF8) == 0xF0)
    -    return 4;
    -  return 0;
    -}
    -
    -int UTF8SequenceLengthNonASCII(char b0)
    -{
    -  return inlineUTF8SequenceLengthNonASCII(b0);
    -}
    -
    -inline int inlineUTF8SequenceLength(char b0)
    -{
    -  return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
    -}
    -
    -// Given a first byte, gives the length of the UTF-8 sequence it begins.
    -// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
    -// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
    -int UTF8SequenceLength(char b0)
    -{
    -  return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
    -}
    -
    -// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
    -// Only allows Unicode characters (U-00000000 to U-0010FFFF).
    -// Returns -1 if the sequence is not valid (including presence of extra bytes).
    -int decodeUTF8Sequence(const char *sequence)
    -{
    -  // Handle 0-byte sequences (never valid).
    -  const unsigned char b0 = sequence[0];
    -  const int length = inlineUTF8SequenceLength(b0);
    -  if (length == 0)
    -    return -1;
    -
    -  // Handle 1-byte sequences (plain ASCII).
    -  const unsigned char b1 = sequence[1];
    -  if (length == 1) {
    -    if (b1)
    -      return -1;
    -    return b0;
    -  }
    -
    -  // Handle 2-byte sequences.
    -  if ((b1 & 0xC0) != 0x80)
    -    return -1;
    -  const unsigned char b2 = sequence[2];
    -  if (length == 2) {
    -    if (b2)
    -      return -1;
    -    const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
    -    if (c < 0x80)
    -      return -1;
    -    return c;
    -  }
    -
    -  // Handle 3-byte sequences.
    -  if ((b2 & 0xC0) != 0x80)
    -    return -1;
    -  const unsigned char b3 = sequence[3];
    -  if (length == 3) {
    -    if (b3)
    -      return -1;
    -    const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
    -    if (c < 0x800)
    -      return -1;
    -    // UTF-16 surrogates should never appear in UTF-8 data.
    -    if (c >= 0xD800 && c <= 0xDFFF)
    -      return -1;
    -    return c;
    -  }
    -
    -  // Handle 4-byte sequences.
    -  if ((b3 & 0xC0) != 0x80)
    -    return -1;
    -  const unsigned char b4 = sequence[4];
    -  if (length == 4) {
    -    if (b4)
    -      return -1;
    -    const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
    -    if (c < 0x10000 || c > 0x10FFFF)
    -      return -1;
    -    return c;
    -  }
    -
    -  return -1;
    -}
    -
    -CString UString::UTF8String(bool* utf16WasGood) const
    -{
    -  if (utf16WasGood)
    -    *utf16WasGood = true;
    -
    +CString UString::UTF8String(bool strict) const
    +{
       // Allocate a buffer big enough to hold all the characters.
       const int length = size();
       Vector<char, 1024> buffer(length * 3);
     
       // Convert to runs of 8-bit characters.
    -  char *p = buffer.begin();
    -  const UChar *d = data();
    -  for (int i = 0; i != length; ++i) {
    -    unsigned short c = d[i].unicode();
    -    if (c < 0x80) {
    -      *p++ = (char)c;
    -    } else if (c < 0x800) {
    -      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
    -      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
    -    } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
    -      unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
    -      *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
    -      *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      ++i;
    -    } else {
    -      if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF)
    -        *utf16WasGood = false;
    -      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
    -      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
    -      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
    -    }
    -  }
    -
    -  // Return the result as a C string.
    -  CString result(buffer.data(), p - buffer.data());
    -
    -  return result;
    -}
    -
    -CString UString::UTF8String() const
    -{
    -    return UTF8String(0);
    +  char* p = buffer.data();
    +  const ::UChar* d = &data()->uc;
    +  ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
    +  if (result != conversionOK)
    +    return CString();
    +
    +  return CString(buffer.data(), p - buffer.data());
     }
     
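
Two details of the new implementation are worth spelling out. The buffer is sized at three bytes per UTF-16 code unit, which is a true worst case: a BMP code unit expands to at most three UTF-8 bytes, and a surrogate pair consumes two code units while emitting only four bytes. Also, ConvertUTF16ToUTF8() advances the source and target pointers through its in/out arguments, so the number of bytes written falls out of pointer arithmetic. A sketch of driving the converter the same way (parameter names and the using-directives are assumptions inferred from the call shape in this diff):

    #include <wtf/Vector.h>
    #include <wtf/unicode/UTF8.h>

    using namespace WTF;
    using namespace WTF::Unicode;

    // Convert `length` UTF-16 code units to UTF-8; false means invalid input.
    static bool toUTF8(const ::UChar* data, int length, Vector<char>& out)
    {
        Vector<char, 1024> buffer(length * 3); // worst case: 3 bytes per code unit
        const ::UChar* source = data;
        char* target = buffer.data();
        ConversionResult result = ConvertUTF16ToUTF8(&source, source + length,
                                                     &target, target + buffer.size(),
                                                     true /* strict */);
        if (result != conversionOK)
            return false;
        out.append(buffer.data(), target - buffer.data());
        return true;
    }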
  • trunk/JavaScriptCore/kjs/ustring.h

    r27406 → r27746

    @@ -266,4 +266,6 @@
         /**
          * @return The string converted to the 8-bit string type CString().
    +     * This method is not Unicode safe and shouldn't be used unless the string
    +     * is known to be ASCII.
          */
         CString cstring() const;
    @@ -279,11 +281,11 @@
         /**
          * Convert the string to UTF-8, assuming it is UTF-16 encoded.
    -     * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
    -     * strings that are invalid because they have characters in the range
    -     * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
    -     * be otherwise valid.
    -     */
    -    CString UTF8String() const;
    -    CString UTF8String(bool* utf16WasGood) const;
    +     * In non-strict mode, this function is tolerant of badly formed UTF-16, it
    +     * can create UTF-8 strings that are invalid because they have characters in
    +     * the range U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is
    +     * guaranteed to be otherwise valid.
    +     * In strict mode, error is returned as null CString.
    +     */
    +    CString UTF8String(bool strict = false) const;
     
         /**
    @@ -427,14 +429,4 @@
     
       int compare(const UString &, const UString &);
    -
    -  // Given a first byte, gives the length of the UTF-8 sequence it begins.
    -  // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
    -  // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
    -  int UTF8SequenceLength(char);
    -
    -  // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
    -  // Only allows Unicode characters (U-00000000 to U-0010FFFF).
    -  // Returns -1 if the sequence is not valid (including presence of extra bytes).
    -  int decodeUTF8Sequence(const char *);
     
     inline UString::UString()
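
The declarations removed here now live in wtf/unicode/UTF8.h, as the ChangeLog notes. A quick worked check of the decoder's documented behavior (a sketch; the WTF::Unicode qualification follows the ChangeLog above):

    #include <wtf/unicode/UTF8.h>
    #include <assert.h>

    int main()
    {
        using namespace WTF::Unicode;
        // U+20AC EURO SIGN is the three-byte sequence E2 82 AC.
        assert(UTF8SequenceLength('\xE2') == 3);
        assert(decodeUTF8Sequence("\xE2\x82\xAC") == 0x20AC);
        // A lone surrogate encoded as UTF-8 (ED A0 80) is rejected.
        assert(decodeUTF8Sequence("\xED\xA0\x80") == -1);
        return 0;
    }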