Context Navigation

← Previous Change
Next Change →

function.cpp

Timestamp:

Aug 18, 2003, 11:51:25 AM (22 years ago)

Author:

darin

Message:

Reviewed by Maciej.

fixed 3247528 -- encodeURI missing from JavaScriptCore (needed by Crystal Reports)
fixed 3381297 -- escape method does not escape the null character
fixed 3381299 -- escape method produces incorrect escape sequences ala WinIE, rather than correct ala Gecko
fixed 3381303 -- unescape method treats escape sequences as Latin-1 ala WinIE rather than as UTF-8 ala Gecko
fixed 3381304 -- unescape method garbles strings with bad escape sequences in them

kjs/function.h: Added constants for decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent.
kjs/function.cpp: (encode): Added. New helper function for escape, encodeURI, and encodeURIComponent. (decode): Added. New helper function for unescape, decodeURI, and decodeURIComponent. (GlobalFuncImp::call): Added decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent implementations. Changed escape and unescape to use new helper functions, which fixes the four problems above.

kjs/internal.cpp: (InterpreterImp::initGlobalObject): Add decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent to the global object.

kjs/ustring.h: Added a length to the CString class so it can hold strings with null characters in them, not just null-terminated strings. This allows a null character from a UString to survive the process of UTF-16 to UTF-8 decoding. Added overloads to UString::append, UString::UTF8String, UTF8SequenceLength, decodeUTF8Sequence, convertUTF16OffsetsToUTF8Offsets, and convertUTF8OffsetsToUTF16Offsets.

kjs/ustring.cpp: (CString::CString): Set up the length properly in all the constructors. Also add a new constructor that takes a length. (CString::append): Use and set the length properly. (CString::operator=): Use and set the length properly. (operator==): Use the length and memcmp instead of strcmp. (UString::append): Added new overloads for const char * and for a single string to make it more efficient to build up a UString from pieces. The old way, a UString was created and destroyed each time you appended. (UTF8SequenceLength): New. Helper for decoding UTF-8. (decodeUTF8Sequence): New. Helper for decoding UTF-8. (UString::UTF8String): New. Decodes from UTF-16 to UTF-8. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates. (compareStringOffsets): Moved from regexp.cpp. (createSortedOffsetsArray): Moved from regexp.cpp. (convertUTF16OffsetsToUTF8Offsets): New. Converts UTF-16 offsets to UTF-8 offsets, given a UTF-8 string. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates. (convertUTF8OffsetsToUTF16Offsets): New. Converts UTF-8 offsets to UTF-16 offsets, given a UTF-8 string. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates.

fixed 3381296 -- regular expression matches with UTF-16 surrogates will treat sequences as two characters

kjs/regexp.cpp: (RegExp::RegExp): Use the new UString::UTF8String function instead of a function in this file. (RegExp::match): Use the new convertUTF16OffsetsToUTF8Offsets (and the corresponding reverse) instead of convertCharacterOffsetsToUTF8ByteOffsets in this file.

File:

: 1 edited

trunk/JavaScriptCore/kjs/function.cpp (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/JavaScriptCore/kjs/function.cpp

-              r3373
+              r4837
+}
+// Shared helper for escape, encodeURI and encodeURIComponent.
+//
+// Converts args[0] to a string, takes its UTF-8 form, and copies each byte
+// through unchanged if it appears in do_not_escape; every other byte
+// (including a null byte) is written as "%XX" with two uppercase hex digits.
+//
+// exec           execution state used for the string conversion
+// args           call arguments; only args[0] is read
+// do_not_escape  null-terminated set of characters to pass through unescaped
+//
+// Returns the encoded text as a String value.
+static Value encode(ExecState *exec, const List &args, const char *do_not_escape)
+{
+  static const char hexDigits[] = "0123456789ABCDEF";
+  UString result = "", str = args[0].toString(exec);
+  CString utf8 = str.UTF8String();
+  const char *bytes = utf8.c_str();
+  // Iterate by the stored length rather than to a terminator so that null
+  // bytes inside the string are processed too.
+  const int length = utf8.size();
+  for (int k = 0; k < length; k++) {
+    const char c = bytes[k];
+    // strchr() also matches the set's terminating null, so a null byte must be
+    // excluded explicitly or it would be treated as "do not escape".
+    if (c && strchr(do_not_escape, c)) {
+      result.append(c);
+    } else {
+      const unsigned char byte = static_cast<unsigned char>(c);
+      char escaped[4];
+      escaped[0] = '%';
+      escaped[1] = hexDigits[byte >> 4];
+      escaped[2] = hexDigits[byte & 0x0F];
+      escaped[3] = 0;
+      // Append in place instead of building a temporary UString per byte.
+      result.append(escaped);
+    }
+  }
+  return String(result);
+}
+static Value decode(ExecState *exec, const List &args, const char *do_not_unescape, bool strict)
+{
+  UString s = "", str = args[0].toString(exec);
+  int k = 0, len = str.size();
+  const UChar *d = str.data();
+  UChar u;
+  while (k < len) {
+    const UChar *p = d + k;
+    UChar c = *p;
+    if (c == '%') {
+      int charLen = 0;
+      if (k <= len - 3 && isxdigit(p[1].uc) && isxdigit(p[2].uc)) {
+        const char b0 = Lexer::convertHex(p[1].uc, p[2].uc);
+        const int sequenceLen = UTF8SequenceLength(b0);
+        if (sequenceLen != 0 && k <= len - sequenceLen * 3) {
+          charLen = sequenceLen * 3;
+          char sequence[5];
+          sequence[0] = b0;
+          for (int i = 1; i < sequenceLen; ++i) {
+            const UChar *q = p + i * 3;
+            if (q[0] == '%' && isxdigit(q[1].uc) && isxdigit(q[2].uc))
+              sequence[i] = Lexer::convertHex(q[1].uc, q[2].uc);
+            else {
+              charLen = 0;
+              break;
+            }
+          }
+          if (charLen != 0) {
+            sequence[sequenceLen] = 0;
+            const int character = decodeUTF8Sequence(sequence);
+            if (character < 0 || character >= 0x110000) {
+              charLen = 0;
+            } else if (character >= 0x10000) {
+              // Convert to surrogate pair.
+              s.append(static_cast<unsigned short>(0xD800 | ((character - 0x10000) >> 10)));
+              u = static_cast<unsigned short>(0xDC00 | ((character - 0x10000) & 0x3FF));
+            } else {
+              u = static_cast<unsigned short>(character);
+            }
+          }
+        }
+      }
+      if (charLen == 0) {
+        if (strict) {
+          Object error = Error::create(exec, URIError);
+          exec->setException(error);
+          return error;
+        }
+        // The only case where we don't use "strict" mode is the "unescape" function.
+        // For that, it's good to support the wonky "%u" syntax for compatibility with WinIE.
+        if (k <= len - 6 && p[1] == 'u'
+            && isxdigit(p[2].uc) && isxdigit(p[3].uc)
+            && isxdigit(p[4].uc) && isxdigit(p[5].uc)) {
+          charLen = 6;
+          u = Lexer::convertUnicode(p[2].uc, p[3].uc, p[4].uc, p[5].uc);
+        }
+      }
+      if (charLen && (u.uc == 0 || u.uc >= 128 || !strchr(do_not_unescape, u.low()))) {
+        c = u;
+        k += charLen - 1;
+      }
+    }
+    k++;
+    s.append(c);
+  }
+  return String(s);
+}
 Value GlobalFuncImp::call(ExecState *exec, Object &/*thisObj*/, const List &args)
+{
   Value res;
+  static const char non_escape[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                   "abcdefghijklmnopqrstuvwxyz"
+                                   "0123456789@*_+-./";
+  static const char do_not_escape[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789"
+    "*+-./@_";
+  static const char do_not_escape_when_encoding_URI_component[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789"
+    "!'()*-._~";
+  static const char do_not_escape_when_encoding_URI[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789"
+    "!#$&'()*+,-./:;=?@_~";
+  static const char do_not_unescape_when_decoding_URI[] =
+    "#$&+,/:;=?@";
   switch (id) {
 …
     break;
+  }
+  case Escape: {
+    UString r = "", s, str = args[0].toString(exec);
+    const UChar *c = str.data();
+    for (int k = 0; k < str.size(); k++, c++) {
+      int u = c->uc;
+      if (u > 255) {
+        char tmp[7];
+        sprintf(tmp, "%%u%04X", u);
+        s = UString(tmp);
+      } else if (strchr(non_escape, (char)u)) {
+        s = UString(c, 1);
+      } else {
+        char tmp[4];
+        sprintf(tmp, "%%%02X", u);
+        s = UString(tmp);
+      }
+      r += s;
+    }
+    res = String(r);
+    break;
+  }
+  case UnEscape: {
+    UString s, str = args[0].toString(exec);
+    int k = 0, len = str.size();
+    UChar u;
+    while (k < len) {
+      const UChar *c = str.data() + k;
+      if (*c == UChar('%') && k <= len - 6 && *(c+1) == UChar('u')) {
+        u = Lexer::convertUnicode((c+2)->uc, (c+3)->uc,
+                                  (c+4)->uc, (c+5)->uc);
+        c = &u;
+        k += 5;
+      } else if (*c == UChar('%') && k <= len - 3) {
+        u = UChar(Lexer::convertHex((c+1)->uc, (c+2)->uc));
+        c = &u;
+        k += 2;
+      }
+      k++;
+      s += UString(c, 1);
+    }
+    res = String(s);
+    break;
+  }
+  case DecodeURI:
+    res = decode(exec, args, do_not_unescape_when_decoding_URI, true);
+    break;
+  case DecodeURIComponent:
+    res = decode(exec, args, "", true);
+    break;
+  case EncodeURI:
+    res = encode(exec, args, do_not_escape_when_encoding_URI);
+    break;
+  case EncodeURIComponent:
+    res = encode(exec, args, do_not_escape_when_encoding_URI_component);
+    break;
+  case Escape:
+    res = encode(exec, args, do_not_escape);
+    break;
+  case UnEscape:
+    res = decode(exec, args, "", false);
+    break;
 #ifndef NDEBUG
+  case KJSPrint: {
+    UString str = args[0].toString(exec);
+    puts(str.ascii());
+  }
+  case KJSPrint:
+    puts(args[0].toString(exec).ascii());
+    break;
 #endif
+  }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 4837 in webkit for trunk/JavaScriptCore/kjs/function.cpp

Legend:

trunk/JavaScriptCore/kjs/function.cpp

Download in other formats: