Ignore:
Timestamp:
Aug 18, 2003, 11:51:25 AM (22 years ago)
Author:
darin
Message:

Reviewed by Maciej.

  • fixed 3247528 -- encodeURI missing from JavaScriptCore (needed by Crystal Reports)
  • fixed 3381297 -- escape method does not escape the null character
  • fixed 3381299 -- escape method produces incorrect escape sequences ala WinIE, rather than correct ala Gecko
  • fixed 3381303 -- unescape method treats escape sequences as Latin-1 ala WinIE rather than as UTF-8 ala Gecko
  • fixed 3381304 -- unescape method garbles strings with bad escape sequences in them
  • kjs/function.h: Added constants for decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent.
  • kjs/function.cpp: (encode): Added. New helper function for escape, encodeURI, and encodeURIComponent. (decode): Added. New helper function for unescape, decodeURI, and decodeURIComponent. (GlobalFuncImp::call): Added decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent implementations. Changed escape and unescape to use new helper functions, which fixes the four problems above.
  • kjs/internal.cpp: (InterpreterImp::initGlobalObject): Add decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent to the global object.
  • kjs/ustring.h: Added a length to the CString class so it can hold strings with null characters in them, not just null-terminated strings. This allows a null character from a UString to survive the process of UTF-16 to UTF-8 decoding. Added overloads to UString::append, UString::UTF8String, UTF8SequenceLength, decodeUTF8Sequence, convertUTF16OffsetsToUTF8Offsets, and convertUTF8OffsetsToUTF16Offsets.
  • kjs/ustring.cpp: (CString::CString): Set up the length properly in all the constructors. Also add a new constructor that takes a length. (CString::append): Use and set the length properly. (CString::operator=): Use and set the length properly. (operator==): Use the length and memcmp instead of strcmp. (UString::append): Added new overloads for const char * and for a single string to make it more efficient to build up a UString from pieces. The old way, a UString was created and destroyed each time you appended. (UTF8SequenceLength): New. Helper for decoding UTF-8. (decodeUTF8Sequence): New. Helper for decoding UTF-8. (UString::UTF8String): New. Decodes from UTF-16 to UTF-8. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates. (compareStringOffsets): Moved from regexp.cpp. (createSortedOffsetsArray): Moved from regexp.cpp. (convertUTF16OffsetsToUTF8Offsets): New. Converts UTF-16 offsets to UTF-8 offsets, given a UTF-8 string. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates. (convertUTF8OffsetsToUTF16Offsets): New. Converts UTF-8 offsets to UTF-16 offsets, given a UTF-8 string. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates.
  • fixed 3381296 -- regular expression matches with UTF-16 surrogates will treat sequences as two characters
  • kjs/regexp.cpp: (RegExp::RegExp): Use the new UString::UTF8String function instead of a function in this file. (RegExp::match): Use the new convertUTF16OffsetsToUTF8Offsets (and the corresponding reverse) instead of convertCharacterOffsetsToUTF8ByteOffsets in this file.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/kjs/function.cpp

    r3373 r4837  
    409409}
    410410
     411static Value encode(ExecState *exec, const List &args, const char *do_not_escape)
     412{
     413  UString r = "", s, str = args[0].toString(exec);
     414  CString cstr = str.UTF8String();
     415  const char *p = cstr.c_str();
     416  for (int k = 0; k < cstr.size(); k++, p++) {
     417    char c = *p;
     418    if (c && strchr(do_not_escape, c)) {
     419      r.append(c);
     420    } else {
     421      char tmp[4];
     422      sprintf(tmp, "%%%02X", (unsigned char)c);
     423      r += tmp;
     424    }
     425  }
     426  return String(r);
     427}
     428
     429static Value decode(ExecState *exec, const List &args, const char *do_not_unescape, bool strict)
     430{
     431  UString s = "", str = args[0].toString(exec);
     432  int k = 0, len = str.size();
     433  const UChar *d = str.data();
     434  UChar u;
     435  while (k < len) {
     436    const UChar *p = d + k;
     437    UChar c = *p;
     438    if (c == '%') {
     439      int charLen = 0;
     440      if (k <= len - 3 && isxdigit(p[1].uc) && isxdigit(p[2].uc)) {
     441        const char b0 = Lexer::convertHex(p[1].uc, p[2].uc);
     442        const int sequenceLen = UTF8SequenceLength(b0);
     443        if (sequenceLen != 0 && k <= len - sequenceLen * 3) {
     444          charLen = sequenceLen * 3;
     445          char sequence[5];
     446          sequence[0] = b0;
     447          for (int i = 1; i < sequenceLen; ++i) {
     448            const UChar *q = p + i * 3;
     449            if (q[0] == '%' && isxdigit(q[1].uc) && isxdigit(q[2].uc))
     450              sequence[i] = Lexer::convertHex(q[1].uc, q[2].uc);
     451            else {
     452              charLen = 0;
     453              break;
     454            }
     455          }
     456          if (charLen != 0) {
     457            sequence[sequenceLen] = 0;
     458            const int character = decodeUTF8Sequence(sequence);
     459            if (character < 0 || character >= 0x110000) {
     460              charLen = 0;
     461            } else if (character >= 0x10000) {
     462              // Convert to surrogate pair.
     463              s.append(static_cast<unsigned short>(0xD800 | ((character - 0x10000) >> 10)));
     464              u = static_cast<unsigned short>(0xDC00 | ((character - 0x10000) & 0x3FF));
     465            } else {
     466              u = static_cast<unsigned short>(character);
     467            }
     468          }
     469        }
     470      }
     471      if (charLen == 0) {
     472        if (strict) {
     473          Object error = Error::create(exec, URIError);
     474          exec->setException(error);
     475          return error;
     476        }
     477        // The only case where we don't use "strict" mode is the "unescape" function.
     478        // For that, it's good to support the wonky "%u" syntax for compatibility with WinIE.
     479        if (k <= len - 6 && p[1] == 'u'
     480            && isxdigit(p[2].uc) && isxdigit(p[3].uc)
     481            && isxdigit(p[4].uc) && isxdigit(p[5].uc)) {
     482          charLen = 6;
     483          u = Lexer::convertUnicode(p[2].uc, p[3].uc, p[4].uc, p[5].uc);
     484        }
     485      }
     486      if (charLen && (u.uc == 0 || u.uc >= 128 || !strchr(do_not_unescape, u.low()))) {
     487        c = u;
     488        k += charLen - 1;
     489      }
     490    }
     491    k++;
     492    s.append(c);
     493  }
     494  return String(s);
     495}
     496
    411497Value GlobalFuncImp::call(ExecState *exec, Object &/*thisObj*/, const List &args)
    412498{
    413499  Value res;
    414500
    415   static const char non_escape[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    416                                    "abcdefghijklmnopqrstuvwxyz"
    417                                    "0123456789@*_+-./";
     501  static const char do_not_escape[] =
     502    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     503    "abcdefghijklmnopqrstuvwxyz"
     504    "0123456789"
     505    "*+-./@_";
     506  static const char do_not_escape_when_encoding_URI_component[] =
     507    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     508    "abcdefghijklmnopqrstuvwxyz"
     509    "0123456789"
     510    "!'()*-._~";
     511  static const char do_not_escape_when_encoding_URI[] =
     512    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     513    "abcdefghijklmnopqrstuvwxyz"
     514    "0123456789"
     515    "!#$&'()*+,-./:;=?@_~";
     516  static const char do_not_unescape_when_decoding_URI[] =
     517    "#$&+,/:;=?@";
    418518
    419519  switch (id) {
     
    503603    break;
    504604  }
    505   case Escape: {
    506     UString r = "", s, str = args[0].toString(exec);
    507     const UChar *c = str.data();
    508     for (int k = 0; k < str.size(); k++, c++) {
    509       int u = c->uc;
    510       if (u > 255) {
    511         char tmp[7];
    512         sprintf(tmp, "%%u%04X", u);
    513         s = UString(tmp);
    514       } else if (strchr(non_escape, (char)u)) {
    515         s = UString(c, 1);
    516       } else {
    517         char tmp[4];
    518         sprintf(tmp, "%%%02X", u);
    519         s = UString(tmp);
    520       }
    521       r += s;
    522     }
    523     res = String(r);
    524     break;
    525   }
    526   case UnEscape: {
    527     UString s, str = args[0].toString(exec);
    528     int k = 0, len = str.size();
    529     UChar u;
    530     while (k < len) {
    531       const UChar *c = str.data() + k;
    532       if (*c == UChar('%') && k <= len - 6 && *(c+1) == UChar('u')) {
    533         u = Lexer::convertUnicode((c+2)->uc, (c+3)->uc,
    534                                   (c+4)->uc, (c+5)->uc);
    535         c = &u;
    536         k += 5;
    537       } else if (*c == UChar('%') && k <= len - 3) {
    538         u = UChar(Lexer::convertHex((c+1)->uc, (c+2)->uc));
    539         c = &u;
    540         k += 2;
    541       }
    542       k++;
    543       s += UString(c, 1);
    544     }
    545     res = String(s);
    546     break;
    547   }
     605  case DecodeURI:
     606    res = decode(exec, args, do_not_unescape_when_decoding_URI, true);
     607    break;
     608  case DecodeURIComponent:
     609    res = decode(exec, args, "", true);
     610    break;
     611  case EncodeURI:
     612    res = encode(exec, args, do_not_escape_when_encoding_URI);
     613    break;
     614  case EncodeURIComponent:
     615    res = encode(exec, args, do_not_escape_when_encoding_URI_component);
     616    break;
     617  case Escape:
     618    res = encode(exec, args, do_not_escape);
     619    break;
     620  case UnEscape:
     621    res = decode(exec, args, "", false);
     622    break;
    548623#ifndef NDEBUG
    549   case KJSPrint: {
    550     UString str = args[0].toString(exec);
    551     puts(str.ascii());
    552   }
     624  case KJSPrint:
     625    puts(args[0].toString(exec).ascii());
     626    break;
    553627#endif
    554628  }
Note: See TracChangeset for help on using the changeset viewer.