Changeset 4837 in webkit for trunk/JavaScriptCore/kjs/ustring.cpp


Ignore:
Timestamp:
Aug 18, 2003, 11:51:25 AM (22 years ago)
Author:
darin
Message:

Reviewed by Maciej.

  • fixed 3247528 -- encodeURI missing from JavaScriptCore (needed by Crystal Reports)
  • fixed 3381297 -- escape method does not escape the null character
  • fixed 3381299 -- escape method produces incorrect escape sequences ala WinIE, rather than correct ala Gecko
  • fixed 3381303 -- unescape method treats escape sequences as Latin-1 ala WinIE rather than as UTF-8 ala Gecko
  • fixed 3381304 -- unescape method garbles strings with bad escape sequences in them
  • kjs/function.h: Added constants for decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent.
  • kjs/function.cpp: (encode): Added. New helper function for escape, encodeURI, and encodeURIComponent. (decode): Added. New helper function for unescape, decodeURI, and decodeURIComponent. (GlobalFuncImp::call): Added decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent implementations. Changed escape and unescape to use new helper functions, which fixes the four problems above.
  • kjs/internal.cpp: (InterpreterImp::initGlobalObject): Add decodeURI, decodeURIComponent, encodeURI, and encodeURIComponent to the global object.
  • kjs/ustring.h: Added a length to the CString class so it can hold strings with null characters in them, not just null-terminated strings. This allows a null character from a UString to survive the process of UTF-16 to UTF-8 conversion. Added overloads to UString::append, UString::UTF8String, UTF8SequenceLength, decodeUTF8Sequence, convertUTF16OffsetsToUTF8Offsets, and convertUTF8OffsetsToUTF16Offsets.
  • kjs/ustring.cpp: (CString::CString): Set up the length properly in all the constructors. Also add a new constructor that takes a length. (CString::append): Use and set the length properly. (CString::operator=): Use and set the length properly. (operator==): Use the length and memcmp instead of strcmp. (UString::append): Added new overloads for const char * and for a single string to make it more efficient to build up a UString from pieces. The old way, a UString was created and destroyed each time you appended. (UTF8SequenceLength): New. Helper for decoding UTF-8. (decodeUTF8Sequence): New. Helper for decoding UTF-8. (UString::UTF8String): New. Converts from UTF-16 to UTF-8. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates. (compareStringOffsets): Moved from regexp.cpp. (createSortedOffsetsArray): Moved from regexp.cpp. (convertUTF16OffsetsToUTF8Offsets): New. Converts UTF-16 offsets to UTF-8 offsets, given a UTF-8 string. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates. (convertUTF8OffsetsToUTF16Offsets): New. Converts UTF-8 offsets to UTF-16 offsets, given a UTF-8 string. Same as the function that was in regexp.cpp, except has proper handling for UTF-16 surrogates.
  • fixed 3381296 -- regular expression matches with UTF-16 surrogates will treat sequences as two characters
  • kjs/regexp.cpp: (RegExp::RegExp): Use the new UString::UTF8String function instead of a function in this file. (RegExp::match): Use the new convertUTF16OffsetsToUTF8Offsets (and the corresponding reverse) instead of convertCharacterOffsetsToUTF8ByteOffsets in this file.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/kjs/ustring.cpp

    r4792 r4837  
    4343
    4444namespace KJS {
    45   extern const double NaN;
    46   extern const double Inf;
    47 };
    48 
    49 using namespace KJS;
     45
     46extern const double NaN;
     47extern const double Inf;
    5048
    5149CString::CString(const char *c)
    5250{
    53   data = new char[strlen(c)+1];
     51  length = strlen(c);
     52  data = new char[length+1];
    5453  strcpy(data, c);
    5554}
    5655
     56CString::CString(const char *c, int len)
     57{
     58  length = len;
     59  data = new char[len+1];
     60  memcpy(data, c, len);
     61  data[len] = 0;
     62}
     63
    5764CString::CString(const CString &b)
    5865{
    59   data = new char[b.size()+1];
    60   strcpy(data, b.c_str());
     66  length = b.length;
     67  data = new char[length+1];
     68  memcpy(data, b.data, length);
    6169}
    6270
     
    6977{
    7078  char *n;
    71   if (data) {
    72     n = new char[strlen(data)+t.size()+1];
    73     strcpy(n, data);
    74   } else {
    75     n = new char[t.size()+1];
    76     n[0] = '\0';
    77   }
    78   strcat(n, t.c_str());
     79  n = new char[length+t.length+1];
     80  if (length)
     81    memcpy(n, data, length);
     82  if (t.length)
     83    memcpy(n+length, t.data, t.length);
     84  length += t.length;
     85  n[length] = 0;
    7986
    8087  delete [] data;
     
    8895  if (data)
    8996    delete [] data;
    90   data = new char[strlen(c)+1];
     97  length = strlen(c);
     98  data = new char[length+1];
    9199  strcpy(data, c);
    92100
     
    101109  if (data)
    102110    delete [] data;
    103   data = new char[str.size()+1];
    104   strcpy(data, str.c_str());
     111  length = str.length;
     112  data = new char[length + 1];
     113  memcpy(data, str.data, length + 1);
    105114
    106115  return *this;
    107116}
    108117
    109 int CString::size() const
    110 {
    111   return strlen(data);
    112 }
    113 
    114118bool KJS::operator==(const KJS::CString& c1, const KJS::CString& c2)
    115119{
    116   return (strcmp(c1.c_str(), c2.c_str()) == 0);
     120  int len = c1.size();
     121  return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0);
    117122}
    118123
     
    464469  memcpy(n, data(), l * sizeof(UChar));
    465470  memcpy(n+l, t.data(), tLen * sizeof(UChar));
     471  release();
     472  rep = Rep::create(n, newLen);
     473  rep->capacity = newCapacity;
     474
     475  return *this;
     476}
     477
     478UString &UString::append(const char *t)
     479{
     480  int l = size();
     481  int tLen = strlen(t);
     482  int newLen = l + tLen;
     483  if (rep->rc == 1 && newLen <= rep->capacity) {
     484    for (int i = 0; i < tLen; ++i)
     485      rep->dat[l+i] = t[i];
     486    rep->len = newLen;
     487    rep->_hash = 0;
     488    return *this;
     489  }
     490 
     491  int newCapacity = (newLen * 3 + 1) / 2;
     492  UChar *n = new UChar[newCapacity];
     493  memcpy(n, data(), l * sizeof(UChar));
     494  for (int i = 0; i < tLen; ++i)
     495    n[l+i] = t[i];
     496  release();
     497  rep = Rep::create(n, newLen);
     498  rep->capacity = newCapacity;
     499
     500  return *this;
     501}
     502
     503UString &UString::append(unsigned short c)
     504{
     505  int l = size();
     506  int newLen = l + 1;
     507  if (rep->rc == 1 && newLen <= rep->capacity) {
     508    rep->dat[l] = c;
     509    rep->len = newLen;
     510    rep->_hash = 0;
     511    return *this;
     512  }
     513 
     514  int newCapacity = (newLen * 3 + 1) / 2;
     515  UChar *n = new UChar[newCapacity];
     516  memcpy(n, data(), l * sizeof(UChar));
     517  n[l] = c;
    466518  release();
    467519  rep = Rep::create(n, newLen);
     
    895947  return (l1 < l2) ? 1 : -1;
    896948}
     949
     950// Given a first byte, gives the length of the UTF-8 sequence it begins.
     951// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
     952// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
     953int UTF8SequenceLength(char b0)
     954{
     955  if ((b0 & 0x80) == 0)
     956    return 1;
     957  if ((b0 & 0xC0) != 0xC0)
     958    return 0;
     959  if ((b0 & 0xE0) == 0xC0)
     960    return 2;
     961  if ((b0 & 0xF0) == 0xE0)
     962    return 3;
     963  if ((b0 & 0xF8) == 0xF0)
     964    return 4;
     965  return 0;
     966}
     967
     968// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
     969// Only allows Unicode characters (U-00000000 to U-0010FFFF).
     970// Returns -1 if the sequence is not valid (including presence of extra bytes).
     971int decodeUTF8Sequence(const char *sequence)
     972{
     973  // Handle 0-byte sequences (never valid).
     974  const unsigned char b0 = sequence[0];
     975  const int length = UTF8SequenceLength(b0);
     976  if (length == 0)
     977    return -1;
     978
     979  // Handle 1-byte sequences (plain ASCII).
     980  const unsigned char b1 = sequence[1];
     981  if (length == 1) {
     982    if (b1)
     983      return -1;
     984    return b0;
     985  }
     986
     987  // Handle 2-byte sequences.
     988  if ((b1 & 0xC0) != 0x80)
     989    return -1;
     990  const unsigned char b2 = sequence[2];
     991  if (length == 2) {
     992    if (b2)
     993      return -1;
     994    const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
     995    if (c < 0x80)
     996      return -1;
     997    return c;
     998  }
     999
     1000  // Handle 3-byte sequences.
     1001  if ((b2 & 0xC0) != 0x80)
     1002    return -1;
     1003  const unsigned char b3 = sequence[3];
     1004  if (length == 3) {
     1005    if (b3)
     1006      return -1;
     1007    const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
     1008    if (c < 0x800)
     1009      return -1;
     1010    // UTF-16 surrogates should never appear in UTF-8 data.
     1011    if (c >= 0xD800 && c <= 0xDFFF)
     1012      return -1;
     1013    // Backwards BOM and U+FFFF should never appear in UTF-8 data.
     1014    if (c == 0xFFFE || c == 0xFFFF)
     1015      return -1;
     1016    return c;
     1017  }
     1018
     1019  // Handle 4-byte sequences.
     1020  if ((b3 & 0xC0) != 0x80)
     1021    return -1;
     1022  const unsigned char b4 = sequence[4];
     1023  if (length == 4) {
     1024    if (b4)
     1025      return -1;
     1026    const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
     1027    if (c < 0x10000 || c > 0x10FFFF)
     1028      return -1;
     1029    return c;
     1030  }
     1031
     1032  return -1;
     1033}
     1034
     1035CString UString::UTF8String() const
     1036{
     1037  // Allocate a buffer big enough to hold all the characters.
     1038  const int length = size();
     1039  const unsigned bufferSize = length * 3;
     1040  char fixedSizeBuffer[1024];
     1041  char *buffer;
     1042  if (bufferSize > sizeof(fixedSizeBuffer)) {
     1043    buffer = new char [bufferSize];
     1044  } else {
     1045    buffer = fixedSizeBuffer;
     1046  }
     1047
     1048  // Convert to runs of 8-bit characters.
     1049  char *p = buffer;
     1050  const UChar *d = data();
     1051  for (int i = 0; i != length; ++i) {
     1052    unsigned short c = d[i].unicode();
     1053    if (c < 0x80) {
     1054      *p++ = (char)c;
     1055    } else if (c < 0x800) {
     1056      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
     1057      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
     1058    } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+2].uc <= 0xDFFF) {
     1059      unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
     1060      *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
     1061      *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
     1062      *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
     1063      *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
     1064      ++i;
     1065    } else {
     1066      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
     1067      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
     1068      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
     1069    }
     1070  }
     1071
     1072  // Return the result as a C string.
     1073  CString result(buffer, p - buffer);
     1074  if (buffer != fixedSizeBuffer) {
     1075    delete [] buffer;
     1076  }
     1077  return result;
     1078}
     1079
     1080struct StringOffset {
     1081    int offset;
     1082    int locationInOffsetsArray;
     1083};
     1084
     1085static int compareStringOffsets(const void *a, const void *b)
     1086{
     1087    const StringOffset *oa = static_cast<const StringOffset *>(a);
     1088    const StringOffset *ob = static_cast<const StringOffset *>(b);
     1089   
     1090    if (oa->offset < ob->offset) {
     1091        return -1;
     1092    }
     1093    if (oa->offset > ob->offset) {
     1094        return +1;
     1095    }
     1096    return 0;
     1097}
     1098
     1099const int sortedOffsetsFixedBufferSize = 128;
     1100
     1101static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets,
     1102    StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize])
     1103{
     1104    // Allocate the sorted offsets.
     1105    StringOffset *sortedOffsets;
     1106    if (numOffsets <= sortedOffsetsFixedBufferSize) {
     1107        sortedOffsets = sortedOffsetsFixedBuffer;
     1108    } else {
     1109        sortedOffsets = new StringOffset [numOffsets];
     1110    }
     1111
     1112    // Copy offsets.
     1113    for (int i = 0; i != numOffsets; ++i) {
     1114        sortedOffsets[i].offset = offsets[i];
     1115        sortedOffsets[i].locationInOffsetsArray = i;
     1116    }
     1117
     1118    // Sort them.
     1119    qsort(sortedOffsets, numOffsets, sizeof(StringOffset), compareStringOffsets);
     1120
     1121    return sortedOffsets;
     1122}
     1123
     1124// Note: This function assumes valid UTF-8.
     1125// It can even go into an infinite loop if the passed in string is not valid UTF-8.
     1126void convertUTF16OffsetsToUTF8Offsets(const char *s, int *offsets, int numOffsets)
     1127{
     1128    // Allocate buffer.
     1129    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
     1130    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
     1131
     1132    // Walk through sorted offsets and string, adjusting all the offests.
     1133    // Offsets that are off the ends of the string map to the edges of the string.
     1134    int UTF16Offset = 0;
     1135    const char *p = s;
     1136    for (int oi = 0; oi != numOffsets; ++oi) {
     1137        const int nextOffset = sortedOffsets[oi].offset;
     1138        while (*p && UTF16Offset < nextOffset) {
     1139            // Skip to the next character.
     1140            const int sequenceLength = UTF8SequenceLength(*p);
     1141            assert(sequenceLength >= 1 && sequenceLength <= 4);
     1142            p += sequenceLength;
     1143            // Characters that take a 4 byte sequence in UTF-8 take two bytes in UTF-16.
     1144            UTF16Offset += sequenceLength < 4 ? 1 : 2;
     1145        }
     1146        offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s;
     1147    }
     1148
     1149    // Free buffer.
     1150    if (sortedOffsets != fixedBuffer) {
     1151        delete [] sortedOffsets;
     1152    }
     1153}
     1154
     1155// Note: This function assumes valid UTF-8.
     1156// It can even go into an infinite loop if the passed in string is not valid UTF-8.
     1157void convertUTF8OffsetsToUTF16Offsets(const char *s, int *offsets, int numOffsets)
     1158{
     1159    // Allocate buffer.
     1160    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
     1161    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
     1162
     1163    // Walk through sorted offsets and string, adjusting all the offests.
     1164    // Offsets that are off the end of the string map to the edges of the string.
     1165    int UTF16Offset = 0;
     1166    const char *p = s;
     1167    for (int oi = 0; oi != numOffsets; ++oi) {
     1168        const int nextOffset = sortedOffsets[oi].offset;
     1169        while (*p && (p - s) < nextOffset) {
     1170            // Skip to the next character.
     1171            const int sequenceLength = UTF8SequenceLength(*p);
     1172            assert(sequenceLength >= 1 && sequenceLength <= 4);
     1173            p += sequenceLength;
     1174            // Characters that take a 4 byte sequence in UTF-8 take two bytes in UTF-16.
     1175            UTF16Offset += sequenceLength < 4 ? 1 : 2;
     1176        }
     1177        offsets[sortedOffsets[oi].locationInOffsetsArray] = UTF16Offset;
     1178    }
     1179
     1180    // Free buffer.
     1181    if (sortedOffsets != fixedBuffer) {
     1182        delete [] sortedOffsets;
     1183    }
     1184}
     1185
     1186} // namespace KJS
Note: See TracChangeset for help on using the changeset viewer.