Changeset 4837 in webkit for trunk/JavaScriptCore/kjs/ustring.cpp
- Timestamp:
- Aug 18, 2003, 11:51:25 AM (22 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/kjs/ustring.cpp
r4792 r4837 43 43 44 44 namespace KJS { 45 extern const double NaN; 46 extern const double Inf; 47 }; 48 49 using namespace KJS; 45 46 extern const double NaN; 47 extern const double Inf; 50 48 51 49 CString::CString(const char *c) 52 50 { 53 data = new char[strlen(c)+1]; 51 length = strlen(c); 52 data = new char[length+1]; 54 53 strcpy(data, c); 55 54 } 56 55 56 CString::CString(const char *c, int len) 57 { 58 length = len; 59 data = new char[len+1]; 60 memcpy(data, c, len); 61 data[len] = 0; 62 } 63 57 64 CString::CString(const CString &b) 58 65 { 59 data = new char[b.size()+1]; 60 strcpy(data, b.c_str()); 66 length = b.length; 67 data = new char[length+1]; 68 memcpy(data, b.data, length); 61 69 } 62 70 … … 69 77 { 70 78 char *n; 71 if (data) { 72 n = new char[strlen(data)+t.size()+1]; 73 strcpy(n, data); 74 } else { 75 n = new char[t.size()+1]; 76 n[0] = '\0'; 77 } 78 strcat(n, t.c_str()); 79 n = new char[length+t.length+1]; 80 if (length) 81 memcpy(n, data, length); 82 if (t.length) 83 memcpy(n+length, t.data, t.length); 84 length += t.length; 85 n[length] = 0; 79 86 80 87 delete [] data; … … 88 95 if (data) 89 96 delete [] data; 90 data = new char[strlen(c)+1]; 97 length = strlen(c); 98 data = new char[length+1]; 91 99 strcpy(data, c); 92 100 … … 101 109 if (data) 102 110 delete [] data; 103 data = new char[str.size()+1]; 104 strcpy(data, str.c_str()); 111 length = str.length; 112 data = new char[length + 1]; 113 memcpy(data, str.data, length + 1); 105 114 106 115 return *this; 107 116 } 108 117 109 int CString::size() const110 {111 return strlen(data);112 }113 114 118 bool KJS::operator==(const KJS::CString& c1, const KJS::CString& c2) 115 119 { 116 return (strcmp(c1.c_str(), c2.c_str()) == 0); 120 int len = c1.size(); 121 return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0); 117 122 } 118 123 … … 464 469 memcpy(n, data(), l * sizeof(UChar)); 465 470 memcpy(n+l, t.data(), tLen * sizeof(UChar)); 471 release(); 472 rep = Rep::create(n, newLen); 473 rep->capacity = newCapacity; 474 475 return *this; 476 } 477 478 UString &UString::append(const char *t) 479 { 480 int l = size(); 481 int tLen = strlen(t); 482 int newLen = l + tLen; 483 if (rep->rc == 1 && newLen <= rep->capacity) { 484 for (int i = 0; i < tLen; ++i) 485 rep->dat[l+i] = t[i]; 486 rep->len = newLen; 487 rep->_hash = 0; 488 return *this; 489 } 490 491 int newCapacity = (newLen * 3 + 1) / 2; 492 UChar *n = new UChar[newCapacity]; 493 memcpy(n, data(), l * sizeof(UChar)); 494 for (int i = 0; i < tLen; ++i) 495 n[l+i] = t[i]; 496 release(); 497 rep = Rep::create(n, newLen); 498 rep->capacity = newCapacity; 499 500 return *this; 501 } 502 503 UString &UString::append(unsigned short c) 504 { 505 int l = size(); 506 int newLen = l + 1; 507 if (rep->rc == 1 && newLen <= rep->capacity) { 508 rep->dat[l] = c; 509 rep->len = newLen; 510 rep->_hash = 0; 511 return *this; 512 } 513 514 int newCapacity = (newLen * 3 + 1) / 2; 515 UChar *n = new UChar[newCapacity]; 516 memcpy(n, data(), l * sizeof(UChar)); 517 n[l] = c; 466 518 release(); 467 519 rep = Rep::create(n, newLen); … … 895 947 return (l1 < l2) ? 1 : -1; 896 948 } 949 950 // Given a first byte, gives the length of the UTF-8 sequence it begins. 951 // Returns 0 for bytes that are not legal starts of UTF-8 sequences. 952 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). 953 int UTF8SequenceLength(char b0) 954 { 955 if ((b0 & 0x80) == 0) 956 return 1; 957 if ((b0 & 0xC0) != 0xC0) 958 return 0; 959 if ((b0 & 0xE0) == 0xC0) 960 return 2; 961 if ((b0 & 0xF0) == 0xE0) 962 return 3; 963 if ((b0 & 0xF8) == 0xF0) 964 return 4; 965 return 0; 966 } 967 968 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. 969 // Only allows Unicode characters (U-00000000 to U-0010FFFF). 970 // Returns -1 if the sequence is not valid (including presence of extra bytes). 971 int decodeUTF8Sequence(const char *sequence) 972 { 973 // Handle 0-byte sequences (never valid). 974 const unsigned char b0 = sequence[0]; 975 const int length = UTF8SequenceLength(b0); 976 if (length == 0) 977 return -1; 978 979 // Handle 1-byte sequences (plain ASCII). 980 const unsigned char b1 = sequence[1]; 981 if (length == 1) { 982 if (b1) 983 return -1; 984 return b0; 985 } 986 987 // Handle 2-byte sequences. 988 if ((b1 & 0xC0) != 0x80) 989 return -1; 990 const unsigned char b2 = sequence[2]; 991 if (length == 2) { 992 if (b2) 993 return -1; 994 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); 995 if (c < 0x80) 996 return -1; 997 return c; 998 } 999 1000 // Handle 3-byte sequences. 1001 if ((b2 & 0xC0) != 0x80) 1002 return -1; 1003 const unsigned char b3 = sequence[3]; 1004 if (length == 3) { 1005 if (b3) 1006 return -1; 1007 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); 1008 if (c < 0x800) 1009 return -1; 1010 // UTF-16 surrogates should never appear in UTF-8 data. 1011 if (c >= 0xD800 && c <= 0xDFFF) 1012 return -1; 1013 // Backwards BOM and U+FFFF should never appear in UTF-8 data. 1014 if (c == 0xFFFE || c == 0xFFFF) 1015 return -1; 1016 return c; 1017 } 1018 1019 // Handle 4-byte sequences. 1020 if ((b3 & 0xC0) != 0x80) 1021 return -1; 1022 const unsigned char b4 = sequence[4]; 1023 if (length == 4) { 1024 if (b4) 1025 return -1; 1026 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); 1027 if (c < 0x10000 || c > 0x10FFFF) 1028 return -1; 1029 return c; 1030 } 1031 1032 return -1; 1033 } 1034 1035 CString UString::UTF8String() const 1036 { 1037 // Allocate a buffer big enough to hold all the characters. 1038 const int length = size(); 1039 const unsigned bufferSize = length * 3; 1040 char fixedSizeBuffer[1024]; 1041 char *buffer; 1042 if (bufferSize > sizeof(fixedSizeBuffer)) { 1043 buffer = new char [bufferSize]; 1044 } else { 1045 buffer = fixedSizeBuffer; 1046 } 1047 1048 // Convert to runs of 8-bit characters. 1049 char *p = buffer; 1050 const UChar *d = data(); 1051 for (int i = 0; i != length; ++i) { 1052 unsigned short c = d[i].unicode(); 1053 if (c < 0x80) { 1054 *p++ = (char)c; 1055 } else if (c < 0x800) { 1056 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 1057 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 1058 } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+2].uc <= 0xDFFF) { 1059 unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF)); 1060 *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8 1061 *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set 1062 *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 1063 *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set 1064 ++i; 1065 } else { 1066 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 1067 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 1068 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 1069 } 1070 } 1071 1072 // Return the result as a C string. 1073 CString result(buffer, p - buffer); 1074 if (buffer != fixedSizeBuffer) { 1075 delete [] buffer; 1076 } 1077 return result; 1078 } 1079 1080 struct StringOffset { 1081 int offset; 1082 int locationInOffsetsArray; 1083 }; 1084 1085 static int compareStringOffsets(const void *a, const void *b) 1086 { 1087 const StringOffset *oa = static_cast<const StringOffset *>(a); 1088 const StringOffset *ob = static_cast<const StringOffset *>(b); 1089 1090 if (oa->offset < ob->offset) { 1091 return -1; 1092 } 1093 if (oa->offset > ob->offset) { 1094 return +1; 1095 } 1096 return 0; 1097 } 1098 1099 const int sortedOffsetsFixedBufferSize = 128; 1100 1101 static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets, 1102 StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize]) 1103 { 1104 // Allocate the sorted offsets. 1105 StringOffset *sortedOffsets; 1106 if (numOffsets <= sortedOffsetsFixedBufferSize) { 1107 sortedOffsets = sortedOffsetsFixedBuffer; 1108 } else { 1109 sortedOffsets = new StringOffset [numOffsets]; 1110 } 1111 1112 // Copy offsets. 1113 for (int i = 0; i != numOffsets; ++i) { 1114 sortedOffsets[i].offset = offsets[i]; 1115 sortedOffsets[i].locationInOffsetsArray = i; 1116 } 1117 1118 // Sort them. 1119 qsort(sortedOffsets, numOffsets, sizeof(StringOffset), compareStringOffsets); 1120 1121 return sortedOffsets; 1122 } 1123 1124 // Note: This function assumes valid UTF-8. 1125 // It can even go into an infinite loop if the passed in string is not valid UTF-8. 1126 void convertUTF16OffsetsToUTF8Offsets(const char *s, int *offsets, int numOffsets) 1127 { 1128 // Allocate buffer. 1129 StringOffset fixedBuffer[sortedOffsetsFixedBufferSize]; 1130 StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer); 1131 1132 // Walk through sorted offsets and string, adjusting all the offests. 1133 // Offsets that are off the ends of the string map to the edges of the string. 1134 int UTF16Offset = 0; 1135 const char *p = s; 1136 for (int oi = 0; oi != numOffsets; ++oi) { 1137 const int nextOffset = sortedOffsets[oi].offset; 1138 while (*p && UTF16Offset < nextOffset) { 1139 // Skip to the next character. 1140 const int sequenceLength = UTF8SequenceLength(*p); 1141 assert(sequenceLength >= 1 && sequenceLength <= 4); 1142 p += sequenceLength; 1143 // Characters that take a 4 byte sequence in UTF-8 take two bytes in UTF-16. 1144 UTF16Offset += sequenceLength < 4 ? 1 : 2; 1145 } 1146 offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s; 1147 } 1148 1149 // Free buffer. 1150 if (sortedOffsets != fixedBuffer) { 1151 delete [] sortedOffsets; 1152 } 1153 } 1154 1155 // Note: This function assumes valid UTF-8. 1156 // It can even go into an infinite loop if the passed in string is not valid UTF-8. 1157 void convertUTF8OffsetsToUTF16Offsets(const char *s, int *offsets, int numOffsets) 1158 { 1159 // Allocate buffer. 1160 StringOffset fixedBuffer[sortedOffsetsFixedBufferSize]; 1161 StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer); 1162 1163 // Walk through sorted offsets and string, adjusting all the offests. 1164 // Offsets that are off the end of the string map to the edges of the string. 1165 int UTF16Offset = 0; 1166 const char *p = s; 1167 for (int oi = 0; oi != numOffsets; ++oi) { 1168 const int nextOffset = sortedOffsets[oi].offset; 1169 while (*p && (p - s) < nextOffset) { 1170 // Skip to the next character. 1171 const int sequenceLength = UTF8SequenceLength(*p); 1172 assert(sequenceLength >= 1 && sequenceLength <= 4); 1173 p += sequenceLength; 1174 // Characters that take a 4 byte sequence in UTF-8 take two bytes in UTF-16. 1175 UTF16Offset += sequenceLength < 4 ? 1 : 2; 1176 } 1177 offsets[sortedOffsets[oi].locationInOffsetsArray] = UTF16Offset; 1178 } 1179 1180 // Free buffer. 1181 if (sortedOffsets != fixedBuffer) { 1182 delete [] sortedOffsets; 1183 } 1184 } 1185 1186 } // namespace KJS
Note:
See TracChangeset
for help on using the changeset viewer.