1 | /*
|
---|
2 | * Copyright (C) 1999-2000 Harri Porten ([email protected])
|
---|
3 | * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
|
---|
4 | * Copyright (C) 2007 Cameron Zwarich ([email protected])
|
---|
5 | * Copyright (C) 2009 Google Inc. All rights reserved.
|
---|
6 | *
|
---|
7 | * This library is free software; you can redistribute it and/or
|
---|
8 | * modify it under the terms of the GNU Library General Public
|
---|
9 | * License as published by the Free Software Foundation; either
|
---|
10 | * version 2 of the License, or (at your option) any later version.
|
---|
11 | *
|
---|
12 | * This library is distributed in the hope that it will be useful,
|
---|
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
---|
15 | * Library General Public License for more details.
|
---|
16 | *
|
---|
17 | * You should have received a copy of the GNU Library General Public License
|
---|
18 | * along with this library; see the file COPYING.LIB. If not, write to
|
---|
19 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
---|
20 | * Boston, MA 02110-1301, USA.
|
---|
21 | *
|
---|
22 | */
|
---|
23 |
|
---|
24 | #include "config.h"
|
---|
25 | #include "UString.h"
|
---|
26 |
|
---|
27 | #include "JSGlobalObjectFunctions.h"
|
---|
28 | #include "Collector.h"
|
---|
29 | #include "dtoa.h"
|
---|
30 | #include "Identifier.h"
|
---|
31 | #include "Operations.h"
|
---|
32 | #include <ctype.h>
|
---|
33 | #include <limits.h>
|
---|
34 | #include <limits>
|
---|
35 | #include <stdio.h>
|
---|
36 | #include <stdlib.h>
|
---|
37 | #include <wtf/ASCIICType.h>
|
---|
38 | #include <wtf/Assertions.h>
|
---|
39 | #include <wtf/MathExtras.h>
|
---|
40 | #include <wtf/StringExtras.h>
|
---|
41 | #include <wtf/Vector.h>
|
---|
42 | #include <wtf/unicode/UTF8.h>
|
---|
43 |
|
---|
44 | #if HAVE(STRINGS_H)
|
---|
45 | #include <strings.h>
|
---|
46 | #endif
|
---|
47 |
|
---|
48 | using namespace WTF;
|
---|
49 | using namespace WTF::Unicode;
|
---|
50 | using namespace std;
|
---|
51 |
|
---|
52 | namespace JSC {
|
---|
53 |
|
---|
54 | extern const double NaN;
|
---|
55 | extern const double Inf;
|
---|
56 |
|
---|
57 | COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
|
---|
58 |
|
---|
59 | // Construct a string with UTF-16 data.
|
---|
60 | UString::UString(const UChar* characters, unsigned length)
|
---|
61 | : m_impl(characters ? StringImpl::create(characters, length) : 0)
|
---|
62 | {
|
---|
63 | }
|
---|
64 |
|
---|
65 | // Construct a string with UTF-16 data, from a null-terminated source.
|
---|
66 | UString::UString(const UChar* characters)
|
---|
67 | {
|
---|
68 | if (!characters)
|
---|
69 | return;
|
---|
70 |
|
---|
71 | int length = 0;
|
---|
72 | while (characters[length] != UChar(0))
|
---|
73 | ++length;
|
---|
74 |
|
---|
75 | m_impl = StringImpl::create(characters, length);
|
---|
76 | }
|
---|
77 |
|
---|
78 | // Construct a string with latin1 data.
|
---|
79 | UString::UString(const char* characters, unsigned length)
|
---|
80 | : m_impl(characters ? StringImpl::create(characters, length) : 0)
|
---|
81 | {
|
---|
82 | }
|
---|
83 |
|
---|
84 | // Construct a string with latin1 data, from a null-terminated source.
|
---|
85 | UString::UString(const char* characters)
|
---|
86 | : m_impl(characters ? StringImpl::create(characters) : 0)
|
---|
87 | {
|
---|
88 | }
|
---|
89 |
|
---|
90 | UString UString::number(int i)
|
---|
91 | {
|
---|
92 | UChar buf[1 + sizeof(i) * 3];
|
---|
93 | UChar* end = buf + sizeof(buf) / sizeof(UChar);
|
---|
94 | UChar* p = end;
|
---|
95 |
|
---|
96 | if (i == 0)
|
---|
97 | *--p = '0';
|
---|
98 | else if (i == INT_MIN) {
|
---|
99 | char minBuf[1 + sizeof(i) * 3];
|
---|
100 | snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN);
|
---|
101 | return UString(minBuf);
|
---|
102 | } else {
|
---|
103 | bool negative = false;
|
---|
104 | if (i < 0) {
|
---|
105 | negative = true;
|
---|
106 | i = -i;
|
---|
107 | }
|
---|
108 | while (i) {
|
---|
109 | *--p = static_cast<unsigned short>((i % 10) + '0');
|
---|
110 | i /= 10;
|
---|
111 | }
|
---|
112 | if (negative)
|
---|
113 | *--p = '-';
|
---|
114 | }
|
---|
115 |
|
---|
116 | return UString(p, static_cast<unsigned>(end - p));
|
---|
117 | }
|
---|
118 |
|
---|
119 | UString UString::number(long long i)
|
---|
120 | {
|
---|
121 | UChar buf[1 + sizeof(i) * 3];
|
---|
122 | UChar* end = buf + sizeof(buf) / sizeof(UChar);
|
---|
123 | UChar* p = end;
|
---|
124 |
|
---|
125 | if (i == 0)
|
---|
126 | *--p = '0';
|
---|
127 | else if (i == std::numeric_limits<long long>::min()) {
|
---|
128 | char minBuf[1 + sizeof(i) * 3];
|
---|
129 | #if OS(WINDOWS)
|
---|
130 | snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min());
|
---|
131 | #else
|
---|
132 | snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min());
|
---|
133 | #endif
|
---|
134 | return UString(minBuf);
|
---|
135 | } else {
|
---|
136 | bool negative = false;
|
---|
137 | if (i < 0) {
|
---|
138 | negative = true;
|
---|
139 | i = -i;
|
---|
140 | }
|
---|
141 | while (i) {
|
---|
142 | *--p = static_cast<unsigned short>((i % 10) + '0');
|
---|
143 | i /= 10;
|
---|
144 | }
|
---|
145 | if (negative)
|
---|
146 | *--p = '-';
|
---|
147 | }
|
---|
148 |
|
---|
149 | return UString(p, static_cast<unsigned>(end - p));
|
---|
150 | }
|
---|
151 |
|
---|
152 | UString UString::number(unsigned u)
|
---|
153 | {
|
---|
154 | UChar buf[sizeof(u) * 3];
|
---|
155 | UChar* end = buf + sizeof(buf) / sizeof(UChar);
|
---|
156 | UChar* p = end;
|
---|
157 |
|
---|
158 | if (u == 0)
|
---|
159 | *--p = '0';
|
---|
160 | else {
|
---|
161 | while (u) {
|
---|
162 | *--p = static_cast<unsigned short>((u % 10) + '0');
|
---|
163 | u /= 10;
|
---|
164 | }
|
---|
165 | }
|
---|
166 |
|
---|
167 | return UString(p, static_cast<unsigned>(end - p));
|
---|
168 | }
|
---|
169 |
|
---|
170 | UString UString::number(long l)
|
---|
171 | {
|
---|
172 | UChar buf[1 + sizeof(l) * 3];
|
---|
173 | UChar* end = buf + sizeof(buf) / sizeof(UChar);
|
---|
174 | UChar* p = end;
|
---|
175 |
|
---|
176 | if (l == 0)
|
---|
177 | *--p = '0';
|
---|
178 | else if (l == LONG_MIN) {
|
---|
179 | char minBuf[1 + sizeof(l) * 3];
|
---|
180 | snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN);
|
---|
181 | return UString(minBuf);
|
---|
182 | } else {
|
---|
183 | bool negative = false;
|
---|
184 | if (l < 0) {
|
---|
185 | negative = true;
|
---|
186 | l = -l;
|
---|
187 | }
|
---|
188 | while (l) {
|
---|
189 | *--p = static_cast<unsigned short>((l % 10) + '0');
|
---|
190 | l /= 10;
|
---|
191 | }
|
---|
192 | if (negative)
|
---|
193 | *--p = '-';
|
---|
194 | }
|
---|
195 |
|
---|
196 | return UString(p, end - p);
|
---|
197 | }
|
---|
198 |
|
---|
199 | UString UString::number(double d)
|
---|
200 | {
|
---|
201 | DtoaBuffer buffer;
|
---|
202 | unsigned length;
|
---|
203 | doubleToStringInJavaScriptFormat(d, buffer, &length);
|
---|
204 | return UString(buffer, length);
|
---|
205 | }
|
---|
206 |
|
---|
207 | UString UString::substringSharingImpl(unsigned offset, unsigned length) const
|
---|
208 | {
|
---|
209 | // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
|
---|
210 |
|
---|
211 | unsigned stringLength = this->length();
|
---|
212 | offset = min(offset, stringLength);
|
---|
213 | length = min(length, stringLength - offset);
|
---|
214 |
|
---|
215 | if (!offset && length == stringLength)
|
---|
216 | return *this;
|
---|
217 | return UString(StringImpl::create(m_impl, offset, length));
|
---|
218 | }
|
---|
219 |
|
---|
220 | bool operator==(const UString& s1, const char *s2)
|
---|
221 | {
|
---|
222 | if (s2 == 0)
|
---|
223 | return s1.isEmpty();
|
---|
224 |
|
---|
225 | const UChar* u = s1.characters();
|
---|
226 | const UChar* uend = u + s1.length();
|
---|
227 | while (u != uend && *s2) {
|
---|
228 | if (u[0] != (unsigned char)*s2)
|
---|
229 | return false;
|
---|
230 | s2++;
|
---|
231 | u++;
|
---|
232 | }
|
---|
233 |
|
---|
234 | return u == uend && *s2 == 0;
|
---|
235 | }
|
---|
236 |
|
---|
237 | bool operator<(const UString& s1, const UString& s2)
|
---|
238 | {
|
---|
239 | const unsigned l1 = s1.length();
|
---|
240 | const unsigned l2 = s2.length();
|
---|
241 | const unsigned lmin = l1 < l2 ? l1 : l2;
|
---|
242 | const UChar* c1 = s1.characters();
|
---|
243 | const UChar* c2 = s2.characters();
|
---|
244 | unsigned l = 0;
|
---|
245 | while (l < lmin && *c1 == *c2) {
|
---|
246 | c1++;
|
---|
247 | c2++;
|
---|
248 | l++;
|
---|
249 | }
|
---|
250 | if (l < lmin)
|
---|
251 | return (c1[0] < c2[0]);
|
---|
252 |
|
---|
253 | return (l1 < l2);
|
---|
254 | }
|
---|
255 |
|
---|
256 | bool operator>(const UString& s1, const UString& s2)
|
---|
257 | {
|
---|
258 | const unsigned l1 = s1.length();
|
---|
259 | const unsigned l2 = s2.length();
|
---|
260 | const unsigned lmin = l1 < l2 ? l1 : l2;
|
---|
261 | const UChar* c1 = s1.characters();
|
---|
262 | const UChar* c2 = s2.characters();
|
---|
263 | unsigned l = 0;
|
---|
264 | while (l < lmin && *c1 == *c2) {
|
---|
265 | c1++;
|
---|
266 | c2++;
|
---|
267 | l++;
|
---|
268 | }
|
---|
269 | if (l < lmin)
|
---|
270 | return (c1[0] > c2[0]);
|
---|
271 |
|
---|
272 | return (l1 > l2);
|
---|
273 | }
|
---|
274 |
|
---|
275 | CString UString::ascii() const
|
---|
276 | {
|
---|
277 | // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
|
---|
278 | // preserved, characters outside of this range are converted to '?'.
|
---|
279 |
|
---|
280 | unsigned length = this->length();
|
---|
281 | const UChar* characters = this->characters();
|
---|
282 |
|
---|
283 | char* characterBuffer;
|
---|
284 | CString result = CString::newUninitialized(length, characterBuffer);
|
---|
285 |
|
---|
286 | for (unsigned i = 0; i < length; ++i) {
|
---|
287 | UChar ch = characters[i];
|
---|
288 | characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch;
|
---|
289 | }
|
---|
290 |
|
---|
291 | return result;
|
---|
292 | }
|
---|
293 |
|
---|
294 | CString UString::latin1() const
|
---|
295 | {
|
---|
296 | // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
|
---|
297 | // preserved, characters outside of this range are converted to '?'.
|
---|
298 |
|
---|
299 | unsigned length = this->length();
|
---|
300 | const UChar* characters = this->characters();
|
---|
301 |
|
---|
302 | char* characterBuffer;
|
---|
303 | CString result = CString::newUninitialized(length, characterBuffer);
|
---|
304 |
|
---|
305 | for (unsigned i = 0; i < length; ++i) {
|
---|
306 | UChar ch = characters[i];
|
---|
307 | characterBuffer[i] = ch > 0xff ? '?' : ch;
|
---|
308 | }
|
---|
309 |
|
---|
310 | return result;
|
---|
311 | }
|
---|
312 |
|
---|
313 | // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
|
---|
314 | static inline void putUTF8Triple(char*& buffer, UChar ch)
|
---|
315 | {
|
---|
316 | ASSERT(ch >= 0x0800);
|
---|
317 | *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
|
---|
318 | *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
|
---|
319 | *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
|
---|
320 | }
|
---|
321 |
|
---|
322 | CString UString::utf8(bool strict) const
|
---|
323 | {
|
---|
324 | unsigned length = this->length();
|
---|
325 | const UChar* characters = this->characters();
|
---|
326 |
|
---|
327 | // Allocate a buffer big enough to hold all the characters
|
---|
328 | // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
|
---|
329 | // Optimization ideas, if we find this function is hot:
|
---|
330 | // * We could speculatively create a CStringBuffer to contain 'length'
|
---|
331 | // characters, and resize if necessary (i.e. if the buffer contains
|
---|
332 | // non-ascii characters). (Alternatively, scan the buffer first for
|
---|
333 | // ascii characters, so we know this will be sufficient).
|
---|
334 | // * We could allocate a CStringBuffer with an appropriate size to
|
---|
335 | // have a good chance of being able to write the string into the
|
---|
336 | // buffer without reallocing (say, 1.5 x length).
|
---|
337 | if (length > numeric_limits<unsigned>::max() / 3)
|
---|
338 | return CString();
|
---|
339 | Vector<char, 1024> bufferVector(length * 3);
|
---|
340 |
|
---|
341 | char* buffer = bufferVector.data();
|
---|
342 | ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
|
---|
343 | ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
|
---|
344 |
|
---|
345 | // Only produced from strict conversion.
|
---|
346 | if (result == sourceIllegal)
|
---|
347 | return CString();
|
---|
348 |
|
---|
349 | // Check for an unconverted high surrogate.
|
---|
350 | if (result == sourceExhausted) {
|
---|
351 | if (strict)
|
---|
352 | return CString();
|
---|
353 | // This should be one unpaired high surrogate. Treat it the same
|
---|
354 | // was as an unpaired high surrogate would have been handled in
|
---|
355 | // the middle of a string with non-strict conversion - which is
|
---|
356 | // to say, simply encode it to UTF-8.
|
---|
357 | ASSERT((characters + 1) == (this->characters() + length));
|
---|
358 | ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
|
---|
359 | // There should be room left, since one UChar hasn't been converted.
|
---|
360 | ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
|
---|
361 | putUTF8Triple(buffer, *characters);
|
---|
362 | }
|
---|
363 |
|
---|
364 | return CString(bufferVector.data(), buffer - bufferVector.data());
|
---|
365 | }
|
---|
366 |
|
---|
367 | } // namespace JSC
|
---|