source: webkit/trunk/JavaScriptCore/kjs/ustring.cpp@ 13304

Last change on this file since 13304 was 13294, checked in by ggaren, 19 years ago

JavaScriptCore:

  • Fixed <rdar://problem/4478239> string sort puts "closed" before "close"

Reviewed by Eric.

  • kjs/ustring.cpp: (KJS::compare): Inverted a < in order to treat the longer string as > the shorter string.

LayoutTests:

Reviewed by Eric.

Layout test for <rdar://problem/4478239> string sort puts "closed"
before "close"

Also changed the engine to report data types when tests fail, so that
you don't get messages like, "should be A, was A."

Updated results for these files:

  • fast/js/kde/Array-expected.txt:
  • fast/js/kde/RegExp-expected.txt:
  • fast/js/kde/encode_decode_uri-expected.txt:
  • fast/js/resources/js-test-pre.js:

Added these files:

  • fast/js/string-sort-expected.txt: Added.
  • fast/js/string-sort.html: Added.
  • Property allow-tabs set to x
  • Property svn:eol-style set to native
File size: 31.2 KB
Line 
1// -*- c-basic-offset: 2 -*-
2/*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5 * Copyright (C) 2004 Apple Computer, Inc.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
16 *
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
21 *
22 */
23
24#include "config.h"
25#include "ustring.h"
26
27#include <assert.h>
28#include <stdlib.h>
29#include <stdio.h>
30#include <ctype.h>
31#if HAVE(STRING_H)
32#include <string.h>
33#endif
34#if HAVE(STRINGS_H)
35#include <strings.h>
36#endif
37
38#include "operations.h"
39#include "identifier.h"
40#include <math.h>
41#include "dtoa.h"
42
43#include <kxmlcore/Vector.h>
44
45using std::max;
46
47#include <unicode/uchar.h>
48
49namespace KJS {
50
51extern const double NaN;
52extern const double Inf;
53
54CString::CString(const char *c)
55{
56 length = strlen(c);
57 data = new char[length+1];
58 memcpy(data, c, length + 1);
59}
60
61CString::CString(const char *c, int len)
62{
63 length = len;
64 data = new char[len+1];
65 memcpy(data, c, len);
66 data[len] = 0;
67}
68
69CString::CString(const CString &b)
70{
71 length = b.length;
72 if (length > 0 && b.data) {
73 data = new char[length+1];
74 memcpy(data, b.data, length + 1);
75 }
76 else {
77 data = 0;
78 }
79}
80
81CString::~CString()
82{
83 delete [] data;
84}
85
86CString &CString::append(const CString &t)
87{
88 char *n;
89 n = new char[length+t.length+1];
90 if (length)
91 memcpy(n, data, length);
92 if (t.length)
93 memcpy(n+length, t.data, t.length);
94 length += t.length;
95 n[length] = 0;
96
97 delete [] data;
98 data = n;
99
100 return *this;
101}
102
103CString &CString::operator=(const char *c)
104{
105 if (data)
106 delete [] data;
107 length = strlen(c);
108 data = new char[length+1];
109 memcpy(data, c, length + 1);
110
111 return *this;
112}
113
114CString &CString::operator=(const CString &str)
115{
116 if (this == &str)
117 return *this;
118
119 if (data)
120 delete [] data;
121 length = str.length;
122 if (length > 0 && str.data) {
123 data = new char[length + 1];
124 memcpy(data, str.data, length + 1);
125 }
126 else {
127 data = 0;
128 }
129
130 return *this;
131}
132
133bool operator==(const CString& c1, const CString& c2)
134{
135 int len = c1.size();
136 return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0);
137}
138
139// Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
140static unsigned short almostUChar;
141static UChar *const nonNullUCharPointer = reinterpret_cast<UChar *>(&almostUChar);
142UString::Rep UString::Rep::null = { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
143UString::Rep UString::Rep::empty = { 0, 0, 1, 0, 0, 0, nonNullUCharPointer, 0, 0, 0, 0 };
144const int normalStatBufferSize = 4096;
145static char *statBuffer = 0;
146static int statBufferSize = 0;
147
148UChar UChar::toLower() const
149{
150 return static_cast<unsigned short>(u_tolower(uc));
151}
152
153UChar UChar::toUpper() const
154{
155 return static_cast<unsigned short>(u_toupper(uc));
156}
157
158UCharReference& UCharReference::operator=(UChar c)
159{
160 str->copyForWriting();
161 if (offset < str->rep()->len)
162 *(str->rep()->data() + offset) = c;
163 /* TODO: lengthen string ? */
164 return *this;
165}
166
167UChar& UCharReference::ref() const
168{
169 if (offset < str->rep()->len)
170 return *(str->rep()->data() + offset);
171 else {
172 static UChar callerBetterNotModifyThis('\0');
173 return callerBetterNotModifyThis;
174 }
175}
176
177PassRefPtr<UString::Rep> UString::Rep::createCopying(const UChar *d, int l)
178{
179 int sizeInBytes = l * sizeof(UChar);
180 UChar *copyD = static_cast<UChar *>(fastMalloc(sizeInBytes));
181 memcpy(copyD, d, sizeInBytes);
182
183 return create(copyD, l);
184}
185
186PassRefPtr<UString::Rep> UString::Rep::create(UChar *d, int l)
187{
188 Rep *r = new Rep;
189 r->offset = 0;
190 r->len = l;
191 r->rc = 1;
192 r->_hash = 0;
193 r->isIdentifier = 0;
194 r->baseString = 0;
195 r->buf = d;
196 r->usedCapacity = l;
197 r->capacity = l;
198 r->usedPreCapacity = 0;
199 r->preCapacity = 0;
200
201 // steal the single reference this Rep was created with
202 return adoptRef(r);
203}
204
205PassRefPtr<UString::Rep> UString::Rep::create(PassRefPtr<Rep> base, int offset, int length)
206{
207 assert(base);
208
209 int baseOffset = base->offset;
210
211 if (base->baseString) {
212 base = base->baseString;
213 }
214
215 assert(-(offset + baseOffset) <= base->usedPreCapacity);
216 assert(offset + baseOffset + length <= base->usedCapacity);
217
218 Rep *r = new Rep;
219 r->offset = baseOffset + offset;
220 r->len = length;
221 r->rc = 1;
222 r->_hash = 0;
223 r->isIdentifier = 0;
224 r->baseString = base.release();
225 r->buf = 0;
226 r->usedCapacity = 0;
227 r->capacity = 0;
228 r->usedPreCapacity = 0;
229 r->preCapacity = 0;
230
231 // steal the single reference this Rep was created with
232 return adoptRef(r);
233}
234
235void UString::Rep::destroy()
236{
237 if (isIdentifier)
238 Identifier::remove(this);
239 if (baseString) {
240 baseString->deref();
241 } else {
242 fastFree(buf);
243 }
244 delete this;
245}
246
// Seed for the string hashes below: a golden-ratio constant, chosen as an
// arbitrary non-zero start value so that all-zero input does not hash to
// all zeros or anything like that.
const unsigned PHI = 0x9E3779B9U;
250
251// Paul Hsieh's SuperFastHash
252// http://www.azillionmonkeys.com/qed/hash.html
253unsigned UString::Rep::computeHash(const UChar *s, int len)
254{
255 unsigned l = len;
256 uint32_t hash = PHI;
257 uint32_t tmp;
258
259 int rem = l & 1;
260 l >>= 1;
261
262 // Main loop
263 for (; l > 0; l--) {
264 hash += s[0].uc;
265 tmp = (s[1].uc << 11) ^ hash;
266 hash = (hash << 16) ^ tmp;
267 s += 2;
268 hash += hash >> 11;
269 }
270
271 // Handle end case
272 if (rem) {
273 hash += s[0].uc;
274 hash ^= hash << 11;
275 hash += hash >> 17;
276 }
277
278 // Force "avalanching" of final 127 bits
279 hash ^= hash << 3;
280 hash += hash >> 5;
281 hash ^= hash << 2;
282 hash += hash >> 15;
283 hash ^= hash << 10;
284
285 // this avoids ever returning a hash code of 0, since that is used to
286 // signal "hash not computed yet", using a value that is likely to be
287 // effectively the same as 0 when the low bits are masked
288 if (hash == 0)
289 hash = 0x80000000;
290
291 return hash;
292}
293
294// Paul Hsieh's SuperFastHash
295// http://www.azillionmonkeys.com/qed/hash.html
296unsigned UString::Rep::computeHash(const char *s)
297{
298 // This hash is designed to work on 16-bit chunks at a time. But since the normal case
299 // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
300 // were 16-bit chunks, which should give matching results
301
302 uint32_t hash = PHI;
303 uint32_t tmp;
304 unsigned l = strlen(s);
305
306 int rem = l & 1;
307 l >>= 1;
308
309 // Main loop
310 for (; l > 0; l--) {
311 hash += (unsigned char)s[0];
312 tmp = ((unsigned char)s[1] << 11) ^ hash;
313 hash = (hash << 16) ^ tmp;
314 s += 2;
315 hash += hash >> 11;
316 }
317
318 // Handle end case
319 if (rem) {
320 hash += (unsigned char)s[0];
321 hash ^= hash << 11;
322 hash += hash >> 17;
323 }
324
325 // Force "avalanching" of final 127 bits
326 hash ^= hash << 3;
327 hash += hash >> 5;
328 hash ^= hash << 2;
329 hash += hash >> 15;
330 hash ^= hash << 10;
331
332 // this avoids ever returning a hash code of 0, since that is used to
333 // signal "hash not computed yet", using a value that is likely to be
334 // effectively the same as 0 when the low bits are masked
335 if (hash == 0)
336 hash = 0x80000000;
337
338 return hash;
339}
340
341// put these early so they can be inlined
342inline int UString::expandedSize(int size, int otherSize) const
343{
344 int s = (size * 11 / 10) + 1 + otherSize;
345 return s;
346}
347
348inline int UString::usedCapacity() const
349{
350 return m_rep->baseString ? m_rep->baseString->usedCapacity : m_rep->usedCapacity;
351}
352
353inline int UString::usedPreCapacity() const
354{
355 return m_rep->baseString ? m_rep->baseString->usedPreCapacity : m_rep->usedPreCapacity;
356}
357
358void UString::expandCapacity(int requiredLength)
359{
360 Rep *r = m_rep->baseString ? m_rep->baseString : rep();
361
362 if (requiredLength > r->capacity) {
363 int newCapacity = expandedSize(requiredLength, r->preCapacity);
364 r->buf = static_cast<UChar *>(fastRealloc(r->buf, newCapacity * sizeof(UChar)));
365 r->capacity = newCapacity - r->preCapacity;
366 }
367 if (requiredLength > r->usedCapacity) {
368 r->usedCapacity = requiredLength;
369 }
370}
371
372void UString::expandPreCapacity(int requiredPreCap)
373{
374 Rep *r = m_rep->baseString ? m_rep->baseString : rep();
375
376 if (requiredPreCap > r->preCapacity) {
377 int newCapacity = expandedSize(requiredPreCap, r->capacity);
378 int delta = newCapacity - r->capacity - r->preCapacity;
379
380 UChar *newBuf = static_cast<UChar *>(fastMalloc(newCapacity * sizeof(UChar)));
381 memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar));
382 fastFree(r->buf);
383 r->buf = newBuf;
384
385 r->preCapacity = newCapacity - r->capacity;
386 }
387 if (requiredPreCap > r->usedPreCapacity) {
388 r->usedPreCapacity = requiredPreCap;
389 }
390}
391
392
393UString::UString(char c)
394{
395 UChar *d = static_cast<UChar *>(fastMalloc(sizeof(UChar)));
396 d[0] = c;
397 m_rep = Rep::create(d, 1);
398}
399
400UString::UString(const char *c)
401{
402 if (!c) {
403 m_rep = &Rep::null;
404 return;
405 }
406 int length = strlen(c);
407 if (length == 0) {
408 m_rep = &Rep::empty;
409 return;
410 }
411 UChar *d = static_cast<UChar *>(fastMalloc(sizeof(UChar) * length));
412 for (int i = 0; i < length; i++)
413 d[i].uc = c[i];
414 m_rep = Rep::create(d, length);
415}
416
417UString::UString(const UChar *c, int length)
418{
419 if (length == 0)
420 m_rep = &Rep::empty;
421 else
422 m_rep = Rep::createCopying(c, length);
423}
424
425UString::UString(UChar *c, int length, bool copy)
426{
427 if (length == 0)
428 m_rep = &Rep::empty;
429 else if (copy)
430 m_rep = Rep::createCopying(c, length);
431 else
432 m_rep = Rep::create(c, length);
433}
434
435UString::UString(const UString &a, const UString &b)
436{
437 int aSize = a.size();
438 int aOffset = a.m_rep->offset;
439 int bSize = b.size();
440 int bOffset = b.m_rep->offset;
441 int length = aSize + bSize;
442
443 // possible cases:
444
445 if (aSize == 0) {
446 // a is empty
447 m_rep = b.m_rep;
448 } else if (bSize == 0) {
449 // b is empty
450 m_rep = a.m_rep;
451 } else if (aOffset + aSize == a.usedCapacity() && 4 * aSize >= bSize &&
452 (-bOffset != b.usedPreCapacity() || aSize >= bSize)) {
453 // - a reaches the end of its buffer so it qualifies for shared append
454 // - also, it's at least a quarter the length of b - appending to a much shorter
455 // string does more harm than good
456 // - however, if b qualifies for prepend and is longer than a, we'd rather prepend
457 UString x(a);
458 x.expandCapacity(aOffset + length);
459 memcpy(const_cast<UChar *>(a.data() + aSize), b.data(), bSize * sizeof(UChar));
460 m_rep = Rep::create(a.m_rep, 0, length);
461 } else if (-bOffset == b.usedPreCapacity() && 4 * bSize >= aSize) {
462 // - b reaches the beginning of its buffer so it qualifies for shared prepend
463 // - also, it's at least a quarter the length of a - prepending to a much shorter
464 // string does more harm than good
465 UString y(b);
466 y.expandPreCapacity(-bOffset + aSize);
467 memcpy(const_cast<UChar *>(b.data() - aSize), a.data(), aSize * sizeof(UChar));
468 m_rep = Rep::create(b.m_rep, -aSize, length);
469 } else {
470 // a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
471 int newCapacity = expandedSize(length, 0);
472 UChar *d = static_cast<UChar *>(fastMalloc(sizeof(UChar) * newCapacity));
473 memcpy(d, a.data(), aSize * sizeof(UChar));
474 memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
475 m_rep = Rep::create(d, length);
476 m_rep->capacity = newCapacity;
477 }
478}
479
480const UString &UString::null()
481{
482 static UString n;
483 return n;
484}
485
486UString UString::from(int i)
487{
488 UChar buf[1 + sizeof(i) * 3];
489 UChar *end = buf + sizeof(buf) / sizeof(UChar);
490 UChar *p = end;
491
492 if (i == 0) {
493 *--p = '0';
494 } else if (i == INT_MIN) {
495 char minBuf[1 + sizeof(i) * 3];
496 sprintf(minBuf, "%d", INT_MIN);
497 return UString(minBuf);
498 } else {
499 bool negative = false;
500 if (i < 0) {
501 negative = true;
502 i = -i;
503 }
504 while (i) {
505 *--p = (unsigned short)((i % 10) + '0');
506 i /= 10;
507 }
508 if (negative) {
509 *--p = '-';
510 }
511 }
512
513 return UString(p, end - p);
514}
515
516UString UString::from(unsigned int u)
517{
518 UChar buf[sizeof(u) * 3];
519 UChar *end = buf + sizeof(buf) / sizeof(UChar);
520 UChar *p = end;
521
522 if (u == 0) {
523 *--p = '0';
524 } else {
525 while (u) {
526 *--p = (unsigned short)((u % 10) + '0');
527 u /= 10;
528 }
529 }
530
531 return UString(p, end - p);
532}
533
534UString UString::from(long l)
535{
536 UChar buf[1 + sizeof(l) * 3];
537 UChar *end = buf + sizeof(buf) / sizeof(UChar);
538 UChar *p = end;
539
540 if (l == 0) {
541 *--p = '0';
542 } else if (l == LONG_MIN) {
543 char minBuf[1 + sizeof(l) * 3];
544 sprintf(minBuf, "%ld", LONG_MIN);
545 return UString(minBuf);
546 } else {
547 bool negative = false;
548 if (l < 0) {
549 negative = true;
550 l = -l;
551 }
552 while (l) {
553 *--p = (unsigned short)((l % 10) + '0');
554 l /= 10;
555 }
556 if (negative) {
557 *--p = '-';
558 }
559 }
560
561 return UString(p, end - p);
562}
563
564UString UString::from(double d)
565{
566 char buf[80];
567 int decimalPoint;
568 int sign;
569
570 char *result = kjs_dtoa(d, 0, 0, &decimalPoint, &sign, NULL);
571 int length = strlen(result);
572
573 int i = 0;
574 if (sign) {
575 buf[i++] = '-';
576 }
577
578 if (decimalPoint <= 0 && decimalPoint > -6) {
579 buf[i++] = '0';
580 buf[i++] = '.';
581 for (int j = decimalPoint; j < 0; j++) {
582 buf[i++] = '0';
583 }
584 strcpy(buf + i, result);
585 } else if (decimalPoint <= 21 && decimalPoint > 0) {
586 if (length <= decimalPoint) {
587 strcpy(buf + i, result);
588 i += length;
589 for (int j = 0; j < decimalPoint - length; j++) {
590 buf[i++] = '0';
591 }
592 buf[i] = '\0';
593 } else {
594 strncpy(buf + i, result, decimalPoint);
595 i += decimalPoint;
596 buf[i++] = '.';
597 strcpy(buf + i, result + decimalPoint);
598 }
599 } else if (result[0] < '0' || result[0] > '9') {
600 strcpy(buf + i, result);
601 } else {
602 buf[i++] = result[0];
603 if (length > 1) {
604 buf[i++] = '.';
605 strcpy(buf + i, result + 1);
606 i += length - 1;
607 }
608
609 buf[i++] = 'e';
610 buf[i++] = (decimalPoint >= 0) ? '+' : '-';
611 // decimalPoint can't be more than 3 digits decimal given the
612 // nature of float representation
613 int exponential = decimalPoint - 1;
614 if (exponential < 0) {
615 exponential = exponential * -1;
616 }
617 if (exponential >= 100) {
618 buf[i++] = '0' + exponential / 100;
619 }
620 if (exponential >= 10) {
621 buf[i++] = '0' + (exponential % 100) / 10;
622 }
623 buf[i++] = '0' + exponential % 10;
624 buf[i++] = '\0';
625 }
626
627 kjs_freedtoa(result);
628
629 return UString(buf);
630}
631
632UString UString::spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const
633{
634 int totalLength = 0;
635
636 for (int i = 0; i < rangeCount; i++) {
637 totalLength += substringRanges[i].length;
638 }
639 for (int i = 0; i < separatorCount; i++) {
640 totalLength += separators[i].size();
641 }
642
643 UChar *buffer = static_cast<UChar *>(fastMalloc(totalLength * sizeof(UChar)));
644
645 int maxCount = max(rangeCount, separatorCount);
646 int bufferPos = 0;
647 for (int i = 0; i < maxCount; i++) {
648 if (i < rangeCount) {
649 memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar));
650 bufferPos += substringRanges[i].length;
651 }
652 if (i < separatorCount) {
653 memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar));
654 bufferPos += separators[i].size();
655 }
656 }
657
658 return UString(UString::Rep::create(buffer, totalLength));
659}
660
661
662
663UString &UString::append(const UString &t)
664{
665 int thisSize = size();
666 int thisOffset = m_rep->offset;
667 int tSize = t.size();
668 int length = thisSize + tSize;
669
670 // possible cases:
671 if (thisSize == 0) {
672 // this is empty
673 *this = t;
674 } else if (tSize == 0) {
675 // t is empty
676 } else if (!m_rep->baseString && m_rep->rc == 1) {
677 // this is direct and has refcount of 1 (so we can just alter it directly)
678 expandCapacity(thisOffset + length);
679 memcpy(const_cast<UChar *>(data() + thisSize), t.data(), tSize * sizeof(UChar));
680 m_rep->len = length;
681 m_rep->_hash = 0;
682 } else if (thisOffset + thisSize == usedCapacity()) {
683 // this reaches the end of the buffer - extend it
684 expandCapacity(thisOffset + length);
685 memcpy(const_cast<UChar *>(data() + thisSize), t.data(), tSize * sizeof(UChar));
686 m_rep = Rep::create(m_rep, 0, length);
687 } else {
688 // this is shared with someone using more capacity, gotta make a whole new string
689 int newCapacity = expandedSize(length, 0);
690 UChar *d = static_cast<UChar *>(fastMalloc(sizeof(UChar) * newCapacity));
691 memcpy(d, data(), thisSize * sizeof(UChar));
692 memcpy(const_cast<UChar *>(d + thisSize), t.data(), tSize * sizeof(UChar));
693 m_rep = Rep::create(d, length);
694 m_rep->capacity = newCapacity;
695 }
696
697 return *this;
698}
699
700UString &UString::append(const char *t)
701{
702 int thisSize = size();
703 int thisOffset = m_rep->offset;
704 int tSize = strlen(t);
705 int length = thisSize + tSize;
706
707 // possible cases:
708 if (thisSize == 0) {
709 // this is empty
710 *this = t;
711 } else if (tSize == 0) {
712 // t is empty, we'll just return *this below.
713 } else if (!m_rep->baseString && m_rep->rc == 1) {
714 // this is direct and has refcount of 1 (so we can just alter it directly)
715 expandCapacity(thisOffset + length);
716 UChar *d = const_cast<UChar *>(data());
717 for (int i = 0; i < tSize; ++i)
718 d[thisSize+i] = t[i];
719 m_rep->len = length;
720 m_rep->_hash = 0;
721 } else if (thisOffset + thisSize == usedCapacity()) {
722 // this string reaches the end of the buffer - extend it
723 expandCapacity(thisOffset + length);
724 UChar *d = const_cast<UChar *>(data());
725 for (int i = 0; i < tSize; ++i)
726 d[thisSize+i] = t[i];
727 m_rep = Rep::create(m_rep, 0, length);
728 } else {
729 // this is shared with someone using more capacity, gotta make a whole new string
730 int newCapacity = expandedSize(length, 0);
731 UChar *d = static_cast<UChar *>(fastMalloc(sizeof(UChar) * newCapacity));
732 memcpy(d, data(), thisSize * sizeof(UChar));
733 for (int i = 0; i < tSize; ++i)
734 d[thisSize+i] = t[i];
735 m_rep = Rep::create(d, length);
736 m_rep->capacity = newCapacity;
737 }
738
739 return *this;
740}
741
742UString &UString::append(unsigned short c)
743{
744 int thisOffset = m_rep->offset;
745 int length = size();
746
747 // possible cases:
748 if (length == 0) {
749 // this is empty - must make a new m_rep because we don't want to pollute the shared empty one
750 int newCapacity = expandedSize(1, 0);
751 UChar *d = static_cast<UChar *>(fastMalloc(sizeof(UChar) * newCapacity));
752 d[0] = c;
753 m_rep = Rep::create(d, 1);
754 m_rep->capacity = newCapacity;
755 } else if (!m_rep->baseString && m_rep->rc == 1) {
756 // this is direct and has refcount of 1 (so we can just alter it directly)
757 expandCapacity(thisOffset + length + 1);
758 UChar *d = const_cast<UChar *>(data());
759 d[length] = c;
760 m_rep->len = length + 1;
761 m_rep->_hash = 0;
762 } else if (thisOffset + length == usedCapacity()) {
763 // this reaches the end of the string - extend it and share
764 expandCapacity(thisOffset + length + 1);
765 UChar *d = const_cast<UChar *>(data());
766 d[length] = c;
767 m_rep = Rep::create(m_rep, 0, length + 1);
768 } else {
769 // this is shared with someone using more capacity, gotta make a whole new string
770 int newCapacity = expandedSize((length + 1), 0);
771 UChar *d = static_cast<UChar *>(fastMalloc(sizeof(UChar) * newCapacity));
772 memcpy(d, data(), length * sizeof(UChar));
773 d[length] = c;
774 m_rep = Rep::create(d, length);
775 m_rep->capacity = newCapacity;
776 }
777
778 return *this;
779}
780
781CString UString::cstring() const
782{
783 return ascii();
784}
785
786char *UString::ascii() const
787{
788 // Never make the buffer smaller than normalStatBufferSize.
789 // Thus we almost never need to reallocate.
790 int length = size();
791 int neededSize = length + 1;
792 if (neededSize < normalStatBufferSize) {
793 neededSize = normalStatBufferSize;
794 }
795 if (neededSize != statBufferSize) {
796 delete [] statBuffer;
797 statBuffer = new char [neededSize];
798 statBufferSize = neededSize;
799 }
800
801 const UChar *p = data();
802 char *q = statBuffer;
803 const UChar *limit = p + length;
804 while (p != limit) {
805 *q = p->uc;
806 ++p;
807 ++q;
808 }
809 *q = '\0';
810
811 return statBuffer;
812}
813
#ifdef KJS_DEBUG_MEM
// Frees the shared buffer used by ascii() and resets its bookkeeping, so the
// next ascii() call allocates a fresh one. Only built with KJS_DEBUG_MEM.
void UString::globalClear()
{
  delete [] statBuffer;
  statBufferSize = 0;
  statBuffer = 0;
}
#endif
822
823UString &UString::operator=(const char *c)
824{
825 int l = c ? strlen(c) : 0;
826 UChar *d;
827 if (m_rep->rc == 1 && l <= m_rep->capacity && !m_rep->baseString && m_rep->offset == 0 && m_rep->preCapacity == 0) {
828 d = m_rep->buf;
829 m_rep->_hash = 0;
830 } else {
831 d = static_cast<UChar *>(fastMalloc(sizeof(UChar) * l));
832 m_rep = Rep::create(d, l);
833 }
834 for (int i = 0; i < l; i++)
835 d[i].uc = c[i];
836
837 return *this;
838}
839
840bool UString::is8Bit() const
841{
842 const UChar *u = data();
843 const UChar *limit = u + size();
844 while (u < limit) {
845 if (u->uc > 0xFF)
846 return false;
847 ++u;
848 }
849
850 return true;
851}
852
853UChar UString::operator[](int pos) const
854{
855 if (pos >= size())
856 return '\0';
857 return data()[pos];
858}
859
860UCharReference UString::operator[](int pos)
861{
862 /* TODO: boundary check */
863 return UCharReference(this, pos);
864}
865
866double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
867{
868 double d;
869
870 // FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
871 // after the number, so is8Bit is too strict a check.
872 if (!is8Bit())
873 return NaN;
874
875 const char *c = ascii();
876
877 // skip leading white space
878 while (isspace(*c))
879 c++;
880
881 // empty string ?
882 if (*c == '\0')
883 return tolerateEmptyString ? 0.0 : NaN;
884
885 // hex number ?
886 if (*c == '0' && (*(c+1) == 'x' || *(c+1) == 'X')) {
887 c++;
888 d = 0.0;
889 while (*(++c)) {
890 if (*c >= '0' && *c <= '9')
891 d = d * 16.0 + *c - '0';
892 else if ((*c >= 'A' && *c <= 'F') || (*c >= 'a' && *c <= 'f'))
893 d = d * 16.0 + (*c & 0xdf) - 'A' + 10.0;
894 else
895 break;
896 }
897 } else {
898 // regular number ?
899 char *end;
900 d = kjs_strtod(c, &end);
901 if ((d != 0.0 || end != c) && d != HUGE_VAL && d != -HUGE_VAL) {
902 c = end;
903 } else {
904 // infinity ?
905 d = 1.0;
906 if (*c == '+')
907 c++;
908 else if (*c == '-') {
909 d = -1.0;
910 c++;
911 }
912 if (strncmp(c, "Infinity", 8) != 0)
913 return NaN;
914 d = d * Inf;
915 c += 8;
916 }
917 }
918
919 // allow trailing white space
920 while (isspace(*c))
921 c++;
922 // don't allow anything after - unless tolerant=true
923 if (!tolerateTrailingJunk && *c != '\0')
924 d = NaN;
925
926 return d;
927}
928
929double UString::toDouble(bool tolerateTrailingJunk) const
930{
931 return toDouble(tolerateTrailingJunk, true);
932}
933
934double UString::toDouble() const
935{
936 return toDouble(false, true);
937}
938
939uint32_t UString::toUInt32(bool *ok) const
940{
941 double d = toDouble();
942 bool b = true;
943
944 if (d != static_cast<uint32_t>(d)) {
945 b = false;
946 d = 0;
947 }
948
949 if (ok)
950 *ok = b;
951
952 return static_cast<uint32_t>(d);
953}
954
955uint32_t UString::toUInt32(bool *ok, bool tolerateEmptyString) const
956{
957 double d = toDouble(false, tolerateEmptyString);
958 bool b = true;
959
960 if (d != static_cast<uint32_t>(d)) {
961 b = false;
962 d = 0;
963 }
964
965 if (ok)
966 *ok = b;
967
968 return static_cast<uint32_t>(d);
969}
970
971uint32_t UString::toStrictUInt32(bool *ok) const
972{
973 if (ok)
974 *ok = false;
975
976 // Empty string is not OK.
977 int len = m_rep->len;
978 if (len == 0)
979 return 0;
980 const UChar *p = m_rep->data();
981 unsigned short c = p->unicode();
982
983 // If the first digit is 0, only 0 itself is OK.
984 if (c == '0') {
985 if (len == 1 && ok)
986 *ok = true;
987 return 0;
988 }
989
990 // Convert to UInt32, checking for overflow.
991 uint32_t i = 0;
992 while (1) {
993 // Process character, turning it into a digit.
994 if (c < '0' || c > '9')
995 return 0;
996 const unsigned d = c - '0';
997
998 // Multiply by 10, checking for overflow out of 32 bits.
999 if (i > 0xFFFFFFFFU / 10)
1000 return 0;
1001 i *= 10;
1002
1003 // Add in the digit, checking for overflow out of 32 bits.
1004 const unsigned max = 0xFFFFFFFFU - d;
1005 if (i > max)
1006 return 0;
1007 i += d;
1008
1009 // Handle end of string.
1010 if (--len == 0) {
1011 if (ok)
1012 *ok = true;
1013 return i;
1014 }
1015
1016 // Get next character.
1017 c = (++p)->unicode();
1018 }
1019}
1020
1021int UString::find(const UString &f, int pos) const
1022{
1023 int sz = size();
1024 int fsz = f.size();
1025 if (sz < fsz)
1026 return -1;
1027 if (pos < 0)
1028 pos = 0;
1029 if (fsz == 0)
1030 return pos;
1031 const UChar *end = data() + sz - fsz;
1032 int fsizeminusone = (fsz - 1) * sizeof(UChar);
1033 const UChar *fdata = f.data();
1034 unsigned short fchar = fdata->uc;
1035 ++fdata;
1036 for (const UChar *c = data() + pos; c <= end; c++)
1037 if (c->uc == fchar && !memcmp(c + 1, fdata, fsizeminusone))
1038 return (c-data());
1039
1040 return -1;
1041}
1042
1043int UString::find(UChar ch, int pos) const
1044{
1045 if (pos < 0)
1046 pos = 0;
1047 const UChar *end = data() + size();
1048 for (const UChar *c = data() + pos; c < end; c++)
1049 if (*c == ch)
1050 return (c-data());
1051
1052 return -1;
1053}
1054
1055int UString::rfind(const UString &f, int pos) const
1056{
1057 int sz = size();
1058 int fsz = f.size();
1059 if (sz < fsz)
1060 return -1;
1061 if (pos < 0)
1062 pos = 0;
1063 if (pos > sz - fsz)
1064 pos = sz - fsz;
1065 if (fsz == 0)
1066 return pos;
1067 int fsizeminusone = (fsz - 1) * sizeof(UChar);
1068 const UChar *fdata = f.data();
1069 for (const UChar *c = data() + pos; c >= data(); c--) {
1070 if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
1071 return (c-data());
1072 }
1073
1074 return -1;
1075}
1076
1077int UString::rfind(UChar ch, int pos) const
1078{
1079 if (isEmpty())
1080 return -1;
1081 if (pos + 1 >= size())
1082 pos = size() - 1;
1083 for (const UChar *c = data() + pos; c >= data(); c--) {
1084 if (*c == ch)
1085 return (c-data());
1086 }
1087
1088 return -1;
1089}
1090
1091UString UString::substr(int pos, int len) const
1092{
1093 int s = size();
1094
1095 if (pos < 0)
1096 pos = 0;
1097 else if (pos >= s)
1098 pos = s;
1099 if (len < 0)
1100 len = s;
1101 if (pos + len >= s)
1102 len = s - pos;
1103
1104 if (pos == 0 && len == s)
1105 return *this;
1106
1107 return UString(Rep::create(m_rep, pos, len));
1108}
1109
1110void UString::copyForWriting()
1111{
1112 if (m_rep->rc > 1 || m_rep->baseString) {
1113 int l = size();
1114 UChar *n = static_cast<UChar *>(fastMalloc(sizeof(UChar) * l));
1115 memcpy(n, data(), l * sizeof(UChar));
1116 m_rep = Rep::create(n, l);
1117 }
1118}
1119
1120bool operator==(const UString& s1, const UString& s2)
1121{
1122 if (s1.m_rep->len != s2.m_rep->len)
1123 return false;
1124
1125 return (memcmp(s1.m_rep->data(), s2.m_rep->data(),
1126 s1.m_rep->len * sizeof(UChar)) == 0);
1127}
1128
1129bool operator==(const UString& s1, const char *s2)
1130{
1131 if (s2 == 0) {
1132 return s1.isEmpty();
1133 }
1134
1135 const UChar *u = s1.data();
1136 const UChar *uend = u + s1.size();
1137 while (u != uend && *s2) {
1138 if (u->uc != (unsigned char)*s2)
1139 return false;
1140 s2++;
1141 u++;
1142 }
1143
1144 return u == uend && *s2 == 0;
1145}
1146
1147bool operator<(const UString& s1, const UString& s2)
1148{
1149 const int l1 = s1.size();
1150 const int l2 = s2.size();
1151 const int lmin = l1 < l2 ? l1 : l2;
1152 const UChar *c1 = s1.data();
1153 const UChar *c2 = s2.data();
1154 int l = 0;
1155 while (l < lmin && *c1 == *c2) {
1156 c1++;
1157 c2++;
1158 l++;
1159 }
1160 if (l < lmin)
1161 return (c1->uc < c2->uc);
1162
1163 return (l1 < l2);
1164}
1165
1166int compare(const UString& s1, const UString& s2)
1167{
1168 const int l1 = s1.size();
1169 const int l2 = s2.size();
1170 const int lmin = l1 < l2 ? l1 : l2;
1171 const UChar *c1 = s1.data();
1172 const UChar *c2 = s2.data();
1173 int l = 0;
1174 while (l < lmin && *c1 == *c2) {
1175 c1++;
1176 c2++;
1177 l++;
1178 }
1179
1180 if (l < lmin)
1181 return (c1->uc > c2->uc) ? 1 : -1;
1182
1183 if (l1 == l2)
1184 return 0;
1185
1186 return (l1 > l2) ? 1 : -1;
1187}
1188
// Length of the UTF-8 sequence introduced by the non-ASCII lead byte b0:
// 2 for 110xxxxx, 3 for 1110xxxx, 4 for 11110xxx, and 0 for every other byte
// (including ASCII bytes and continuation bytes).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
  const unsigned char lead = static_cast<unsigned char>(b0);
  if (lead < 0xC0)
    return 0;
  if (lead < 0xE0)
    return 2;
  if (lead < 0xF0)
    return 3;
  if (lead < 0xF8)
    return 4;
  return 0;
}
1201
1202int UTF8SequenceLengthNonASCII(char b0)
1203{
1204 return inlineUTF8SequenceLengthNonASCII(b0);
1205}
1206
1207inline int inlineUTF8SequenceLength(char b0)
1208{
1209 return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
1210}
1211
1212// Given a first byte, gives the length of the UTF-8 sequence it begins.
1213// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1214// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1215int UTF8SequenceLength(char b0)
1216{
1217 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
1218}
1219
1220// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1221// Only allows Unicode characters (U-00000000 to U-0010FFFF).
1222// Returns -1 if the sequence is not valid (including presence of extra bytes).
1223int decodeUTF8Sequence(const char *sequence)
1224{
1225 // Handle 0-byte sequences (never valid).
1226 const unsigned char b0 = sequence[0];
1227 const int length = inlineUTF8SequenceLength(b0);
1228 if (length == 0)
1229 return -1;
1230
1231 // Handle 1-byte sequences (plain ASCII).
1232 const unsigned char b1 = sequence[1];
1233 if (length == 1) {
1234 if (b1)
1235 return -1;
1236 return b0;
1237 }
1238
1239 // Handle 2-byte sequences.
1240 if ((b1 & 0xC0) != 0x80)
1241 return -1;
1242 const unsigned char b2 = sequence[2];
1243 if (length == 2) {
1244 if (b2)
1245 return -1;
1246 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
1247 if (c < 0x80)
1248 return -1;
1249 return c;
1250 }
1251
1252 // Handle 3-byte sequences.
1253 if ((b2 & 0xC0) != 0x80)
1254 return -1;
1255 const unsigned char b3 = sequence[3];
1256 if (length == 3) {
1257 if (b3)
1258 return -1;
1259 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
1260 if (c < 0x800)
1261 return -1;
1262 // UTF-16 surrogates should never appear in UTF-8 data.
1263 if (c >= 0xD800 && c <= 0xDFFF)
1264 return -1;
1265 // Backwards BOM and U+FFFF should never appear in UTF-8 data.
1266 if (c == 0xFFFE || c == 0xFFFF)
1267 return -1;
1268 return c;
1269 }
1270
1271 // Handle 4-byte sequences.
1272 if ((b3 & 0xC0) != 0x80)
1273 return -1;
1274 const unsigned char b4 = sequence[4];
1275 if (length == 4) {
1276 if (b4)
1277 return -1;
1278 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
1279 if (c < 0x10000 || c > 0x10FFFF)
1280 return -1;
1281 return c;
1282 }
1283
1284 return -1;
1285}
1286
1287CString UString::UTF8String() const
1288{
1289 // Allocate a buffer big enough to hold all the characters.
1290 const int length = size();
1291 Vector<char, 1024> buffer(length * 3);
1292
1293 // Convert to runs of 8-bit characters.
1294 char *p = buffer.begin();
1295 const UChar *d = data();
1296 for (int i = 0; i != length; ++i) {
1297 unsigned short c = d[i].unicode();
1298 if (c < 0x80) {
1299 *p++ = (char)c;
1300 } else if (c < 0x800) {
1301 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
1302 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1303 } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
1304 unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
1305 *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
1306 *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
1307 *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1308 *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
1309 ++i;
1310 } else {
1311 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
1312 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1313 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1314 }
1315 }
1316
1317 // Return the result as a C string.
1318 CString result(buffer, p - buffer);
1319
1320 return result;
1321}
1322
1323} // namespace KJS
Note: See TracBrowser for help on using the repository browser.