source: webkit/trunk/JavaScriptCore/kjs/ustring.cpp@ 9582

Last change on this file since 9582 was 9501, checked in by mjs, 20 years ago

Reviewed by Darin.

  • replace hash functions with better ones
  • JavaScriptCore.pbproj/project.pbxproj: Add new file to build.
  • kjs/interpreter_map.cpp: (KJS::InterpreterMap::computeHash): Use shared pointer hash.
  • kjs/pointer_hash.h: Added. (KJS::pointerHash): Pointer hash based on 32-bit mix and 64-bit mix hashes.
  • kjs/protected_values.cpp: (KJS::ProtectedValues::computeHash): Use shared pointer hash.
  • kjs/ustring.cpp: (KJS::UString::Rep::computeHash): Use SuperFastHash algorithm.
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 31.8 KB
Line 
1// -*- c-basic-offset: 2 -*-
2/*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2000 Harri Porten ([email protected])
5 * Copyright (C) 2004 Apple Computer, Inc.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
16 *
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
21 *
22 */
23
24#ifdef HAVE_CONFIG_H
25#include <config.h>
26#endif
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <ctype.h>
31#ifdef HAVE_STRING_H
32#include <string.h>
33#endif
34#ifdef HAVE_STRINGS_H
35#include <strings.h>
36#endif
37
38#include "fast_malloc.h"
39#include "ustring.h"
40#include "operations.h"
41#include "identifier.h"
42#include <math.h>
43#include "dtoa.h"
44
45#if APPLE_CHANGES
46
47#include <unicode/uchar.h>
48
49#endif
50
51namespace KJS {
52
53extern const double NaN;
54extern const double Inf;
55
56CString::CString(const char *c)
57{
58 length = strlen(c);
59 data = new char[length+1];
60 memcpy(data, c, length + 1);
61}
62
63CString::CString(const char *c, int len)
64{
65 length = len;
66 data = new char[len+1];
67 memcpy(data, c, len);
68 data[len] = 0;
69}
70
71CString::CString(const CString &b)
72{
73 length = b.length;
74 if (length > 0 && b.data) {
75 data = new char[length+1];
76 memcpy(data, b.data, length + 1);
77 }
78 else {
79 data = 0;
80 }
81}
82
83CString::~CString()
84{
85 delete [] data;
86}
87
88CString &CString::append(const CString &t)
89{
90 char *n;
91 n = new char[length+t.length+1];
92 if (length)
93 memcpy(n, data, length);
94 if (t.length)
95 memcpy(n+length, t.data, t.length);
96 length += t.length;
97 n[length] = 0;
98
99 delete [] data;
100 data = n;
101
102 return *this;
103}
104
105CString &CString::operator=(const char *c)
106{
107 if (data)
108 delete [] data;
109 length = strlen(c);
110 data = new char[length+1];
111 memcpy(data, c, length + 1);
112
113 return *this;
114}
115
116CString &CString::operator=(const CString &str)
117{
118 if (this == &str)
119 return *this;
120
121 if (data)
122 delete [] data;
123 length = str.length;
124 if (length > 0 && str.data) {
125 data = new char[length + 1];
126 memcpy(data, str.data, length + 1);
127 }
128 else {
129 data = 0;
130 }
131
132 return *this;
133}
134
135bool operator==(const CString& c1, const CString& c2)
136{
137 int len = c1.size();
138 return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0);
139}
140
141// Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
142static unsigned short almostUChar;
143static UChar *const nonNullUCharPointer = reinterpret_cast<UChar *>(&almostUChar);
144UString::Rep UString::Rep::null = { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
145UString::Rep UString::Rep::empty = { 0, 0, 1, 0, 0, 0, nonNullUCharPointer, 0, 0, 0, 0 };
146const int normalStatBufferSize = 4096;
147static char *statBuffer = 0;
148static int statBufferSize = 0;
149
150UChar UChar::toLower() const
151{
152#if APPLE_CHANGES
153 return static_cast<unsigned short>(u_tolower(uc));
154#else
155 // ### properly support unicode tolower
156 if (uc >= 256 || islower(uc))
157 return *this;
158
159 return (unsigned char)tolower(uc);
160#endif
161}
162
163UChar UChar::toUpper() const
164{
165#if APPLE_CHANGES
166 return static_cast<unsigned short>(u_toupper(uc));
167#else
168 if (uc >= 256 || isupper(uc))
169 return *this;
170
171 return (unsigned char)toupper(uc);
172#endif
173}
174
175UCharReference& UCharReference::operator=(UChar c)
176{
177 str->detach();
178 if (offset < str->rep->len)
179 *(str->rep->data() + offset) = c;
180 /* TODO: lengthen string ? */
181 return *this;
182}
183
184UChar& UCharReference::ref() const
185{
186 if (offset < str->rep->len)
187 return *(str->rep->data() + offset);
188 else {
189 static UChar callerBetterNotModifyThis('\0');
190 return callerBetterNotModifyThis;
191 }
192}
193
194UString::Rep *UString::Rep::createCopying(const UChar *d, int l)
195{
196 int sizeInBytes = l * sizeof(UChar);
197 UChar *copyD = static_cast<UChar *>(kjs_fast_malloc(sizeInBytes));
198 memcpy(copyD, d, sizeInBytes);
199
200 return create(copyD, l);
201}
202
203UString::Rep *UString::Rep::create(UChar *d, int l)
204{
205 Rep *r = new Rep;
206 r->offset = 0;
207 r->len = l;
208 r->rc = 1;
209 r->_hash = 0;
210 r->isIdentifier = 0;
211 r->baseString = 0;
212 r->buf = d;
213 r->usedCapacity = l;
214 r->capacity = l;
215 r->usedPreCapacity = 0;
216 r->preCapacity = 0;
217 return r;
218}
219
220UString::Rep *UString::Rep::create(Rep *base, int offset, int length)
221{
222 assert(base);
223
224 int baseOffset = base->offset;
225
226 if (base->baseString) {
227 base = base->baseString;
228 }
229
230 assert(-(offset + baseOffset) <= base->usedPreCapacity);
231 assert(offset + baseOffset + length <= base->usedCapacity);
232
233 Rep *r = new Rep;
234 r->offset = baseOffset + offset;
235 r->len = length;
236 r->rc = 1;
237 r->_hash = 0;
238 r->isIdentifier = 0;
239 r->baseString = base;
240 base->ref();
241 r->buf = 0;
242 r->usedCapacity = 0;
243 r->capacity = 0;
244 r->usedPreCapacity = 0;
245 r->preCapacity = 0;
246 return r;
247}
248
249void UString::Rep::destroy()
250{
251 if (isIdentifier)
252 Identifier::remove(this);
253 if (baseString) {
254 baseString->deref();
255 } else {
256 kjs_fast_free(buf);
257 }
258 delete this;
259}
260
261// Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
262// or anything like that.
263const unsigned PHI = 0x9e3779b9U;
264
265// Paul Hsieh's SuperFastHash
266// http://www.azillionmonkeys.com/qed/hash.html
267unsigned UString::Rep::computeHash(const UChar *s, int len)
268{
269 unsigned l = len;
270 uint32_t hash = PHI;
271 uint32_t tmp;
272
273 int rem = l & 1;
274 l >>= 1;
275
276 // Main loop
277 for (; l > 0; l--) {
278 hash += s[0].uc;
279 tmp = (s[1].uc << 11) ^ hash;
280 hash = (hash << 16) ^ tmp;
281 s += 2;
282 hash += hash >> 11;
283 }
284
285 // Handle end case
286 if (rem) {
287 hash += s[0].uc;
288 hash ^= hash << 11;
289 hash += hash >> 17;
290 }
291
292 // Force "avalanching" of final 127 bits
293 hash ^= hash << 3;
294 hash += hash >> 5;
295 hash ^= hash << 2;
296 hash += hash >> 15;
297 hash ^= hash << 10;
298
299 // this avoids ever returning a hash code of 0, since that is used to
300 // signal "hash not computed yet", using a value that is likely to be
301 // effectively the same as 0 when the low bits are masked
302 if (hash == 0)
303 hash = 0x80000000;
304
305 return hash;
306}
307
308// Paul Hsieh's SuperFastHash
309// http://www.azillionmonkeys.com/qed/hash.html
310unsigned UString::Rep::computeHash(const char *s)
311{
312 // This hash is designed to work on 16-bit chunks at a time. But since the normal case
313 // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
314 // were 16-bit chunks, which should give matching results
315
316 uint32_t hash = PHI;
317 uint32_t tmp;
318 unsigned l = strlen(s);
319
320 int rem = l & 1;
321 l >>= 1;
322
323 // Main loop
324 for (; l > 0; l--) {
325 hash += (unsigned char)s[0];
326 tmp = ((unsigned char)s[1] << 11) ^ hash;
327 hash = (hash << 16) ^ tmp;
328 s += 2;
329 hash += hash >> 11;
330 }
331
332 // Handle end case
333 if (rem) {
334 hash += (unsigned char)s[0];
335 hash ^= hash << 11;
336 hash += hash >> 17;
337 }
338
339 // Force "avalanching" of final 127 bits
340 hash ^= hash << 3;
341 hash += hash >> 5;
342 hash ^= hash << 2;
343 hash += hash >> 15;
344 hash ^= hash << 10;
345
346 // this avoids ever returning a hash code of 0, since that is used to
347 // signal "hash not computed yet", using a value that is likely to be
348 // effectively the same as 0 when the low bits are masked
349 if (hash == 0)
350 hash = 0x80000000;
351
352 return hash;
353}
354
355// put these early so they can be inlined
356inline int UString::expandedSize(int size, int otherSize) const
357{
358 int s = (size * 11 / 10) + 1 + otherSize;
359 return s;
360}
361
362inline int UString::usedCapacity() const
363{
364 return rep->baseString ? rep->baseString->usedCapacity : rep->usedCapacity;
365}
366
367inline int UString::usedPreCapacity() const
368{
369 return rep->baseString ? rep->baseString->usedPreCapacity : rep->usedPreCapacity;
370}
371
372void UString::expandCapacity(int requiredLength)
373{
374 Rep *r = rep->baseString ? rep->baseString : rep;
375
376 if (requiredLength > r->capacity) {
377 int newCapacity = expandedSize(requiredLength, r->preCapacity);
378 r->buf = static_cast<UChar *>(kjs_fast_realloc(r->buf, newCapacity * sizeof(UChar)));
379 r->capacity = newCapacity - r->preCapacity;
380 }
381 if (requiredLength > r->usedCapacity) {
382 r->usedCapacity = requiredLength;
383 }
384}
385
386void UString::expandPreCapacity(int requiredPreCap)
387{
388 Rep *r = rep->baseString ? rep->baseString : rep;
389
390 if (requiredPreCap > r->preCapacity) {
391 int newCapacity = expandedSize(requiredPreCap, r->capacity);
392 int delta = newCapacity - r->capacity - r->preCapacity;
393
394 UChar *newBuf = static_cast<UChar *>(kjs_fast_malloc(newCapacity * sizeof(UChar)));
395 memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar));
396 kjs_fast_free(r->buf);
397 r->buf = newBuf;
398
399 r->preCapacity = newCapacity - r->capacity;
400 }
401 if (requiredPreCap > r->usedPreCapacity) {
402 r->usedPreCapacity = requiredPreCap;
403 }
404}
405
406
407UString::UString()
408{
409 attach(&Rep::null);
410}
411
412UString::UString(char c)
413{
414 UChar *d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar)));
415 d[0] = c;
416 rep = Rep::create(d, 1);
417}
418
419UString::UString(const char *c)
420{
421 if (!c) {
422 attach(&Rep::null);
423 return;
424 }
425 int length = strlen(c);
426 if (length == 0) {
427 attach(&Rep::empty);
428 return;
429 }
430 UChar *d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * length));
431 for (int i = 0; i < length; i++)
432 d[i].uc = c[i];
433 rep = Rep::create(d, length);
434}
435
436UString::UString(const UChar *c, int length)
437{
438 if (length == 0) {
439 attach(&Rep::empty);
440 return;
441 }
442 rep = Rep::createCopying(c, length);
443}
444
445UString::UString(UChar *c, int length, bool copy)
446{
447 if (length == 0) {
448 attach(&Rep::empty);
449 return;
450 }
451 if (copy) {
452 rep = Rep::createCopying(c, length);
453 } else {
454 rep = Rep::create(c, length);
455 }
456}
457
458UString::UString(const UString &a, const UString &b)
459{
460 int aSize = a.size();
461 int aOffset = a.rep->offset;
462 int bSize = b.size();
463 int bOffset = b.rep->offset;
464 int length = aSize + bSize;
465
466 // possible cases:
467
468 if (aSize == 0) {
469 // a is empty
470 attach(b.rep);
471 } else if (bSize == 0) {
472 // b is empty
473 attach(a.rep);
474 } else if (aOffset + aSize == a.usedCapacity() && 4 * aSize >= bSize &&
475 (-bOffset != b.usedPreCapacity() || aSize >= bSize)) {
476 // - a reaches the end of its buffer so it qualifies for shared append
477 // - also, it's at least a quarter the length of b - appending to a much shorter
478 // string does more harm than good
479 // - however, if b qualifies for prepend and is longer than a, we'd rather prepend
480 UString x(a);
481 x.expandCapacity(aOffset + length);
482 memcpy(const_cast<UChar *>(a.data() + aSize), b.data(), bSize * sizeof(UChar));
483 rep = Rep::create(a.rep, 0, length);
484 } else if (-bOffset == b.usedPreCapacity() && 4 * bSize >= aSize) {
485 // - b reaches the beginning of its buffer so it qualifies for shared prepend
486 // - also, it's at least a quarter the length of a - prepending to a much shorter
487 // string does more harm than good
488 UString y(b);
489 y.expandPreCapacity(-bOffset + aSize);
490 memcpy(const_cast<UChar *>(b.data() - aSize), a.data(), aSize * sizeof(UChar));
491 rep = Rep::create(b.rep, -aSize, length);
492 } else {
493 // a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
494 int newCapacity = expandedSize(length, 0);
495 UChar *d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * newCapacity));
496 memcpy(d, a.data(), aSize * sizeof(UChar));
497 memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
498 rep = Rep::create(d, length);
499 rep->capacity = newCapacity;
500 }
501}
502
503const UString &UString::null()
504{
505 static UString n;
506 return n;
507}
508
509UString UString::from(int i)
510{
511 return from((long)i);
512}
513
514UString UString::from(unsigned int u)
515{
516 UChar buf[20];
517 UChar *end = buf + 20;
518 UChar *p = end;
519
520 if (u == 0) {
521 *--p = '0';
522 } else {
523 while (u) {
524 *--p = (unsigned short)((u % 10) + '0');
525 u /= 10;
526 }
527 }
528
529 return UString(p, end - p);
530}
531
532UString UString::from(long l)
533{
534 UChar buf[20];
535 UChar *end = buf + 20;
536 UChar *p = end;
537
538 if (l == 0) {
539 *--p = '0';
540 } else if (l == LONG_MIN) {
541 char minBuf[20];
542 sprintf(minBuf, "%ld", LONG_MIN);
543 return UString(minBuf);
544 } else {
545 bool negative = false;
546 if (l < 0) {
547 negative = true;
548 l = -l;
549 }
550 while (l) {
551 *--p = (unsigned short)((l % 10) + '0');
552 l /= 10;
553 }
554 if (negative) {
555 *--p = '-';
556 }
557 }
558
559 return UString(p, end - p);
560}
561
562UString UString::from(double d)
563{
564 char buf[80];
565 int decimalPoint;
566 int sign;
567
568 char *result = kjs_dtoa(d, 0, 0, &decimalPoint, &sign, NULL);
569 int length = strlen(result);
570
571 int i = 0;
572 if (sign) {
573 buf[i++] = '-';
574 }
575
576 if (decimalPoint <= 0 && decimalPoint > -6) {
577 buf[i++] = '0';
578 buf[i++] = '.';
579 for (int j = decimalPoint; j < 0; j++) {
580 buf[i++] = '0';
581 }
582 strcpy(buf + i, result);
583 } else if (decimalPoint <= 21 && decimalPoint > 0) {
584 if (length <= decimalPoint) {
585 strcpy(buf + i, result);
586 i += length;
587 for (int j = 0; j < decimalPoint - length; j++) {
588 buf[i++] = '0';
589 }
590 buf[i] = '\0';
591 } else {
592 strncpy(buf + i, result, decimalPoint);
593 i += decimalPoint;
594 buf[i++] = '.';
595 strcpy(buf + i, result + decimalPoint);
596 }
597 } else if (result[0] < '0' || result[0] > '9') {
598 strcpy(buf + i, result);
599 } else {
600 buf[i++] = result[0];
601 if (length > 1) {
602 buf[i++] = '.';
603 strcpy(buf + i, result + 1);
604 i += length - 1;
605 }
606
607 buf[i++] = 'e';
608 buf[i++] = (decimalPoint >= 0) ? '+' : '-';
609 // decimalPoint can't be more than 3 digits decimal given the
610 // nature of float representation
611 int exponential = decimalPoint - 1;
612 if (exponential < 0) {
613 exponential = exponential * -1;
614 }
615 if (exponential >= 100) {
616 buf[i++] = '0' + exponential / 100;
617 }
618 if (exponential >= 10) {
619 buf[i++] = '0' + (exponential % 100) / 10;
620 }
621 buf[i++] = '0' + exponential % 10;
622 buf[i++] = '\0';
623 }
624
625 kjs_freedtoa(result);
626
627 return UString(buf);
628}
629
630UString UString::spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const
631{
632 int totalLength = 0;
633
634 for (int i = 0; i < rangeCount; i++) {
635 totalLength += substringRanges[i].length;
636 }
637 for (int i = 0; i < separatorCount; i++) {
638 totalLength += separators[i].size();
639 }
640
641 UChar *buffer = static_cast<UChar *>(kjs_fast_malloc(totalLength * sizeof(UChar)));
642
643 int maxCount = MAX(rangeCount, separatorCount);
644 int bufferPos = 0;
645 for (int i = 0; i < maxCount; i++) {
646 if (i < rangeCount) {
647 memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar));
648 bufferPos += substringRanges[i].length;
649 }
650 if (i < separatorCount) {
651 memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar));
652 bufferPos += separators[i].size();
653 }
654 }
655
656 UString::Rep *rep = UString::Rep::create(buffer, totalLength);
657 UString result = UString(rep);
658 rep->deref();
659
660 return result;
661}
662
663
664
665UString &UString::append(const UString &t)
666{
667 int thisSize = size();
668 int thisOffset = rep->offset;
669 int tSize = t.size();
670 int length = thisSize + tSize;
671
672 // possible cases:
673 if (thisSize == 0) {
674 // this is empty
675 *this = t;
676 } else if (tSize == 0) {
677 // t is empty
678 } else if (!rep->baseString && rep->rc == 1) {
679 // this is direct and has refcount of 1 (so we can just alter it directly)
680 expandCapacity(thisOffset + length);
681 memcpy(const_cast<UChar *>(data() + thisSize), t.data(), tSize * sizeof(UChar));
682 rep->len = length;
683 rep->_hash = 0;
684 } else if (thisOffset + thisSize == usedCapacity()) {
685 // this reaches the end of the buffer - extend it
686 expandCapacity(thisOffset + length);
687 memcpy(const_cast<UChar *>(data() + thisSize), t.data(), tSize * sizeof(UChar));
688 Rep *newRep = Rep::create(rep, 0, length);
689 release();
690 rep = newRep;
691 } else {
692 // this is shared with someone using more capacity, gotta make a whole new string
693 int newCapacity = expandedSize(length, 0);
694 UChar *d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * newCapacity));
695 memcpy(d, data(), thisSize * sizeof(UChar));
696 memcpy(const_cast<UChar *>(d + thisSize), t.data(), tSize * sizeof(UChar));
697 release();
698 rep = Rep::create(d, length);
699 rep->capacity = newCapacity;
700 }
701
702 return *this;
703}
704
705UString &UString::append(const char *t)
706{
707 int thisSize = size();
708 int thisOffset = rep->offset;
709 int tSize = strlen(t);
710 int length = thisSize + tSize;
711
712 // possible cases:
713 if (thisSize == 0) {
714 // this is empty
715 *this = t;
716 } else if (tSize == 0) {
717 // t is empty, we'll just return *this below.
718 } else if (!rep->baseString && rep->rc == 1) {
719 // this is direct and has refcount of 1 (so we can just alter it directly)
720 expandCapacity(thisOffset + length);
721 UChar *d = const_cast<UChar *>(data());
722 for (int i = 0; i < tSize; ++i)
723 d[thisSize+i] = t[i];
724 rep->len = length;
725 rep->_hash = 0;
726 } else if (thisOffset + thisSize == usedCapacity()) {
727 // this string reaches the end of the buffer - extend it
728 expandCapacity(thisOffset + length);
729 UChar *d = const_cast<UChar *>(data());
730 for (int i = 0; i < tSize; ++i)
731 d[thisSize+i] = t[i];
732 Rep *newRep = Rep::create(rep, 0, length);
733 release();
734 rep = newRep;
735 } else {
736 // this is shared with someone using more capacity, gotta make a whole new string
737 int newCapacity = expandedSize(length, 0);
738 UChar *d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * newCapacity));
739 memcpy(d, data(), thisSize * sizeof(UChar));
740 for (int i = 0; i < tSize; ++i)
741 d[thisSize+i] = t[i];
742 release();
743 rep = Rep::create(d, length);
744 rep->capacity = newCapacity;
745 }
746
747 return *this;
748}
749
750UString &UString::append(unsigned short c)
751{
752 int thisOffset = rep->offset;
753 int length = size();
754
755 // possible cases:
756 if (length == 0) {
757 // this is empty - must make a new rep because we don't want to pollute the shared empty one
758 int newCapacity = expandedSize(1, 0);
759 UChar *d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * newCapacity));
760 d[0] = c;
761 release();
762 rep = Rep::create(d, 1);
763 rep->capacity = newCapacity;
764 } else if (!rep->baseString && rep->rc == 1) {
765 // this is direct and has refcount of 1 (so we can just alter it directly)
766 expandCapacity(thisOffset + length + 1);
767 UChar *d = const_cast<UChar *>(data());
768 d[length] = c;
769 rep->len = length + 1;
770 rep->_hash = 0;
771 } else if (thisOffset + length == usedCapacity()) {
772 // this reaches the end of the string - extend it and share
773 expandCapacity(thisOffset + length + 1);
774 UChar *d = const_cast<UChar *>(data());
775 d[length] = c;
776 Rep *newRep = Rep::create(rep, 0, length + 1);
777 release();
778 rep = newRep;
779 } else {
780 // this is shared with someone using more capacity, gotta make a whole new string
781 int newCapacity = expandedSize((length + 1), 0);
782 UChar *d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * newCapacity));
783 memcpy(d, data(), length * sizeof(UChar));
784 d[length] = c;
785 release();
786 rep = Rep::create(d, length);
787 rep->capacity = newCapacity;
788 }
789
790 return *this;
791}
792
793CString UString::cstring() const
794{
795 return ascii();
796}
797
798char *UString::ascii() const
799{
800 // Never make the buffer smaller than normalStatBufferSize.
801 // Thus we almost never need to reallocate.
802 int length = size();
803 int neededSize = length + 1;
804 if (neededSize < normalStatBufferSize) {
805 neededSize = normalStatBufferSize;
806 }
807 if (neededSize != statBufferSize) {
808 delete [] statBuffer;
809 statBuffer = new char [neededSize];
810 statBufferSize = neededSize;
811 }
812
813 const UChar *p = data();
814 char *q = statBuffer;
815 const UChar *limit = p + length;
816 while (p != limit) {
817 *q = p->uc;
818 ++p;
819 ++q;
820 }
821 *q = '\0';
822
823 return statBuffer;
824}
825
826#ifdef KJS_DEBUG_MEM
827void UString::globalClear()
828{
829 delete [] statBuffer;
830 statBuffer = 0;
831 statBufferSize = 0;
832}
833#endif
834
835UString &UString::operator=(const char *c)
836{
837 int l = c ? strlen(c) : 0;
838 UChar *d;
839 if (rep->rc == 1 && l <= rep->capacity && !rep->baseString && rep->offset == 0 && rep->preCapacity == 0) {
840 d = rep->buf;
841 rep->_hash = 0;
842 } else {
843 release();
844 d = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * l));
845 rep = Rep::create(d, l);
846 }
847 for (int i = 0; i < l; i++)
848 d[i].uc = c[i];
849
850 return *this;
851}
852
853UString &UString::operator=(const UString &str)
854{
855 str.rep->ref();
856 release();
857 rep = str.rep;
858
859 return *this;
860}
861
862bool UString::is8Bit() const
863{
864 const UChar *u = data();
865 const UChar *limit = u + size();
866 while (u < limit) {
867 if (u->uc > 0xFF)
868 return false;
869 ++u;
870 }
871
872 return true;
873}
874
875UChar UString::operator[](int pos) const
876{
877 if (pos >= size())
878 return '\0';
879 return data()[pos];
880}
881
882UCharReference UString::operator[](int pos)
883{
884 /* TODO: boundary check */
885 return UCharReference(this, pos);
886}
887
888double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
889{
890 double d;
891
892 // FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
893 // after the number, so is8Bit is too strict a check.
894 if (!is8Bit())
895 return NaN;
896
897 const char *c = ascii();
898
899 // skip leading white space
900 while (isspace(*c))
901 c++;
902
903 // empty string ?
904 if (*c == '\0')
905 return tolerateEmptyString ? 0.0 : NaN;
906
907 // hex number ?
908 if (*c == '0' && (*(c+1) == 'x' || *(c+1) == 'X')) {
909 c++;
910 d = 0.0;
911 while (*(++c)) {
912 if (*c >= '0' && *c <= '9')
913 d = d * 16.0 + *c - '0';
914 else if ((*c >= 'A' && *c <= 'F') || (*c >= 'a' && *c <= 'f'))
915 d = d * 16.0 + (*c & 0xdf) - 'A' + 10.0;
916 else
917 break;
918 }
919 } else {
920 // regular number ?
921 char *end;
922 d = kjs_strtod(c, &end);
923 if ((d != 0.0 || end != c) && d != HUGE_VAL && d != -HUGE_VAL) {
924 c = end;
925 } else {
926 // infinity ?
927 d = 1.0;
928 if (*c == '+')
929 c++;
930 else if (*c == '-') {
931 d = -1.0;
932 c++;
933 }
934 if (strncmp(c, "Infinity", 8) != 0)
935 return NaN;
936 d = d * Inf;
937 c += 8;
938 }
939 }
940
941 // allow trailing white space
942 while (isspace(*c))
943 c++;
944 // don't allow anything after - unless tolerant=true
945 if (!tolerateTrailingJunk && *c != '\0')
946 d = NaN;
947
948 return d;
949}
950
951double UString::toDouble(bool tolerateTrailingJunk) const
952{
953 return toDouble(tolerateTrailingJunk, true);
954}
955
956double UString::toDouble() const
957{
958 return toDouble(false, true);
959}
960
961unsigned long UString::toULong(bool *ok, bool tolerateEmptyString) const
962{
963 double d = toDouble(false, tolerateEmptyString);
964 bool b = true;
965
966 if (isNaN(d) || d != static_cast<unsigned long>(d)) {
967 b = false;
968 d = 0;
969 }
970
971 if (ok)
972 *ok = b;
973
974 return static_cast<unsigned long>(d);
975}
976
977unsigned long UString::toULong(bool *ok) const
978{
979 return toULong(ok, true);
980}
981
982uint32_t UString::toUInt32(bool *ok) const
983{
984 double d = toDouble();
985 bool b = true;
986
987 if (isNaN(d) || d != static_cast<uint32_t>(d)) {
988 b = false;
989 d = 0;
990 }
991
992 if (ok)
993 *ok = b;
994
995 return static_cast<uint32_t>(d);
996}
997
998uint32_t UString::toStrictUInt32(bool *ok) const
999{
1000 if (ok)
1001 *ok = false;
1002
1003 // Empty string is not OK.
1004 int len = rep->len;
1005 if (len == 0)
1006 return 0;
1007 const UChar *p = rep->data();
1008 unsigned short c = p->unicode();
1009
1010 // If the first digit is 0, only 0 itself is OK.
1011 if (c == '0') {
1012 if (len == 1 && ok)
1013 *ok = true;
1014 return 0;
1015 }
1016
1017 // Convert to UInt32, checking for overflow.
1018 uint32_t i = 0;
1019 while (1) {
1020 // Process character, turning it into a digit.
1021 if (c < '0' || c > '9')
1022 return 0;
1023 const unsigned d = c - '0';
1024
1025 // Multiply by 10, checking for overflow out of 32 bits.
1026 if (i > 0xFFFFFFFFU / 10)
1027 return 0;
1028 i *= 10;
1029
1030 // Add in the digit, checking for overflow out of 32 bits.
1031 const unsigned max = 0xFFFFFFFFU - d;
1032 if (i > max)
1033 return 0;
1034 i += d;
1035
1036 // Handle end of string.
1037 if (--len == 0) {
1038 if (ok)
1039 *ok = true;
1040 return i;
1041 }
1042
1043 // Get next character.
1044 c = (++p)->unicode();
1045 }
1046}
1047
1048// Rule from ECMA 15.2 about what an array index is.
1049// Must exactly match string form of an unsigned integer, and be less than 2^32 - 1.
1050unsigned UString::toArrayIndex(bool *ok) const
1051{
1052 unsigned i = toStrictUInt32(ok);
1053 if (i >= 0xFFFFFFFFU && ok)
1054 *ok = false;
1055 return i;
1056}
1057
1058int UString::find(const UString &f, int pos) const
1059{
1060 int sz = size();
1061 int fsz = f.size();
1062 if (sz < fsz)
1063 return -1;
1064 if (pos < 0)
1065 pos = 0;
1066 if (fsz == 0)
1067 return pos;
1068 const UChar *end = data() + sz - fsz;
1069 long fsizeminusone = (fsz - 1) * sizeof(UChar);
1070 const UChar *fdata = f.data();
1071 unsigned short fchar = fdata->uc;
1072 ++fdata;
1073 for (const UChar *c = data() + pos; c <= end; c++)
1074 if (c->uc == fchar && !memcmp(c + 1, fdata, fsizeminusone))
1075 return (c-data());
1076
1077 return -1;
1078}
1079
1080int UString::find(UChar ch, int pos) const
1081{
1082 if (pos < 0)
1083 pos = 0;
1084 const UChar *end = data() + size();
1085 for (const UChar *c = data() + pos; c < end; c++)
1086 if (*c == ch)
1087 return (c-data());
1088
1089 return -1;
1090}
1091
1092int UString::rfind(const UString &f, int pos) const
1093{
1094 int sz = size();
1095 int fsz = f.size();
1096 if (sz < fsz)
1097 return -1;
1098 if (pos < 0)
1099 pos = 0;
1100 if (pos > sz - fsz)
1101 pos = sz - fsz;
1102 if (fsz == 0)
1103 return pos;
1104 long fsizeminusone = (fsz - 1) * sizeof(UChar);
1105 const UChar *fdata = f.data();
1106 for (const UChar *c = data() + pos; c >= data(); c--) {
1107 if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
1108 return (c-data());
1109 }
1110
1111 return -1;
1112}
1113
1114int UString::rfind(UChar ch, int pos) const
1115{
1116 if (isEmpty())
1117 return -1;
1118 if (pos + 1 >= size())
1119 pos = size() - 1;
1120 for (const UChar *c = data() + pos; c >= data(); c--) {
1121 if (*c == ch)
1122 return (c-data());
1123 }
1124
1125 return -1;
1126}
1127
1128UString UString::substr(int pos, int len) const
1129{
1130 int s = size();
1131
1132 if (pos < 0)
1133 pos = 0;
1134 else if (pos >= s)
1135 pos = s;
1136 if (len < 0)
1137 len = s;
1138 if (pos + len >= s)
1139 len = s - pos;
1140
1141 if (pos == 0 && len == s)
1142 return *this;
1143
1144 UString::Rep *newRep = Rep::create(rep, pos, len);
1145 UString result(newRep);
1146 newRep->deref();
1147
1148 return result;
1149}
1150
1151void UString::detach()
1152{
1153 if (rep->rc > 1 || rep->baseString) {
1154 int l = size();
1155 UChar *n = static_cast<UChar *>(kjs_fast_malloc(sizeof(UChar) * l));
1156 memcpy(n, data(), l * sizeof(UChar));
1157 release();
1158 rep = Rep::create(n, l);
1159 }
1160}
1161
1162bool operator==(const UString& s1, const UString& s2)
1163{
1164 if (s1.rep->len != s2.rep->len)
1165 return false;
1166
1167 return (memcmp(s1.rep->data(), s2.rep->data(),
1168 s1.rep->len * sizeof(UChar)) == 0);
1169}
1170
1171bool operator==(const UString& s1, const char *s2)
1172{
1173 if (s2 == 0) {
1174 return s1.isEmpty();
1175 }
1176
1177 const UChar *u = s1.data();
1178 const UChar *uend = u + s1.size();
1179 while (u != uend && *s2) {
1180 if (u->uc != (unsigned char)*s2)
1181 return false;
1182 s2++;
1183 u++;
1184 }
1185
1186 return u == uend && *s2 == 0;
1187}
1188
1189bool operator<(const UString& s1, const UString& s2)
1190{
1191 const int l1 = s1.size();
1192 const int l2 = s2.size();
1193 const int lmin = l1 < l2 ? l1 : l2;
1194 const UChar *c1 = s1.data();
1195 const UChar *c2 = s2.data();
1196 int l = 0;
1197 while (l < lmin && *c1 == *c2) {
1198 c1++;
1199 c2++;
1200 l++;
1201 }
1202 if (l < lmin)
1203 return (c1->uc < c2->uc);
1204
1205 return (l1 < l2);
1206}
1207
1208int compare(const UString& s1, const UString& s2)
1209{
1210 const int l1 = s1.size();
1211 const int l2 = s2.size();
1212 const int lmin = l1 < l2 ? l1 : l2;
1213 const UChar *c1 = s1.data();
1214 const UChar *c2 = s2.data();
1215 int l = 0;
1216 while (l < lmin && *c1 == *c2) {
1217 c1++;
1218 c2++;
1219 l++;
1220 }
1221 if (l < lmin)
1222 return (c1->uc > c2->uc) ? 1 : -1;
1223
1224 if (l1 == l2) {
1225 return 0;
1226 }
1227 return (l1 < l2) ? 1 : -1;
1228}
1229
1230inline int inlineUTF8SequenceLengthNonASCII(char b0)
1231{
1232 if ((b0 & 0xC0) != 0xC0)
1233 return 0;
1234 if ((b0 & 0xE0) == 0xC0)
1235 return 2;
1236 if ((b0 & 0xF0) == 0xE0)
1237 return 3;
1238 if ((b0 & 0xF8) == 0xF0)
1239 return 4;
1240 return 0;
1241}
1242
1243int UTF8SequenceLengthNonASCII(char b0)
1244{
1245 return inlineUTF8SequenceLengthNonASCII(b0);
1246}
1247
1248inline int inlineUTF8SequenceLength(char b0)
1249{
1250 return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
1251}
1252
1253// Given a first byte, gives the length of the UTF-8 sequence it begins.
1254// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1255// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1256int UTF8SequenceLength(char b0)
1257{
1258 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
1259}
1260
1261// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1262// Only allows Unicode characters (U-00000000 to U-0010FFFF).
1263// Returns -1 if the sequence is not valid (including presence of extra bytes).
1264int decodeUTF8Sequence(const char *sequence)
1265{
1266 // Handle 0-byte sequences (never valid).
1267 const unsigned char b0 = sequence[0];
1268 const int length = inlineUTF8SequenceLength(b0);
1269 if (length == 0)
1270 return -1;
1271
1272 // Handle 1-byte sequences (plain ASCII).
1273 const unsigned char b1 = sequence[1];
1274 if (length == 1) {
1275 if (b1)
1276 return -1;
1277 return b0;
1278 }
1279
1280 // Handle 2-byte sequences.
1281 if ((b1 & 0xC0) != 0x80)
1282 return -1;
1283 const unsigned char b2 = sequence[2];
1284 if (length == 2) {
1285 if (b2)
1286 return -1;
1287 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
1288 if (c < 0x80)
1289 return -1;
1290 return c;
1291 }
1292
1293 // Handle 3-byte sequences.
1294 if ((b2 & 0xC0) != 0x80)
1295 return -1;
1296 const unsigned char b3 = sequence[3];
1297 if (length == 3) {
1298 if (b3)
1299 return -1;
1300 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
1301 if (c < 0x800)
1302 return -1;
1303 // UTF-16 surrogates should never appear in UTF-8 data.
1304 if (c >= 0xD800 && c <= 0xDFFF)
1305 return -1;
1306 // Backwards BOM and U+FFFF should never appear in UTF-8 data.
1307 if (c == 0xFFFE || c == 0xFFFF)
1308 return -1;
1309 return c;
1310 }
1311
1312 // Handle 4-byte sequences.
1313 if ((b3 & 0xC0) != 0x80)
1314 return -1;
1315 const unsigned char b4 = sequence[4];
1316 if (length == 4) {
1317 if (b4)
1318 return -1;
1319 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
1320 if (c < 0x10000 || c > 0x10FFFF)
1321 return -1;
1322 return c;
1323 }
1324
1325 return -1;
1326}
1327
1328CString UString::UTF8String() const
1329{
1330 // Allocate a buffer big enough to hold all the characters.
1331 const int length = size();
1332 const unsigned bufferSize = length * 3;
1333 char fixedSizeBuffer[1024];
1334 char *buffer;
1335 if (bufferSize > sizeof(fixedSizeBuffer)) {
1336 buffer = new char [bufferSize];
1337 } else {
1338 buffer = fixedSizeBuffer;
1339 }
1340
1341 // Convert to runs of 8-bit characters.
1342 char *p = buffer;
1343 const UChar *d = data();
1344 for (int i = 0; i != length; ++i) {
1345 unsigned short c = d[i].unicode();
1346 if (c < 0x80) {
1347 *p++ = (char)c;
1348 } else if (c < 0x800) {
1349 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
1350 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1351 } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
1352 unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
1353 *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
1354 *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
1355 *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1356 *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
1357 ++i;
1358 } else {
1359 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
1360 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1361 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1362 }
1363 }
1364
1365 // Return the result as a C string.
1366 CString result(buffer, p - buffer);
1367 if (buffer != fixedSizeBuffer) {
1368 delete [] buffer;
1369 }
1370 return result;
1371}
1372
1373} // namespace KJS
Note: See TracBrowser for help on using the repository browser.