Context Navigation

ustring.cpp@ 13304

Visit:

Last change on this file since 13304 was 13294, checked in by ggaren, 19 years ago

JavaScriptCore:

Fixed <rdar://problem/4478239> string sort puts "closed" before "close"

Reviewed by Eric.

kjs/ustring.cpp: (KJS::compare): Inverted a < in order to treat the longer string as > the shorter string.

LayoutTests:

Reviewed by Eric.

Layout test for <rdar://problem/4478239> string sort puts "closed"
before "close"

Also changed the engine to report data types when tests fail, so that
you don't get messages like, "should be A, was A."

Updated results for these files:

fast/js/kde/Array-expected.txt:
fast/js/kde/RegExp-expected.txt:
fast/js/kde/encode_decode_uri-expected.txt:
fast/js/resources/js-test-pre.js:

Added these files:

fast/js/string-sort-expected.txt: Added.
fast/js/string-sort.html: Added.

Property allow-tabs set to x
Property svn:eol-style set to native

File size: 31.2 KB

Line
1	// -*- c-basic-offset: 2 -*-
2	/*
3	* This file is part of the KDE libraries
4	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5	* Copyright (C) 2004 Apple Computer, Inc.
6	*
7	* This library is free software; you can redistribute it and/or
8	* modify it under the terms of the GNU Library General Public
9	* License as published by the Free Software Foundation; either
10	* version 2 of the License, or (at your option) any later version.
11	*
12	* This library is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	* Library General Public License for more details.
16	*
17	* You should have received a copy of the GNU Library General Public License
18	* along with this library; see the file COPYING.LIB. If not, write to
19	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20	* Boston, MA 02110-1301, USA.
21	*
22	*/
23
24	#include "config.h"
25	#include "ustring.h"
26
27	#include <assert.h>
28	#include <stdlib.h>
29	#include <stdio.h>
30	#include <ctype.h>
31	#if HAVE(STRING_H)
32	#include <string.h>
33	#endif
34	#if HAVE(STRINGS_H)
35	#include <strings.h>
36	#endif
37
38	#include "operations.h"
39	#include "identifier.h"
40	#include <math.h>
41	#include "dtoa.h"
42
43	#include <kxmlcore/Vector.h>
44
45	using std::max;
46
47	#include <unicode/uchar.h>
48
49	namespace KJS {
50
51	extern const double NaN;
52	extern const double Inf;
53
54	CString::CString(const char *c)
55	{
56	length = strlen(c);
57	data = new char[length+1];
58	memcpy(data, c, length + 1);
59	}
60
61	CString::CString(const char *c, int len)
62	{
63	length = len;
64	data = new char[len+1];
65	memcpy(data, c, len);
66	data[len] = 0;
67	}
68
69	CString::CString(const CString &b)
70	{
71	length = b.length;
72	if (length > 0 && b.data) {
73	data = new char[length+1];
74	memcpy(data, b.data, length + 1);
75	}
76	else {
77	data = 0;
78	}
79	}
80
81	CString::~CString()
82	{
83	delete [] data;
84	}
85
86	CString &CString::append(const CString &t)
87	{
88	char *n;
89	n = new char[length+t.length+1];
90	if (length)
91	memcpy(n, data, length);
92	if (t.length)
93	memcpy(n+length, t.data, t.length);
94	length += t.length;
95	n[length] = 0;
96
97	delete [] data;
98	data = n;
99
100	return *this;
101	}
102
103	CString &CString::operator=(const char *c)
104	{
105	if (data)
106	delete [] data;
107	length = strlen(c);
108	data = new char[length+1];
109	memcpy(data, c, length + 1);
110
111	return *this;
112	}
113
114	CString &CString::operator=(const CString &str)
115	{
116	if (this == &str)
117	return *this;
118
119	if (data)
120	delete [] data;
121	length = str.length;
122	if (length > 0 && str.data) {
123	data = new char[length + 1];
124	memcpy(data, str.data, length + 1);
125	}
126	else {
127	data = 0;
128	}
129
130	return *this;
131	}
132
133	bool operator==(const CString& c1, const CString& c2)
134	{
135	int len = c1.size();
136	return len == c2.size() && (len == 0 \|\| memcmp(c1.c_str(), c2.c_str(), len) == 0);
137	}
138
139	// Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
140	static unsigned short almostUChar;
141	static UChar const nonNullUCharPointer = reinterpret_cast<UChar >(&almostUChar);
142	UString::Rep UString::Rep::null = { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
143	UString::Rep UString::Rep::empty = { 0, 0, 1, 0, 0, 0, nonNullUCharPointer, 0, 0, 0, 0 };
144	const int normalStatBufferSize = 4096;
145	static char *statBuffer = 0;
146	static int statBufferSize = 0;
147
148	UChar UChar::toLower() const
149	{
150	return static_cast<unsigned short>(u_tolower(uc));
151	}
152
153	UChar UChar::toUpper() const
154	{
155	return static_cast<unsigned short>(u_toupper(uc));
156	}
157
158	UCharReference& UCharReference::operator=(UChar c)
159	{
160	str->copyForWriting();
161	if (offset < str->rep()->len)
162	*(str->rep()->data() + offset) = c;
163	/* TODO: lengthen string ? */
164	return *this;
165	}
166
167	UChar& UCharReference::ref() const
168	{
169	if (offset < str->rep()->len)
170	return *(str->rep()->data() + offset);
171	else {
172	static UChar callerBetterNotModifyThis('\0');
173	return callerBetterNotModifyThis;
174	}
175	}
176
177	PassRefPtr<UString::Rep> UString::Rep::createCopying(const UChar *d, int l)
178	{
179	int sizeInBytes = l * sizeof(UChar);
180	UChar copyD = static_cast<UChar >(fastMalloc(sizeInBytes));
181	memcpy(copyD, d, sizeInBytes);
182
183	return create(copyD, l);
184	}
185
186	PassRefPtr<UString::Rep> UString::Rep::create(UChar *d, int l)
187	{
188	Rep *r = new Rep;
189	r->offset = 0;
190	r->len = l;
191	r->rc = 1;
192	r->_hash = 0;
193	r->isIdentifier = 0;
194	r->baseString = 0;
195	r->buf = d;
196	r->usedCapacity = l;
197	r->capacity = l;
198	r->usedPreCapacity = 0;
199	r->preCapacity = 0;
200
201	// steal the single reference this Rep was created with
202	return adoptRef(r);
203	}
204
205	PassRefPtr<UString::Rep> UString::Rep::create(PassRefPtr<Rep> base, int offset, int length)
206	{
207	assert(base);
208
209	int baseOffset = base->offset;
210
211	if (base->baseString) {
212	base = base->baseString;
213	}
214
215	assert(-(offset + baseOffset) <= base->usedPreCapacity);
216	assert(offset + baseOffset + length <= base->usedCapacity);
217
218	Rep *r = new Rep;
219	r->offset = baseOffset + offset;
220	r->len = length;
221	r->rc = 1;
222	r->_hash = 0;
223	r->isIdentifier = 0;
224	r->baseString = base.release();
225	r->buf = 0;
226	r->usedCapacity = 0;
227	r->capacity = 0;
228	r->usedPreCapacity = 0;
229	r->preCapacity = 0;
230
231	// steal the single reference this Rep was created with
232	return adoptRef(r);
233	}
234
235	void UString::Rep::destroy()
236	{
237	if (isIdentifier)
238	Identifier::remove(this);
239	if (baseString) {
240	baseString->deref();
241	} else {
242	fastFree(buf);
243	}
244	delete this;
245	}
246
// Seed for the string hash functions. Derived from the golden ratio; any
// arbitrary non-zero start value would do, it just keeps all-zero input
// from hashing to all zeros.
const unsigned PHI = 0x9e3779b9U;
250
251	// Paul Hsieh's SuperFastHash
252	// http://www.azillionmonkeys.com/qed/hash.html
253	unsigned UString::Rep::computeHash(const UChar *s, int len)
254	{
255	unsigned l = len;
256	uint32_t hash = PHI;
257	uint32_t tmp;
258
259	int rem = l & 1;
260	l >>= 1;
261
262	// Main loop
263	for (; l > 0; l--) {
264	hash += s[0].uc;
265	tmp = (s[1].uc << 11) ^ hash;
266	hash = (hash << 16) ^ tmp;
267	s += 2;
268	hash += hash >> 11;
269	}
270
271	// Handle end case
272	if (rem) {
273	hash += s[0].uc;
274	hash ^= hash << 11;
275	hash += hash >> 17;
276	}
277
278	// Force "avalanching" of final 127 bits
279	hash ^= hash << 3;
280	hash += hash >> 5;
281	hash ^= hash << 2;
282	hash += hash >> 15;
283	hash ^= hash << 10;
284
285	// this avoids ever returning a hash code of 0, since that is used to
286	// signal "hash not computed yet", using a value that is likely to be
287	// effectively the same as 0 when the low bits are masked
288	if (hash == 0)
289	hash = 0x80000000;
290
291	return hash;
292	}
293
294	// Paul Hsieh's SuperFastHash
295	// http://www.azillionmonkeys.com/qed/hash.html
296	unsigned UString::Rep::computeHash(const char *s)
297	{
298	// This hash is designed to work on 16-bit chunks at a time. But since the normal case
299	// (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
300	// were 16-bit chunks, which should give matching results
301
302	uint32_t hash = PHI;
303	uint32_t tmp;
304	unsigned l = strlen(s);
305
306	int rem = l & 1;
307	l >>= 1;
308
309	// Main loop
310	for (; l > 0; l--) {
311	hash += (unsigned char)s[0];
312	tmp = ((unsigned char)s[1] << 11) ^ hash;
313	hash = (hash << 16) ^ tmp;
314	s += 2;
315	hash += hash >> 11;
316	}
317
318	// Handle end case
319	if (rem) {
320	hash += (unsigned char)s[0];
321	hash ^= hash << 11;
322	hash += hash >> 17;
323	}
324
325	// Force "avalanching" of final 127 bits
326	hash ^= hash << 3;
327	hash += hash >> 5;
328	hash ^= hash << 2;
329	hash += hash >> 15;
330	hash ^= hash << 10;
331
332	// this avoids ever returning a hash code of 0, since that is used to
333	// signal "hash not computed yet", using a value that is likely to be
334	// effectively the same as 0 when the low bits are masked
335	if (hash == 0)
336	hash = 0x80000000;
337
338	return hash;
339	}
340
341	// put these early so they can be inlined
342	inline int UString::expandedSize(int size, int otherSize) const
343	{
344	int s = (size * 11 / 10) + 1 + otherSize;
345	return s;
346	}
347
348	inline int UString::usedCapacity() const
349	{
350	return m_rep->baseString ? m_rep->baseString->usedCapacity : m_rep->usedCapacity;
351	}
352
353	inline int UString::usedPreCapacity() const
354	{
355	return m_rep->baseString ? m_rep->baseString->usedPreCapacity : m_rep->usedPreCapacity;
356	}
357
358	void UString::expandCapacity(int requiredLength)
359	{
360	Rep *r = m_rep->baseString ? m_rep->baseString : rep();
361
362	if (requiredLength > r->capacity) {
363	int newCapacity = expandedSize(requiredLength, r->preCapacity);
364	r->buf = static_cast<UChar >(fastRealloc(r->buf, newCapacity sizeof(UChar)));
365	r->capacity = newCapacity - r->preCapacity;
366	}
367	if (requiredLength > r->usedCapacity) {
368	r->usedCapacity = requiredLength;
369	}
370	}
371
372	void UString::expandPreCapacity(int requiredPreCap)
373	{
374	Rep *r = m_rep->baseString ? m_rep->baseString : rep();
375
376	if (requiredPreCap > r->preCapacity) {
377	int newCapacity = expandedSize(requiredPreCap, r->capacity);
378	int delta = newCapacity - r->capacity - r->preCapacity;
379
380	UChar newBuf = static_cast<UChar >(fastMalloc(newCapacity * sizeof(UChar)));
381	memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar));
382	fastFree(r->buf);
383	r->buf = newBuf;
384
385	r->preCapacity = newCapacity - r->capacity;
386	}
387	if (requiredPreCap > r->usedPreCapacity) {
388	r->usedPreCapacity = requiredPreCap;
389	}
390	}
391
392
393	UString::UString(char c)
394	{
395	UChar d = static_cast<UChar >(fastMalloc(sizeof(UChar)));
396	d[0] = c;
397	m_rep = Rep::create(d, 1);
398	}
399
400	UString::UString(const char *c)
401	{
402	if (!c) {
403	m_rep = &Rep::null;
404	return;
405	}
406	int length = strlen(c);
407	if (length == 0) {
408	m_rep = &Rep::empty;
409	return;
410	}
411	UChar d = static_cast<UChar >(fastMalloc(sizeof(UChar) * length));
412	for (int i = 0; i < length; i++)
413	d[i].uc = c[i];
414	m_rep = Rep::create(d, length);
415	}
416
417	UString::UString(const UChar *c, int length)
418	{
419	if (length == 0)
420	m_rep = &Rep::empty;
421	else
422	m_rep = Rep::createCopying(c, length);
423	}
424
425	UString::UString(UChar *c, int length, bool copy)
426	{
427	if (length == 0)
428	m_rep = &Rep::empty;
429	else if (copy)
430	m_rep = Rep::createCopying(c, length);
431	else
432	m_rep = Rep::create(c, length);
433	}
434
435	UString::UString(const UString &a, const UString &b)
436	{
437	int aSize = a.size();
438	int aOffset = a.m_rep->offset;
439	int bSize = b.size();
440	int bOffset = b.m_rep->offset;
441	int length = aSize + bSize;
442
443	// possible cases:
444
445	if (aSize == 0) {
446	// a is empty
447	m_rep = b.m_rep;
448	} else if (bSize == 0) {
449	// b is empty
450	m_rep = a.m_rep;
451	} else if (aOffset + aSize == a.usedCapacity() && 4 * aSize >= bSize &&
452	(-bOffset != b.usedPreCapacity() \|\| aSize >= bSize)) {
453	// - a reaches the end of its buffer so it qualifies for shared append
454	// - also, it's at least a quarter the length of b - appending to a much shorter
455	// string does more harm than good
456	// - however, if b qualifies for prepend and is longer than a, we'd rather prepend
457	UString x(a);
458	x.expandCapacity(aOffset + length);
459	memcpy(const_cast<UChar >(a.data() + aSize), b.data(), bSize sizeof(UChar));
460	m_rep = Rep::create(a.m_rep, 0, length);
461	} else if (-bOffset == b.usedPreCapacity() && 4 * bSize >= aSize) {
462	// - b reaches the beginning of its buffer so it qualifies for shared prepend
463	// - also, it's at least a quarter the length of a - prepending to a much shorter
464	// string does more harm than good
465	UString y(b);
466	y.expandPreCapacity(-bOffset + aSize);
467	memcpy(const_cast<UChar >(b.data() - aSize), a.data(), aSize sizeof(UChar));
468	m_rep = Rep::create(b.m_rep, -aSize, length);
469	} else {
470	// a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
471	int newCapacity = expandedSize(length, 0);
472	UChar d = static_cast<UChar >(fastMalloc(sizeof(UChar) * newCapacity));
473	memcpy(d, a.data(), aSize * sizeof(UChar));
474	memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
475	m_rep = Rep::create(d, length);
476	m_rep->capacity = newCapacity;
477	}
478	}
479
480	const UString &UString::null()
481	{
482	static UString n;
483	return n;
484	}
485
486	UString UString::from(int i)
487	{
488	UChar buf[1 + sizeof(i) * 3];
489	UChar *end = buf + sizeof(buf) / sizeof(UChar);
490	UChar *p = end;
491
492	if (i == 0) {
493	*--p = '0';
494	} else if (i == INT_MIN) {
495	char minBuf[1 + sizeof(i) * 3];
496	sprintf(minBuf, "%d", INT_MIN);
497	return UString(minBuf);
498	} else {
499	bool negative = false;
500	if (i < 0) {
501	negative = true;
502	i = -i;
503	}
504	while (i) {
505	*--p = (unsigned short)((i % 10) + '0');
506	i /= 10;
507	}
508	if (negative) {
509	*--p = '-';
510	}
511	}
512
513	return UString(p, end - p);
514	}
515
516	UString UString::from(unsigned int u)
517	{
518	UChar buf[sizeof(u) * 3];
519	UChar *end = buf + sizeof(buf) / sizeof(UChar);
520	UChar *p = end;
521
522	if (u == 0) {
523	*--p = '0';
524	} else {
525	while (u) {
526	*--p = (unsigned short)((u % 10) + '0');
527	u /= 10;
528	}
529	}
530
531	return UString(p, end - p);
532	}
533
534	UString UString::from(long l)
535	{
536	UChar buf[1 + sizeof(l) * 3];
537	UChar *end = buf + sizeof(buf) / sizeof(UChar);
538	UChar *p = end;
539
540	if (l == 0) {
541	*--p = '0';
542	} else if (l == LONG_MIN) {
543	char minBuf[1 + sizeof(l) * 3];
544	sprintf(minBuf, "%ld", LONG_MIN);
545	return UString(minBuf);
546	} else {
547	bool negative = false;
548	if (l < 0) {
549	negative = true;
550	l = -l;
551	}
552	while (l) {
553	*--p = (unsigned short)((l % 10) + '0');
554	l /= 10;
555	}
556	if (negative) {
557	*--p = '-';
558	}
559	}
560
561	return UString(p, end - p);
562	}
563
564	UString UString::from(double d)
565	{
566	char buf[80];
567	int decimalPoint;
568	int sign;
569
570	char *result = kjs_dtoa(d, 0, 0, &decimalPoint, &sign, NULL);
571	int length = strlen(result);
572
573	int i = 0;
574	if (sign) {
575	buf[i++] = '-';
576	}
577
578	if (decimalPoint <= 0 && decimalPoint > -6) {
579	buf[i++] = '0';
580	buf[i++] = '.';
581	for (int j = decimalPoint; j < 0; j++) {
582	buf[i++] = '0';
583	}
584	strcpy(buf + i, result);
585	} else if (decimalPoint <= 21 && decimalPoint > 0) {
586	if (length <= decimalPoint) {
587	strcpy(buf + i, result);
588	i += length;
589	for (int j = 0; j < decimalPoint - length; j++) {
590	buf[i++] = '0';
591	}
592	buf[i] = '\0';
593	} else {
594	strncpy(buf + i, result, decimalPoint);
595	i += decimalPoint;
596	buf[i++] = '.';
597	strcpy(buf + i, result + decimalPoint);
598	}
599	} else if (result[0] < '0' \|\| result[0] > '9') {
600	strcpy(buf + i, result);
601	} else {
602	buf[i++] = result[0];
603	if (length > 1) {
604	buf[i++] = '.';
605	strcpy(buf + i, result + 1);
606	i += length - 1;
607	}
608
609	buf[i++] = 'e';
610	buf[i++] = (decimalPoint >= 0) ? '+' : '-';
611	// decimalPoint can't be more than 3 digits decimal given the
612	// nature of float representation
613	int exponential = decimalPoint - 1;
614	if (exponential < 0) {
615	exponential = exponential * -1;
616	}
617	if (exponential >= 100) {
618	buf[i++] = '0' + exponential / 100;
619	}
620	if (exponential >= 10) {
621	buf[i++] = '0' + (exponential % 100) / 10;
622	}
623	buf[i++] = '0' + exponential % 10;
624	buf[i++] = '\0';
625	}
626
627	kjs_freedtoa(result);
628
629	return UString(buf);
630	}
631
632	UString UString::spliceSubstringsWithSeparators(const Range substringRanges, int rangeCount, const UString separators, int separatorCount) const
633	{
634	int totalLength = 0;
635
636	for (int i = 0; i < rangeCount; i++) {
637	totalLength += substringRanges[i].length;
638	}
639	for (int i = 0; i < separatorCount; i++) {
640	totalLength += separators[i].size();
641	}
642
643	UChar buffer = static_cast<UChar >(fastMalloc(totalLength * sizeof(UChar)));
644
645	int maxCount = max(rangeCount, separatorCount);
646	int bufferPos = 0;
647	for (int i = 0; i < maxCount; i++) {
648	if (i < rangeCount) {
649	memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar));
650	bufferPos += substringRanges[i].length;
651	}
652	if (i < separatorCount) {
653	memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar));
654	bufferPos += separators[i].size();
655	}
656	}
657
658	return UString(UString::Rep::create(buffer, totalLength));
659	}
660
661
662
663	UString &UString::append(const UString &t)
664	{
665	int thisSize = size();
666	int thisOffset = m_rep->offset;
667	int tSize = t.size();
668	int length = thisSize + tSize;
669
670	// possible cases:
671	if (thisSize == 0) {
672	// this is empty
673	*this = t;
674	} else if (tSize == 0) {
675	// t is empty
676	} else if (!m_rep->baseString && m_rep->rc == 1) {
677	// this is direct and has refcount of 1 (so we can just alter it directly)
678	expandCapacity(thisOffset + length);
679	memcpy(const_cast<UChar >(data() + thisSize), t.data(), tSize sizeof(UChar));
680	m_rep->len = length;
681	m_rep->_hash = 0;
682	} else if (thisOffset + thisSize == usedCapacity()) {
683	// this reaches the end of the buffer - extend it
684	expandCapacity(thisOffset + length);
685	memcpy(const_cast<UChar >(data() + thisSize), t.data(), tSize sizeof(UChar));
686	m_rep = Rep::create(m_rep, 0, length);
687	} else {
688	// this is shared with someone using more capacity, gotta make a whole new string
689	int newCapacity = expandedSize(length, 0);
690	UChar d = static_cast<UChar >(fastMalloc(sizeof(UChar) * newCapacity));
691	memcpy(d, data(), thisSize * sizeof(UChar));
692	memcpy(const_cast<UChar >(d + thisSize), t.data(), tSize sizeof(UChar));
693	m_rep = Rep::create(d, length);
694	m_rep->capacity = newCapacity;
695	}
696
697	return *this;
698	}
699
700	UString &UString::append(const char *t)
701	{
702	int thisSize = size();
703	int thisOffset = m_rep->offset;
704	int tSize = strlen(t);
705	int length = thisSize + tSize;
706
707	// possible cases:
708	if (thisSize == 0) {
709	// this is empty
710	*this = t;
711	} else if (tSize == 0) {
712	// t is empty, we'll just return *this below.
713	} else if (!m_rep->baseString && m_rep->rc == 1) {
714	// this is direct and has refcount of 1 (so we can just alter it directly)
715	expandCapacity(thisOffset + length);
716	UChar d = const_cast<UChar >(data());
717	for (int i = 0; i < tSize; ++i)
718	d[thisSize+i] = t[i];
719	m_rep->len = length;
720	m_rep->_hash = 0;
721	} else if (thisOffset + thisSize == usedCapacity()) {
722	// this string reaches the end of the buffer - extend it
723	expandCapacity(thisOffset + length);
724	UChar d = const_cast<UChar >(data());
725	for (int i = 0; i < tSize; ++i)
726	d[thisSize+i] = t[i];
727	m_rep = Rep::create(m_rep, 0, length);
728	} else {
729	// this is shared with someone using more capacity, gotta make a whole new string
730	int newCapacity = expandedSize(length, 0);
731	UChar d = static_cast<UChar >(fastMalloc(sizeof(UChar) * newCapacity));
732	memcpy(d, data(), thisSize * sizeof(UChar));
733	for (int i = 0; i < tSize; ++i)
734	d[thisSize+i] = t[i];
735	m_rep = Rep::create(d, length);
736	m_rep->capacity = newCapacity;
737	}
738
739	return *this;
740	}
741
742	UString &UString::append(unsigned short c)
743	{
744	int thisOffset = m_rep->offset;
745	int length = size();
746
747	// possible cases:
748	if (length == 0) {
749	// this is empty - must make a new m_rep because we don't want to pollute the shared empty one
750	int newCapacity = expandedSize(1, 0);
751	UChar d = static_cast<UChar >(fastMalloc(sizeof(UChar) * newCapacity));
752	d[0] = c;
753	m_rep = Rep::create(d, 1);
754	m_rep->capacity = newCapacity;
755	} else if (!m_rep->baseString && m_rep->rc == 1) {
756	// this is direct and has refcount of 1 (so we can just alter it directly)
757	expandCapacity(thisOffset + length + 1);
758	UChar d = const_cast<UChar >(data());
759	d[length] = c;
760	m_rep->len = length + 1;
761	m_rep->_hash = 0;
762	} else if (thisOffset + length == usedCapacity()) {
763	// this reaches the end of the string - extend it and share
764	expandCapacity(thisOffset + length + 1);
765	UChar d = const_cast<UChar >(data());
766	d[length] = c;
767	m_rep = Rep::create(m_rep, 0, length + 1);
768	} else {
769	// this is shared with someone using more capacity, gotta make a whole new string
770	int newCapacity = expandedSize((length + 1), 0);
771	UChar d = static_cast<UChar >(fastMalloc(sizeof(UChar) * newCapacity));
772	memcpy(d, data(), length * sizeof(UChar));
773	d[length] = c;
774	m_rep = Rep::create(d, length);
775	m_rep->capacity = newCapacity;
776	}
777
778	return *this;
779	}
780
781	CString UString::cstring() const
782	{
783	return ascii();
784	}
785
786	char *UString::ascii() const
787	{
788	// Never make the buffer smaller than normalStatBufferSize.
789	// Thus we almost never need to reallocate.
790	int length = size();
791	int neededSize = length + 1;
792	if (neededSize < normalStatBufferSize) {
793	neededSize = normalStatBufferSize;
794	}
795	if (neededSize != statBufferSize) {
796	delete [] statBuffer;
797	statBuffer = new char [neededSize];
798	statBufferSize = neededSize;
799	}
800
801	const UChar *p = data();
802	char *q = statBuffer;
803	const UChar *limit = p + length;
804	while (p != limit) {
805	*q = p->uc;
806	++p;
807	++q;
808	}
809	*q = '\0';
810
811	return statBuffer;
812	}
813
#ifdef KJS_DEBUG_MEM
// Frees the scratch buffer used by ascii() and resets its bookkeeping.
// Only compiled when KJS_DEBUG_MEM is defined.
void UString::globalClear()
{
  delete [] statBuffer;
  statBufferSize = 0;
  statBuffer = 0;
}
#endif
822
823	UString &UString::operator=(const char *c)
824	{
825	int l = c ? strlen(c) : 0;
826	UChar *d;
827	if (m_rep->rc == 1 && l <= m_rep->capacity && !m_rep->baseString && m_rep->offset == 0 && m_rep->preCapacity == 0) {
828	d = m_rep->buf;
829	m_rep->_hash = 0;
830	} else {
831	d = static_cast<UChar >(fastMalloc(sizeof(UChar) l));
832	m_rep = Rep::create(d, l);
833	}
834	for (int i = 0; i < l; i++)
835	d[i].uc = c[i];
836
837	return *this;
838	}
839
840	bool UString::is8Bit() const
841	{
842	const UChar *u = data();
843	const UChar *limit = u + size();
844	while (u < limit) {
845	if (u->uc > 0xFF)
846	return false;
847	++u;
848	}
849
850	return true;
851	}
852
853	UChar UString::operator[](int pos) const
854	{
855	if (pos >= size())
856	return '\0';
857	return data()[pos];
858	}
859
860	UCharReference UString::operator[](int pos)
861	{
862	/* TODO: boundary check */
863	return UCharReference(this, pos);
864	}
865
866	double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
867	{
868	double d;
869
870	// FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
871	// after the number, so is8Bit is too strict a check.
872	if (!is8Bit())
873	return NaN;
874
875	const char *c = ascii();
876
877	// skip leading white space
878	while (isspace(*c))
879	c++;
880
881	// empty string ?
882	if (*c == '\0')
883	return tolerateEmptyString ? 0.0 : NaN;
884
885	// hex number ?
886	if (c == '0' && ((c+1) == 'x' \|\| *(c+1) == 'X')) {
887	c++;
888	d = 0.0;
889	while (*(++c)) {
890	if (c >= '0' && c <= '9')
891	d = d * 16.0 + *c - '0';
892	else if ((c >= 'A' && c <= 'F') \|\| (c >= 'a' && c <= 'f'))
893	d = d * 16.0 + (*c & 0xdf) - 'A' + 10.0;
894	else
895	break;
896	}
897	} else {
898	// regular number ?
899	char *end;
900	d = kjs_strtod(c, &end);
901	if ((d != 0.0 \|\| end != c) && d != HUGE_VAL && d != -HUGE_VAL) {
902	c = end;
903	} else {
904	// infinity ?
905	d = 1.0;
906	if (*c == '+')
907	c++;
908	else if (*c == '-') {
909	d = -1.0;
910	c++;
911	}
912	if (strncmp(c, "Infinity", 8) != 0)
913	return NaN;
914	d = d * Inf;
915	c += 8;
916	}
917	}
918
919	// allow trailing white space
920	while (isspace(*c))
921	c++;
922	// don't allow anything after - unless tolerant=true
923	if (!tolerateTrailingJunk && *c != '\0')
924	d = NaN;
925
926	return d;
927	}
928
929	double UString::toDouble(bool tolerateTrailingJunk) const
930	{
931	return toDouble(tolerateTrailingJunk, true);
932	}
933
934	double UString::toDouble() const
935	{
936	return toDouble(false, true);
937	}
938
939	uint32_t UString::toUInt32(bool *ok) const
940	{
941	double d = toDouble();
942	bool b = true;
943
944	if (d != static_cast<uint32_t>(d)) {
945	b = false;
946	d = 0;
947	}
948
949	if (ok)
950	*ok = b;
951
952	return static_cast<uint32_t>(d);
953	}
954
955	uint32_t UString::toUInt32(bool *ok, bool tolerateEmptyString) const
956	{
957	double d = toDouble(false, tolerateEmptyString);
958	bool b = true;
959
960	if (d != static_cast<uint32_t>(d)) {
961	b = false;
962	d = 0;
963	}
964
965	if (ok)
966	*ok = b;
967
968	return static_cast<uint32_t>(d);
969	}
970
971	uint32_t UString::toStrictUInt32(bool *ok) const
972	{
973	if (ok)
974	*ok = false;
975
976	// Empty string is not OK.
977	int len = m_rep->len;
978	if (len == 0)
979	return 0;
980	const UChar *p = m_rep->data();
981	unsigned short c = p->unicode();
982
983	// If the first digit is 0, only 0 itself is OK.
984	if (c == '0') {
985	if (len == 1 && ok)
986	*ok = true;
987	return 0;
988	}
989
990	// Convert to UInt32, checking for overflow.
991	uint32_t i = 0;
992	while (1) {
993	// Process character, turning it into a digit.
994	if (c < '0' \|\| c > '9')
995	return 0;
996	const unsigned d = c - '0';
997
998	// Multiply by 10, checking for overflow out of 32 bits.
999	if (i > 0xFFFFFFFFU / 10)
1000	return 0;
1001	i *= 10;
1002
1003	// Add in the digit, checking for overflow out of 32 bits.
1004	const unsigned max = 0xFFFFFFFFU - d;
1005	if (i > max)
1006	return 0;
1007	i += d;
1008
1009	// Handle end of string.
1010	if (--len == 0) {
1011	if (ok)
1012	*ok = true;
1013	return i;
1014	}
1015
1016	// Get next character.
1017	c = (++p)->unicode();
1018	}
1019	}
1020
1021	int UString::find(const UString &f, int pos) const
1022	{
1023	int sz = size();
1024	int fsz = f.size();
1025	if (sz < fsz)
1026	return -1;
1027	if (pos < 0)
1028	pos = 0;
1029	if (fsz == 0)
1030	return pos;
1031	const UChar *end = data() + sz - fsz;
1032	int fsizeminusone = (fsz - 1) * sizeof(UChar);
1033	const UChar *fdata = f.data();
1034	unsigned short fchar = fdata->uc;
1035	++fdata;
1036	for (const UChar *c = data() + pos; c <= end; c++)
1037	if (c->uc == fchar && !memcmp(c + 1, fdata, fsizeminusone))
1038	return (c-data());
1039
1040	return -1;
1041	}
1042
1043	int UString::find(UChar ch, int pos) const
1044	{
1045	if (pos < 0)
1046	pos = 0;
1047	const UChar *end = data() + size();
1048	for (const UChar *c = data() + pos; c < end; c++)
1049	if (*c == ch)
1050	return (c-data());
1051
1052	return -1;
1053	}
1054
1055	int UString::rfind(const UString &f, int pos) const
1056	{
1057	int sz = size();
1058	int fsz = f.size();
1059	if (sz < fsz)
1060	return -1;
1061	if (pos < 0)
1062	pos = 0;
1063	if (pos > sz - fsz)
1064	pos = sz - fsz;
1065	if (fsz == 0)
1066	return pos;
1067	int fsizeminusone = (fsz - 1) * sizeof(UChar);
1068	const UChar *fdata = f.data();
1069	for (const UChar *c = data() + pos; c >= data(); c--) {
1070	if (c == fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
1071	return (c-data());
1072	}
1073
1074	return -1;
1075	}
1076
1077	int UString::rfind(UChar ch, int pos) const
1078	{
1079	if (isEmpty())
1080	return -1;
1081	if (pos + 1 >= size())
1082	pos = size() - 1;
1083	for (const UChar *c = data() + pos; c >= data(); c--) {
1084	if (*c == ch)
1085	return (c-data());
1086	}
1087
1088	return -1;
1089	}
1090
1091	UString UString::substr(int pos, int len) const
1092	{
1093	int s = size();
1094
1095	if (pos < 0)
1096	pos = 0;
1097	else if (pos >= s)
1098	pos = s;
1099	if (len < 0)
1100	len = s;
1101	if (pos + len >= s)
1102	len = s - pos;
1103
1104	if (pos == 0 && len == s)
1105	return *this;
1106
1107	return UString(Rep::create(m_rep, pos, len));
1108	}
1109
1110	void UString::copyForWriting()
1111	{
1112	if (m_rep->rc > 1 \|\| m_rep->baseString) {
1113	int l = size();
1114	UChar n = static_cast<UChar >(fastMalloc(sizeof(UChar) * l));
1115	memcpy(n, data(), l * sizeof(UChar));
1116	m_rep = Rep::create(n, l);
1117	}
1118	}
1119
1120	bool operator==(const UString& s1, const UString& s2)
1121	{
1122	if (s1.m_rep->len != s2.m_rep->len)
1123	return false;
1124
1125	return (memcmp(s1.m_rep->data(), s2.m_rep->data(),
1126	s1.m_rep->len * sizeof(UChar)) == 0);
1127	}
1128
1129	bool operator==(const UString& s1, const char *s2)
1130	{
1131	if (s2 == 0) {
1132	return s1.isEmpty();
1133	}
1134
1135	const UChar *u = s1.data();
1136	const UChar *uend = u + s1.size();
1137	while (u != uend && *s2) {
1138	if (u->uc != (unsigned char)*s2)
1139	return false;
1140	s2++;
1141	u++;
1142	}
1143
1144	return u == uend && *s2 == 0;
1145	}
1146
1147	bool operator<(const UString& s1, const UString& s2)
1148	{
1149	const int l1 = s1.size();
1150	const int l2 = s2.size();
1151	const int lmin = l1 < l2 ? l1 : l2;
1152	const UChar *c1 = s1.data();
1153	const UChar *c2 = s2.data();
1154	int l = 0;
1155	while (l < lmin && c1 == c2) {
1156	c1++;
1157	c2++;
1158	l++;
1159	}
1160	if (l < lmin)
1161	return (c1->uc < c2->uc);
1162
1163	return (l1 < l2);
1164	}
1165
1166	int compare(const UString& s1, const UString& s2)
1167	{
1168	const int l1 = s1.size();
1169	const int l2 = s2.size();
1170	const int lmin = l1 < l2 ? l1 : l2;
1171	const UChar *c1 = s1.data();
1172	const UChar *c2 = s2.data();
1173	int l = 0;
1174	while (l < lmin && c1 == c2) {
1175	c1++;
1176	c2++;
1177	l++;
1178	}
1179
1180	if (l < lmin)
1181	return (c1->uc > c2->uc) ? 1 : -1;
1182
1183	if (l1 == l2)
1184	return 0;
1185
1186	return (l1 > l2) ? 1 : -1;
1187	}
1188
// Given the first byte of a non-ASCII UTF-8 sequence, returns the sequence
// length (2, 3 or 4). Returns 0 for any byte that is not a valid lead byte of
// a 2- to 4-byte sequence, which includes ASCII and continuation bytes.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
  // Lead bytes of multi-byte sequences all have their two top bits set.
  const bool topTwoBitsSet = (b0 & 0xC0) == 0xC0;
  if (topTwoBitsSet) {
    if ((b0 & 0xE0) == 0xC0) // 110xxxxx
      return 2;
    if ((b0 & 0xF0) == 0xE0) // 1110xxxx
      return 3;
    if ((b0 & 0xF8) == 0xF0) // 11110xxx
      return 4;
  }
  return 0;
}
1201
1202	int UTF8SequenceLengthNonASCII(char b0)
1203	{
1204	return inlineUTF8SequenceLengthNonASCII(b0);
1205	}
1206
1207	inline int inlineUTF8SequenceLength(char b0)
1208	{
1209	return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
1210	}
1211
1212	// Given a first byte, gives the length of the UTF-8 sequence it begins.
1213	// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1214	// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1215	int UTF8SequenceLength(char b0)
1216	{
1217	return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
1218	}
1219
1220	// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1221	// Only allows Unicode characters (U-00000000 to U-0010FFFF).
1222	// Returns -1 if the sequence is not valid (including presence of extra bytes).
1223	int decodeUTF8Sequence(const char *sequence)
1224	{
1225	// Handle 0-byte sequences (never valid).
1226	const unsigned char b0 = sequence[0];
1227	const int length = inlineUTF8SequenceLength(b0);
1228	if (length == 0)
1229	return -1;
1230
1231	// Handle 1-byte sequences (plain ASCII).
1232	const unsigned char b1 = sequence[1];
1233	if (length == 1) {
1234	if (b1)
1235	return -1;
1236	return b0;
1237	}
1238
1239	// Handle 2-byte sequences.
1240	if ((b1 & 0xC0) != 0x80)
1241	return -1;
1242	const unsigned char b2 = sequence[2];
1243	if (length == 2) {
1244	if (b2)
1245	return -1;
1246	const int c = ((b0 & 0x1F) << 6) \| (b1 & 0x3F);
1247	if (c < 0x80)
1248	return -1;
1249	return c;
1250	}
1251
1252	// Handle 3-byte sequences.
1253	if ((b2 & 0xC0) != 0x80)
1254	return -1;
1255	const unsigned char b3 = sequence[3];
1256	if (length == 3) {
1257	if (b3)
1258	return -1;
1259	const int c = ((b0 & 0xF) << 12) \| ((b1 & 0x3F) << 6) \| (b2 & 0x3F);
1260	if (c < 0x800)
1261	return -1;
1262	// UTF-16 surrogates should never appear in UTF-8 data.
1263	if (c >= 0xD800 && c <= 0xDFFF)
1264	return -1;
1265	// Backwards BOM and U+FFFF should never appear in UTF-8 data.
1266	if (c == 0xFFFE \|\| c == 0xFFFF)
1267	return -1;
1268	return c;
1269	}
1270
1271	// Handle 4-byte sequences.
1272	if ((b3 & 0xC0) != 0x80)
1273	return -1;
1274	const unsigned char b4 = sequence[4];
1275	if (length == 4) {
1276	if (b4)
1277	return -1;
1278	const int c = ((b0 & 0x7) << 18) \| ((b1 & 0x3F) << 12) \| ((b2 & 0x3F) << 6) \| (b3 & 0x3F);
1279	if (c < 0x10000 \|\| c > 0x10FFFF)
1280	return -1;
1281	return c;
1282	}
1283
1284	return -1;
1285	}
1286
1287	CString UString::UTF8String() const
1288	{
1289	// Allocate a buffer big enough to hold all the characters.
1290	const int length = size();
1291	Vector<char, 1024> buffer(length * 3);
1292
1293	// Convert to runs of 8-bit characters.
1294	char *p = buffer.begin();
1295	const UChar *d = data();
1296	for (int i = 0; i != length; ++i) {
1297	unsigned short c = d[i].unicode();
1298	if (c < 0x80) {
1299	*p++ = (char)c;
1300	} else if (c < 0x800) {
1301	*p++ = (char)((c >> 6) \| 0xC0); // C0 is the 2-byte flag for UTF-8
1302	*p++ = (char)((c \| 0x80) & 0xBF); // next 6 bits, with high bit set
1303	} else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
1304	unsigned sc = 0x10000 + (((c & 0x3FF) << 10) \| (d[i+1].uc & 0x3FF));
1305	*p++ = (char)((sc >> 18) \| 0xF0); // F0 is the 4-byte flag for UTF-8
1306	*p++ = (char)(((sc >> 12) \| 0x80) & 0xBF); // next 6 bits, with high bit set
1307	*p++ = (char)(((sc >> 6) \| 0x80) & 0xBF); // next 6 bits, with high bit set
1308	*p++ = (char)((sc \| 0x80) & 0xBF); // next 6 bits, with high bit set
1309	++i;
1310	} else {
1311	*p++ = (char)((c >> 12) \| 0xE0); // E0 is the 3-byte flag for UTF-8
1312	*p++ = (char)(((c >> 6) \| 0x80) & 0xBF); // next 6 bits, with high bit set
1313	*p++ = (char)((c \| 0x80) & 0xBF); // next 6 bits, with high bit set
1314	}
1315	}
1316
1317	// Return the result as a C string.
1318	CString result(buffer, p - buffer);
1319
1320	return result;
1321	}
1322
1323	} // namespace KJS

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: webkit/trunk/JavaScriptCore/kjs/ustring.cpp@ 13304

Download in other formats: