Context Navigation

ustring.h@ 27406

Visit:

Last change on this file since 27406 was 27406, checked in by Darin Adler, 18 years ago

JavaScriptCore:

Reviewed by Maciej.

fix http://bugs.webkit.org/show_bug.cgi?id=15814 <rdar://problem/5536644> fast/js/kde/encode_decode_uri.html fails

These changes cause us to match the JavaScript specification and pass the
fast/js/kde/encode_decode_uri.html test.

kjs/function.cpp: (KJS::encode): Call the UTF-8 string conversion in its new strict mode, throwing an exception if there are malformed UTF-16 surrogate pairs in the text.

kjs/ustring.h: Added a strict version of the UTF-8 string conversion.
kjs/ustring.cpp: (KJS::decodeUTF8Sequence): Removed code to disallow U+FFFE and U+FFFF; while those might be illegal in some sense, they aren't supposed to get any special handling in the place where this function is currently used. (KJS::UString::UTF8String): Added the strictness.

LayoutTests:

Reviewed by Maciej.

updates for http://bugs.webkit.org/show_bug.cgi?id=15814 <rdar://problem/5536644> fast/js/kde/encode_decode_uri.html fails

fast/js/kde/resources/encode_decode_uri.js: Rewrote the test to cover edges better, and use the should functions in a way that makes failures easier to understand.
fast/js/kde/encode_decode_uri-expected.txt: Updated.

Property svn:eol-style set to native

File size: 14.6 KB

Line
1	// -*- c-basic-offset: 2 -*-
2	/*
3	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
4	* Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Library General Public
8	* License as published by the Free Software Foundation; either
9	* version 2 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Library General Public License for more details.
15	*
16	* You should have received a copy of the GNU Library General Public License
17	* along with this library; see the file COPYING.LIB. If not, write to
18	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19	* Boston, MA 02110-1301, USA.
20	*
21	*/
22
23	#ifndef _KJS_USTRING_H_
24	#define _KJS_USTRING_H_
25
26	#include "JSLock.h"
27	#include "collector.h"
28	#include <stdint.h>
29	#include <wtf/Assertions.h>
30	#include <wtf/FastMalloc.h>
31	#include <wtf/PassRefPtr.h>
32	#include <wtf/RefPtr.h>
33
34	/* On some ARM platforms GCC won't pack structures by default so sizeof(UChar)
35	will end up being != 2 which causes crashes since the code depends on that. */
36	#if COMPILER(GCC) && PLATFORM(FORCE_PACK)
37	#define PACK_STRUCT __attribute__((packed))
38	#else
39	#define PACK_STRUCT
40	#endif
41
42	/**
43	* @internal
44	*/
45	namespace DOM {
46	class DOMString;
47	class AtomicString;
48	}
49	class KJScript;
50
51	namespace KJS {
52
53	using WTF::PlacementNewAdoptType;
54	using WTF::PlacementNewAdopt;
55
56	class UString;
57
58	/**
59	* @short Unicode character.
60	*
61	* UChar represents a 16 bit Unicode character. Its internal data
62	* representation is compatible with XChar2b and QChar. It's therefore
63	* possible to exchange data with X and Qt with shallow copies.
64	*/
65	struct UChar {
66	/**
67	* Construct a character without initializing its value.
68	*/
69	UChar();
70	/**
71	* Construct a character from its two bytes; the value is (h << 8) | l.
72	* @param h higher byte
73	* @param l lower byte
74	*/
75	UChar(unsigned char h , unsigned char l);
76	/**
77	* Construct a character with the given value.
78	* @param u Unicode value; a plain char is converted through unsigned char, so it is never sign-extended
79	*/
80	UChar(char u);
81	UChar(unsigned char u);
82	UChar(unsigned short u);
83	/**
84	* @return The higher byte of the character (bits 8-15 of uc).
85	*/
86	unsigned char high() const { return static_cast<unsigned char>(uc >> 8); }
87	/**
88	* @return The lower byte of the character (uc truncated to 8 bits).
89	*/
90	unsigned char low() const { return static_cast<unsigned char>(uc); }
91	/**
92	* @return the 16 bit Unicode value of the character
93	*/
94	unsigned short unicode() const { return uc; }
95
96	unsigned short uc; // the stored 16 bit value; the only data member, so sizeof(UChar) is expected to be 2
97	} PACK_STRUCT;
98
99	inline UChar::UChar() { } // deliberately leaves uc uninitialized
100	inline UChar::UChar(unsigned char h , unsigned char l) : uc(h << 8 | l) { } // h becomes bits 8-15, l bits 0-7
101	inline UChar::UChar(char u) : uc(static_cast<unsigned char>(u)) { } // go through unsigned char so negative chars are not sign-extended
102	inline UChar::UChar(unsigned char u) : uc(u) { }
103	inline UChar::UChar(unsigned short u) : uc(u) { }
104
105	/**
106	* @short 8 bit char based string class
107	*/
108	class CString {
109	public:
110	CString() : data(0), length(0) { } // empty state: null data pointer, zero length
111	CString(const char *c);
112	CString(const char *c, size_t len);
113	CString(const CString &);
114
115	~CString();
116
117	CString &append(const CString &);
118	CString &operator=(const char *c);
119	CString &operator=(const CString &);
120	CString &operator+=(const CString &c) { return append(c); } // synonym for append()
121
122	size_t size() const { return length; } // the stored length
123	const char *c_str() const { return data; } // null for a default-constructed CString
124	private:
125	char *data;
126	size_t length;
127	};
128
129	/**
130	* @short Unicode string class
131	*/
132	class UString {
133	friend bool operator==(const UString&, const UString&);
134
135	public:
136	/**
137	* @internal Reference-counted string representation (see ref() and deref()).
138	*/
139	struct Rep {
140
141	static PassRefPtr<Rep> create(UChar *d, int l);
142	static PassRefPtr<Rep> createCopying(const UChar *d, int l);
143	static PassRefPtr<Rep> create(PassRefPtr<Rep> base, int offset, int length);
144
145	void destroy();
146
147	bool baseIsSelf() const { return baseString == this; }
148	UChar* data() const { return baseString->buf + baseString->preCapacity + offset; } // always resolved through baseString's buffer
149	int size() const { return len; }
150
151	unsigned hash() const { if (_hash == 0) _hash = computeHash(data(), len); return _hash; } // computed lazily; 0 means "not computed yet"
152	unsigned computedHash() const { ASSERT(_hash); return _hash; } // fast path for Identifiers
153
154	static unsigned computeHash(const UChar *, int length);
155	static unsigned computeHash(const char *);
156
157	Rep* ref() { ASSERT(JSLock::lockCount() > 0); ++rc; return this; }
158	ALWAYS_INLINE void deref() { ASSERT(JSLock::lockCount() > 0); if (--rc == 0) destroy(); } // destroys the rep when the count drops to zero
159
160	// unshared data
161	int offset;
162	int len;
163	int rc;
164	mutable unsigned _hash;
165	bool isIdentifier;
166	UString::Rep* baseString;
167
168	// potentially shared data
169	UChar *buf;
170	int usedCapacity;
171	int capacity;
172	int usedPreCapacity;
173	int preCapacity;
174
175	static Rep null;
176	static Rep empty;
177	};
178
179	public:
180
181	/**
182	* Constructs a null string.
183	*/
184	UString();
185	/**
186	* Constructs a string from a classical zero-terminated char string.
187	*/
188	UString(const char *c);
189	/**
190	* Constructs a string from an array of Unicode characters of the specified
191	* length.
192	*/
193	UString(const UChar *c, int length);
194	/**
195	* If copy is false the string data will be adopted.
196	* That means that the data will NOT be copied and the pointer will
197	* be deleted when the UString object is modified or destroyed.
198	* Behaviour defaults to a deep copy if copy is true.
199	*/
200	UString(UChar *c, int length, bool copy);
201	/**
202	* Copy constructor. Makes a shallow copy only.
203	*/
204	UString(const UString &s) : m_rep(s.m_rep) {}
205	/**
206	* Convenience declaration only ! You'll be on your own to write the
207	* implementation for a construction from DOM::DOMString.
208	*
209	* Note: feel free to contact me if you want to see a dummy header for
210	* your favorite FooString class here !
211	*/
212	UString(const DOM::DOMString&);
213	/**
214	* Convenience declaration only ! See UString(const DOM::DOMString&).
215	*/
216	UString(const DOM::AtomicString&);
217
218	/**
219	* Concatenation constructor. Makes operator+ more efficient.
220	*/
221	UString(const UString &, const UString &);
222	/**
223	* Destructor.
224	*/
225	~UString() {}
226
227	// Special constructor for cases where we overwrite an object in place.
228	UString(PlacementNewAdoptType) : m_rep(PlacementNewAdopt) { }
229
230	/**
231	* Constructs a string from an int.
232	*/
233	static UString from(int i);
234	/**
235	* Constructs a string from an unsigned int.
236	*/
237	static UString from(unsigned int u);
238	/**
239	* Constructs a string from a long int.
240	*/
241	static UString from(long u);
242	/**
243	* Constructs a string from a double.
244	*/
245	static UString from(double d);
246
247	struct Range {
248	public:
249	Range(int pos, int len) : position(pos), length(len) {}
250	Range() {}
251	int position;
252	int length;
253	};
254
255	UString spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const; // each array is passed as pointer + element count
256
257	/**
258	* Append another string.
259	*/
260	UString &append(const UString &);
261	UString &append(const char *);
262	UString &append(unsigned short);
263	UString &append(char c) { return append(static_cast<unsigned short>(static_cast<unsigned char>(c))); }
264	UString &append(UChar c) { return append(c.uc); }
265
266	/**
267	* @return The string converted to the 8-bit string type CString().
268	*/
269	CString cstring() const;
270	/**
271	* Convert the Unicode string to plain ASCII chars chopping off any higher
272	* bytes. This method should only be used for debugging purposes as it
273	* is neither Unicode safe nor free from side effects. In order not to
274	* waste any memory the char buffer is static and shared by all UString
275	* instances.
276	*/
277	char *ascii() const;
278
279	/**
280	* Convert the string to UTF-8, assuming it is UTF-16 encoded.
281	* Since this function is tolerant of badly formed UTF-16, it can create UTF-8
282	* strings that are invalid because they have characters in the range
283	* U+D800-U+DFFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
284	* be otherwise valid.
285	*/
286	CString UTF8String() const;
287	CString UTF8String(bool* utf16WasGood) const; // NOTE(review): presumably the strict variant, reporting malformed UTF-16 through utf16WasGood - confirm in ustring.cpp
288
289	/**
290	* @see UString(const DOM::DOMString&).
291	*/
292	DOM::DOMString domString() const;
293
294	/**
295	* Assignment operator.
296	*/
297	UString &operator=(const char *c);
298	/**
299	* Appends the specified string.
300	*/
301	UString &operator+=(const UString &s) { return append(s); }
302	UString &operator+=(const char *s) { return append(s); }
303
304	/**
305	* @return A pointer to the internal Unicode data.
306	*/
307	const UChar* data() const { return m_rep->data(); }
308	/**
309	* @return True if null.
310	*/
311	bool isNull() const { return (m_rep == &Rep::null); }
312	/**
313	* @return True if null or zero length.
314	*/
315	bool isEmpty() const { return (!m_rep->len); }
316	/**
317	* Use this if you want to make sure that this string is a plain ASCII
318	* string. For example, if you don't want to lose any information when
319	* using cstring() or ascii().
320	*
321	* @return True if the string doesn't contain any non-ASCII characters.
322	*/
323	bool is8Bit() const;
324	/**
325	* @return The length of the string.
326	*/
327	int size() const { return m_rep->size(); }
328	/**
329	* Const character at specified position.
330	*/
331	const UChar operator[](int pos) const;
332
333	/**
334	* Attempts a conversion to a number. Apart from floating point numbers,
335	* the algorithm will recognize hexadecimal representations (as
336	* indicated by a 0x or 0X prefix) and +/- Infinity.
337	* Returns NaN if the conversion failed.
338	* @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number.
339	* @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0.
340	*/
341	double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const;
342	double toDouble(bool tolerateTrailingJunk) const;
343	double toDouble() const;
344
345	/**
346	* Attempts a conversion to a 32-bit integer. ok will be set
347	* according to the success.
348	* @param tolerateEmptyString if false, toUInt32 will return false for *ok for an empty string.
349	*/
350	uint32_t toUInt32(bool *ok = 0) const;
351	uint32_t toUInt32(bool *ok, bool tolerateEmptyString) const;
352	uint32_t toStrictUInt32(bool *ok = 0) const;
353
354	/**
355	* Attempts a conversion to an array index. The "ok" boolean will be set
356	* to true if it is a valid array index according to the rule from
357	* ECMA 15.4 about what an array index is. It must exactly match the string
358	* form of an unsigned integer, and be less than 2^32 - 1.
359	*/
360	unsigned toArrayIndex(bool *ok = 0) const;
361
362	/**
363	* @return Position of first occurrence of f starting at position pos.
364	* -1 if the search was not successful.
365	*/
366	int find(const UString &f, int pos = 0) const;
367	int find(UChar, int pos = 0) const;
368	/**
369	* @return Position of first occurrence of f searching backwards from
370	* position pos.
371	* -1 if the search was not successful.
372	*/
373	int rfind(const UString &f, int pos) const;
374	int rfind(UChar, int pos) const;
375	/**
376	* @return The sub string starting at position pos and length len.
377	*/
378	UString substr(int pos = 0, int len = -1) const;
379	/**
380	* Static instance of a null string.
381	*/
382	static const UString &null();
383	#ifdef KJS_DEBUG_MEM
384	/**
385	* Clear statically allocated resources.
386	*/
387	static void globalClear();
388	#endif
389
390	Rep* rep() const { return m_rep.get(); }
391	UString(PassRefPtr<Rep> r) : m_rep(r) { ASSERT(m_rep); }
392
393	size_t cost() const;
394
395	private:
396	size_t expandedSize(size_t size, size_t otherSize) const;
397	int usedCapacity() const;
398	int usedPreCapacity() const;
399	void expandCapacity(int requiredLength);
400	void expandPreCapacity(int requiredPreCap);
401
402	RefPtr<Rep> m_rep;
403	};
404
405	inline bool operator==(const UChar &c1, const UChar &c2) { // equal when the 16 bit values match
406	return (c1.uc == c2.uc);
407	}
408	bool operator==(const UString& s1, const UString& s2);
409	inline bool operator!=(const UString& s1, const UString& s2) { // negation of the UString/UString operator==
410	return !KJS::operator==(s1, s2);
411	}
412	bool operator<(const UString& s1, const UString& s2);
413	bool operator==(const UString& s1, const char *s2);
414	inline bool operator!=(const UString& s1, const char *s2) { // negation of the UString/char* operator==
415	return !KJS::operator==(s1, s2);
416	}
417	inline bool operator==(const char *s1, const UString& s2) { // forwards with the arguments swapped
418	return operator==(s2, s1);
419	}
420	inline bool operator!=(const char *s1, const UString& s2) { // negation of the char*/UString operator==
421	return !KJS::operator==(s1, s2);
422	}
423	bool operator==(const CString& s1, const CString& s2);
424	inline UString operator+(const UString& s1, const UString& s2) { // uses the concatenation constructor
425	return UString(s1, s2);
426	}
427
428	int compare(const UString &, const UString &);
429
430	// Given a first byte, gives the length of the UTF-8 sequence it begins.
431	// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
432	// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
433	int UTF8SequenceLength(char);
434
435	// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
436	// Only allows Unicode characters (U-00000000 to U-0010FFFF).
437	// Returns -1 if the sequence is not valid (including presence of extra bytes).
438	int decodeUTF8Sequence(const char *);
439
440	inline UString::UString() // a default-constructed string points at Rep::null, so isNull() is true
441	: m_rep(&Rep::null)
442	{
443	}
444
445	// Rule from ECMA 15.4 about what an array index is.
446	// Must exactly match string form of an unsigned integer, and be less than 2^32 - 1.
447	inline unsigned UString::toArrayIndex(bool *ok) const
448	{
449	// 0xFFFFFFFF (2^32 - 1) may parse as an unsigned integer, but it is not an array index.
450	const unsigned parsed = toStrictUInt32(ok);
451	if (ok && parsed >= 0xFFFFFFFFU) *ok = false;
452	return parsed;
453	}
454
455	inline size_t UString::cost() const
456	{
457	// A string that shares a base's buffer reports no cost of its own. Sharing only ever happens
458	// with a base that was already big enough to register extra cost, so some string holding that
459	// buffer has paid for it at some point; and if the buffer was just enlarged
460	// by a huge amount, that came from appending a string that itself paid
461	// extra cost, or from a huge number of small strings. Either way, GC will come
462	// relatively soon.
463
464	// Without this rule, the shared substring optimization would result
465	// in constantly garbage collecting when sharing with one big string.
466
467	const Rep* ownRep = m_rep.get();
468	if (ownRep->baseIsSelf())
469	return (ownRep->capacity + ownRep->preCapacity) * sizeof(UChar);
470	return 0;
471	}
472
473	} // namespace
474
475	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: webkit/trunk/JavaScriptCore/kjs/ustring.h@ 27406

Download in other formats: