Context Navigation

regexp.cpp@ 24453

Visit:

Last change on this file since 24453 was 24453, checked in by darin, 18 years ago

Reviewed by Geoff.

fix <rdar://problem/5345440> PCRE computes wrong length for expressions with quantifiers on named recursion or subexpressions

It's challenging to implement proper preflighting for compiling these advanced features.
But we don't want them in the JavaScript engine anyway.

Turned off the following features of PCRE (some of these are simply parsed and not implemented):

\C \E \G \L \N \P \Q \U \X \Z
\e \l \p \u \z
[::] [..] [==]
(?#) (?<=) (?<!) (?>)
(?C) (?P) (?R)
(?0) (and 1-9)
(?imsxUX)

Added the following:

\u \v

Because of \v, the js1_2/regexp/special_characters.js test now passes.

To be conservative, I left some features that JavaScript doesn't want, such as
\012 and \x{2013}, in place. We can revisit these later; they're not directly-enough
related to avoiding the incorrect preflighting.

I also didn't try to remove unused opcodes and remove code from the execution engine.
That could save code size and speed things up a bit, but it would require more changes.

kjs/regexp.h:
kjs/regexp.cpp: (KJS::RegExp::RegExp): Remove the sanitizePattern workaround for lack of \u support, since the PCRE code now has \u support.

pcre/pcre-config.h: Set JAVASCRIPT to 1.
pcre/pcre_internal.h: Added ESC_v.

pcre/pcre_compile.c: Added a different escape table for when JAVASCRIPT is set that omits all the escapes we don't want interpreted and includes '\v'. (check_escape): Put #if !JAVASCRIPT around the code for '\l', '\L', '\N', '\u', and '\U', and added code to handle '\u2013' inside JAVASCRIPT. (compile_branch): Put #if !JAVASCRIPT around all the code implementing the features we don't want. (pcre_compile2): Ditto.

tests/mozilla/expected.html: Updated since js1_2/regexp/special_characters.js now passes.

Property svn:eol-style set to native

File size: 5.6 KB

Line
1	// -*- c-basic-offset: 2 -*-
2	/*
3	* This file is part of the KDE libraries
4	* Copyright (C) 1999-2001,2004 Harri Porten ([email protected])
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Lesser General Public
8	* License as published by the Free Software Foundation; either
9	* version 2 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with this library; if not, write to the Free Software
18	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19	*
20	*/
21
22	#include "config.h"
23	#include "regexp.h"
24
25	#include "lexer.h"
26
27	#include <assert.h>
28	#include <stdio.h>
29	#include <stdlib.h>
30	#include <string.h>
31
32	namespace KJS {
33
34	RegExp::RegExp(const UString &p, int flags)
35	: m_flags(flags), m_constructionError(0), m_numSubPatterns(0)
36	{
37	#if HAVE(PCREPOSIX)
38
39	int options = PCRE_UTF8;
40	// Note: the Global flag is already handled by RegExpProtoFunc::execute.
41	// FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
42	if (flags & IgnoreCase)
43	options \|= PCRE_CASELESS;
44	if (flags & Multiline)
45	options \|= PCRE_MULTILINE;
46
47	const char *errorMessage;
48	int errorOffset;
49
50	m_regex = pcre_compile(reinterpret_cast<const uint16_t*>(p.data()), p.size(),
51	options, &errorMessage, &errorOffset, NULL);
52	if (!m_regex) {
53	m_constructionError = strdup(errorMessage);
54	return;
55	}
56
57	#ifdef PCRE_INFO_CAPTURECOUNT
58	// Get number of subpatterns that will be returned.
59	pcre_fullinfo(m_regex, NULL, PCRE_INFO_CAPTURECOUNT, &m_numSubPatterns);
60	#endif
61
62	#else /* HAVE(PCREPOSIX) */
63
64	int regflags = 0;
65	#ifdef REG_EXTENDED
66	regflags \|= REG_EXTENDED;
67	#endif
68	#ifdef REG_ICASE
69	if ( f & IgnoreCase )
70	regflags \|= REG_ICASE;
71	#endif
72
73	//NOTE: Multiline is not feasible with POSIX regex.
74	//if ( f & Multiline )
75	// ;
76	// Note: the Global flag is already handled by RegExpProtoFunc::execute
77
78	// FIXME: support \u Unicode escapes.
79
80	int errorCode = regcomp(&m_regex, intern.ascii(), regflags);
81	if (errorCode != 0) {
82	char errorMessage[80];
83	regerror(errorCode, &m_regex, errorMessage, sizeof errorMessage);
84	m_constructionError = strdup(errorMessage);
85	}
86
87	#endif
88	}
89
90	RegExp::~RegExp()
91	{
92	#if HAVE(PCREPOSIX)
93	pcre_free(m_regex);
94	#else
95	/* TODO: is this really okay after an error ? */
96	regfree(&m_regex);
97	#endif
98	free(m_constructionError);
99	}
100
101	UString RegExp::match(const UString &s, int i, int pos, int *ovector)
102	{
103	if (i < 0)
104	i = 0;
105	int dummyPos;
106	if (!pos)
107	pos = &dummyPos;
108	*pos = -1;
109	if (ovector)
110	*ovector = 0;
111
112	if (i > s.size() \|\| s.isNull())
113	return UString::null();
114
115	#if HAVE(PCREPOSIX)
116
117	if (!m_regex)
118	return UString::null();
119
120	// Set up the offset vector for the result.
121	// First 2/3 used for result, the last third used by PCRE.
122	int *offsetVector;
123	int offsetVectorSize;
124	int fixedSizeOffsetVector[3];
125	if (!ovector) {
126	offsetVectorSize = 3;
127	offsetVector = fixedSizeOffsetVector;
128	} else {
129	offsetVectorSize = (m_numSubPatterns + 1) * 3;
130	offsetVector = new int [offsetVectorSize];
131	}
132
133	const int numMatches = pcre_exec(m_regex, NULL, reinterpret_cast<const uint16_t *>(s.data()), s.size(), i, 0, offsetVector, offsetVectorSize);
134
135	if (numMatches < 0) {
136	#ifndef NDEBUG
137	if (numMatches != PCRE_ERROR_NOMATCH)
138	fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
139	#endif
140	if (offsetVector != fixedSizeOffsetVector)
141	delete [] offsetVector;
142	return UString::null();
143	}
144
145	*pos = offsetVector[0];
146	if (ovector)
147	*ovector = offsetVector;
148	return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
149
150	#else
151
152	const unsigned maxMatch = 10;
153	regmatch_t rmatch[maxMatch];
154
155	char *str = strdup(s.ascii()); // TODO: why ???
156	if (regexec(&m_regex, str + i, maxMatch, rmatch, 0)) {
157	free(str);
158	return UString::null();
159	}
160	free(str);
161
162	if (!ovector) {
163	*pos = rmatch[0].rm_so + i;
164	return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
165	}
166
167	// map rmatch array to ovector used in PCRE case
168	m_numSubPatterns = 0;
169	for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
170	m_numSubPatterns++;
171	int ovecsize = (m_numSubPatterns+1)*3; // see above
172	*ovector = new int[ovecsize];
173	for (unsigned j = 0; j < m_numSubPatterns + 1; j++) {
174	if (j>maxMatch)
175	break;
176	(ovector)[2j] = rmatch[j].rm_so + i;
177	(ovector)[2j+1] = rmatch[j].rm_eo + i;
178	}
179
180	pos = (ovector)[0];
181	return s.substr((ovector)[0], (ovector)[1] - (*ovector)[0]);
182
183	#endif
184	}
185
186	bool RegExp::isHexDigit(UChar uc)
187	{
188	int c = uc.unicode();
189	return (c >= '0' && c <= '9' \|\|
190	c >= 'a' && c <= 'f' \|\|
191	c >= 'A' && c <= 'F');
192	}
193
194	unsigned char RegExp::convertHex(int c)
195	{
196	if (c >= '0' && c <= '9')
197	return static_cast<unsigned char>(c - '0');
198	if (c >= 'a' && c <= 'f')
199	return static_cast<unsigned char>(c - 'a' + 10);
200	return static_cast<unsigned char>(c - 'A' + 10);
201	}
202
203	unsigned char RegExp::convertHex(int c1, int c2)
204	{
205	return ((convertHex(c1) << 4) + convertHex(c2));
206	}
207
208	UChar RegExp::convertUnicode(UChar uc1, UChar uc2, UChar uc3, UChar uc4)
209	{
210	int c1 = uc1.unicode();
211	int c2 = uc2.unicode();
212	int c3 = uc3.unicode();
213	int c4 = uc4.unicode();
214	return UChar((convertHex(c1) << 4) + convertHex(c2),
215	(convertHex(c3) << 4) + convertHex(c4));
216	}
217
218	} // namespace KJS

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: webkit/trunk/JavaScriptCore/kjs/regexp.cpp@ 24453

Download in other formats: