source: webkit/trunk/JavaScriptCore/kjs/regexp.cpp@ 24453

Last change on this file since 24453 was 24453, checked in by darin, 18 years ago

Reviewed by Geoff.

  • fix <rdar://problem/5345440> PCRE computes wrong length for expressions with quantifiers on named recursion or subexpressions

It's challenging to implement proper preflighting for compiling these advanced features.
But we don't want them in the JavaScript engine anyway.

Turned off the following features of PCRE (some of these are simply parsed and not implemented):

\C \E \G \L \N \P \Q \U \X \Z
\e \l \p \u \z
[::] .. [==]
(?#) (?<=) (?<!) (?>)
(?C) (?P) (?R)
(?0) (and 1-9)
(?imsxUX)

Added the following:

\u \v

Because of \v, the js1_2/regexp/special_characters.js test now passes.

To be conservative, I left some features that JavaScript doesn't want, such as
\012 and \x{2013}, in place. We can revisit these later; they're not directly-enough
related to avoiding the incorrect preflighting.

I also didn't try to remove unused opcodes and remove code from the execution engine.
That could save code size and speed things up a bit, but it would require more changes.

  • kjs/regexp.h:
  • kjs/regexp.cpp: (KJS::RegExp::RegExp): Remove the sanitizePattern workaround for lack of \u support, since the PCRE code now has \u support.
  • pcre/pcre-config.h: Set JAVASCRIPT to 1.
  • pcre/pcre_internal.h: Added ESC_v.
  • pcre/pcre_compile.c: Added a different escape table for when JAVASCRIPT is set that omits all the escapes we don't want interpreted and includes '\v'. (check_escape): Put #if !JAVASCRIPT around the code for '\l', '\L', '\N', '\u', and '\U', and added code to handle '\u2013' inside JAVASCRIPT. (compile_branch): Put #if !JAVASCRIPT around all the code implementing the features we don't want. (pcre_compile2): Ditto.
  • tests/mozilla/expected.html: Updated since js1_2/regexp/special_characters.js now passes.
  • Property svn:eol-style set to native
File size: 5.6 KB
Line 
1// -*- c-basic-offset: 2 -*-
2/*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 */
21
22#include "config.h"
23#include "regexp.h"
24
25#include "lexer.h"
26
27#include <assert.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31
32namespace KJS {
33
34RegExp::RegExp(const UString &p, int flags)
35 : m_flags(flags), m_constructionError(0), m_numSubPatterns(0)
36{
37#if HAVE(PCREPOSIX)
38
39 int options = PCRE_UTF8;
40 // Note: the Global flag is already handled by RegExpProtoFunc::execute.
41 // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
42 if (flags & IgnoreCase)
43 options |= PCRE_CASELESS;
44 if (flags & Multiline)
45 options |= PCRE_MULTILINE;
46
47 const char *errorMessage;
48 int errorOffset;
49
50 m_regex = pcre_compile(reinterpret_cast<const uint16_t*>(p.data()), p.size(),
51 options, &errorMessage, &errorOffset, NULL);
52 if (!m_regex) {
53 m_constructionError = strdup(errorMessage);
54 return;
55 }
56
57#ifdef PCRE_INFO_CAPTURECOUNT
58 // Get number of subpatterns that will be returned.
59 pcre_fullinfo(m_regex, NULL, PCRE_INFO_CAPTURECOUNT, &m_numSubPatterns);
60#endif
61
62#else /* HAVE(PCREPOSIX) */
63
64 int regflags = 0;
65#ifdef REG_EXTENDED
66 regflags |= REG_EXTENDED;
67#endif
68#ifdef REG_ICASE
69 if ( f & IgnoreCase )
70 regflags |= REG_ICASE;
71#endif
72
73 //NOTE: Multiline is not feasible with POSIX regex.
74 //if ( f & Multiline )
75 // ;
76 // Note: the Global flag is already handled by RegExpProtoFunc::execute
77
78 // FIXME: support \u Unicode escapes.
79
80 int errorCode = regcomp(&m_regex, intern.ascii(), regflags);
81 if (errorCode != 0) {
82 char errorMessage[80];
83 regerror(errorCode, &m_regex, errorMessage, sizeof errorMessage);
84 m_constructionError = strdup(errorMessage);
85 }
86
87#endif
88}
89
90RegExp::~RegExp()
91{
92#if HAVE(PCREPOSIX)
93 pcre_free(m_regex);
94#else
95 /* TODO: is this really okay after an error ? */
96 regfree(&m_regex);
97#endif
98 free(m_constructionError);
99}
100
101UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
102{
103 if (i < 0)
104 i = 0;
105 int dummyPos;
106 if (!pos)
107 pos = &dummyPos;
108 *pos = -1;
109 if (ovector)
110 *ovector = 0;
111
112 if (i > s.size() || s.isNull())
113 return UString::null();
114
115#if HAVE(PCREPOSIX)
116
117 if (!m_regex)
118 return UString::null();
119
120 // Set up the offset vector for the result.
121 // First 2/3 used for result, the last third used by PCRE.
122 int *offsetVector;
123 int offsetVectorSize;
124 int fixedSizeOffsetVector[3];
125 if (!ovector) {
126 offsetVectorSize = 3;
127 offsetVector = fixedSizeOffsetVector;
128 } else {
129 offsetVectorSize = (m_numSubPatterns + 1) * 3;
130 offsetVector = new int [offsetVectorSize];
131 }
132
133 const int numMatches = pcre_exec(m_regex, NULL, reinterpret_cast<const uint16_t *>(s.data()), s.size(), i, 0, offsetVector, offsetVectorSize);
134
135 if (numMatches < 0) {
136#ifndef NDEBUG
137 if (numMatches != PCRE_ERROR_NOMATCH)
138 fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
139#endif
140 if (offsetVector != fixedSizeOffsetVector)
141 delete [] offsetVector;
142 return UString::null();
143 }
144
145 *pos = offsetVector[0];
146 if (ovector)
147 *ovector = offsetVector;
148 return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
149
150#else
151
152 const unsigned maxMatch = 10;
153 regmatch_t rmatch[maxMatch];
154
155 char *str = strdup(s.ascii()); // TODO: why ???
156 if (regexec(&m_regex, str + i, maxMatch, rmatch, 0)) {
157 free(str);
158 return UString::null();
159 }
160 free(str);
161
162 if (!ovector) {
163 *pos = rmatch[0].rm_so + i;
164 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
165 }
166
167 // map rmatch array to ovector used in PCRE case
168 m_numSubPatterns = 0;
169 for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
170 m_numSubPatterns++;
171 int ovecsize = (m_numSubPatterns+1)*3; // see above
172 *ovector = new int[ovecsize];
173 for (unsigned j = 0; j < m_numSubPatterns + 1; j++) {
174 if (j>maxMatch)
175 break;
176 (*ovector)[2*j] = rmatch[j].rm_so + i;
177 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
178 }
179
180 *pos = (*ovector)[0];
181 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
182
183#endif
184}
185
186bool RegExp::isHexDigit(UChar uc)
187{
188 int c = uc.unicode();
189 return (c >= '0' && c <= '9' ||
190 c >= 'a' && c <= 'f' ||
191 c >= 'A' && c <= 'F');
192}
193
194unsigned char RegExp::convertHex(int c)
195{
196 if (c >= '0' && c <= '9')
197 return static_cast<unsigned char>(c - '0');
198 if (c >= 'a' && c <= 'f')
199 return static_cast<unsigned char>(c - 'a' + 10);
200 return static_cast<unsigned char>(c - 'A' + 10);
201}
202
203unsigned char RegExp::convertHex(int c1, int c2)
204{
205 return ((convertHex(c1) << 4) + convertHex(c2));
206}
207
208UChar RegExp::convertUnicode(UChar uc1, UChar uc2, UChar uc3, UChar uc4)
209{
210 int c1 = uc1.unicode();
211 int c2 = uc2.unicode();
212 int c3 = uc3.unicode();
213 int c4 = uc4.unicode();
214 return UChar((convertHex(c1) << 4) + convertHex(c2),
215 (convertHex(c3) << 4) + convertHex(c4));
216}
217
218} // namespace KJS
Note: See TracBrowser for help on using the repository browser.