1 | /*
|
---|
2 | * Copyright (C) 2009 Apple Inc. All rights reserved.
|
---|
3 | *
|
---|
4 | * Redistribution and use in source and binary forms, with or without
|
---|
5 | * modification, are permitted provided that the following conditions
|
---|
6 | * are met:
|
---|
7 | * 1. Redistributions of source code must retain the above copyright
|
---|
8 | * notice, this list of conditions and the following disclaimer.
|
---|
9 | * 2. Redistributions in binary form must reproduce the above copyright
|
---|
10 | * notice, this list of conditions and the following disclaimer in the
|
---|
11 | * documentation and/or other materials provided with the distribution.
|
---|
12 | *
|
---|
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
---|
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
---|
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
---|
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
---|
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
---|
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
---|
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
---|
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
---|
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
---|
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
---|
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
24 | */
|
---|
25 |
|
---|
26 | #include "config.h"
|
---|
27 | #include "RegexCompiler.h"
|
---|
28 |
|
---|
29 | #include "RegexInterpreter.h"
|
---|
30 | #include "RegexPattern.h"
|
---|
31 | #include <wtf/Vector.h>
|
---|
32 |
|
---|
33 | #if ENABLE(YARR)
|
---|
34 |
|
---|
35 | using namespace WTF;
|
---|
36 |
|
---|
37 | namespace JSC { namespace Yarr {
|
---|
38 |
|
---|
39 | class CharacterClassConstructor {
|
---|
40 | public:
|
---|
41 | CharacterClassConstructor(bool isCaseInsensitive = false)
|
---|
42 | : m_isCaseInsensitive(isCaseInsensitive)
|
---|
43 | {
|
---|
44 | }
|
---|
45 |
|
---|
46 | void reset()
|
---|
47 | {
|
---|
48 | m_matches.clear();
|
---|
49 | m_ranges.clear();
|
---|
50 | m_matchesUnicode.clear();
|
---|
51 | m_rangesUnicode.clear();
|
---|
52 | }
|
---|
53 |
|
---|
54 | void append(const CharacterClass* other)
|
---|
55 | {
|
---|
56 | for (size_t i = 0; i < other->m_matches.size(); ++i)
|
---|
57 | addSorted(m_matches, other->m_matches[i]);
|
---|
58 | for (size_t i = 0; i < other->m_ranges.size(); ++i)
|
---|
59 | addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
|
---|
60 | for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
|
---|
61 | addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
|
---|
62 | for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
|
---|
63 | addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
|
---|
64 | }
|
---|
65 |
|
---|
66 | void putChar(UChar ch)
|
---|
67 | {
|
---|
68 | if (ch <= 0x7f) {
|
---|
69 | if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
|
---|
70 | addSorted(m_matches, toASCIIUpper(ch));
|
---|
71 | addSorted(m_matches, toASCIILower(ch));
|
---|
72 | } else
|
---|
73 | addSorted(m_matches, ch);
|
---|
74 | } else {
|
---|
75 | UChar upper, lower;
|
---|
76 | if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
|
---|
77 | addSorted(m_matchesUnicode, upper);
|
---|
78 | addSorted(m_matchesUnicode, lower);
|
---|
79 | } else
|
---|
80 | addSorted(m_matchesUnicode, ch);
|
---|
81 | }
|
---|
82 | }
|
---|
83 |
|
---|
84 | // returns true if this character has another case, and 'ch' is the upper case form.
|
---|
85 | static inline bool isUnicodeUpper(UChar ch)
|
---|
86 | {
|
---|
87 | return ch != Unicode::toLower(ch);
|
---|
88 | }
|
---|
89 |
|
---|
90 | // returns true if this character has another case, and 'ch' is the lower case form.
|
---|
91 | static inline bool isUnicodeLower(UChar ch)
|
---|
92 | {
|
---|
93 | return ch != Unicode::toUpper(ch);
|
---|
94 | }
|
---|
95 |
|
---|
96 | void putRange(UChar lo, UChar hi)
|
---|
97 | {
|
---|
98 | if (lo <= 0x7f) {
|
---|
99 | char asciiLo = lo;
|
---|
100 | char asciiHi = std::min(hi, (UChar)0x7f);
|
---|
101 | addSortedRange(m_ranges, lo, asciiHi);
|
---|
102 |
|
---|
103 | if (m_isCaseInsensitive) {
|
---|
104 | if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
|
---|
105 | addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
|
---|
106 | if ((asciiLo <= 'z') && (asciiHi >= 'a'))
|
---|
107 | addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
|
---|
108 | }
|
---|
109 | }
|
---|
110 | if (hi >= 0x80) {
|
---|
111 | UChar unicodeCurr = std::max(lo, (UChar)0x80);
|
---|
112 | addSortedRange(m_rangesUnicode, unicodeCurr, hi);
|
---|
113 |
|
---|
114 | if (m_isCaseInsensitive) {
|
---|
115 | while (unicodeCurr <= hi) {
|
---|
116 | if (isUnicodeUpper(unicodeCurr)) {
|
---|
117 | UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
|
---|
118 | UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
|
---|
119 | while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
|
---|
120 | lowerCaseRangeEnd++;
|
---|
121 | addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
|
---|
122 | } else if (isUnicodeLower(unicodeCurr)) {
|
---|
123 | UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
|
---|
124 | UChar upperCaseRangeEnd = upperCaseRangeBegin;
|
---|
125 | while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
|
---|
126 | upperCaseRangeEnd++;
|
---|
127 | addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
|
---|
128 | } else
|
---|
129 | ++unicodeCurr;
|
---|
130 | }
|
---|
131 | }
|
---|
132 | }
|
---|
133 | }
|
---|
134 |
|
---|
135 | CharacterClass* charClass()
|
---|
136 | {
|
---|
137 | CharacterClass* characterClass = new CharacterClass();
|
---|
138 |
|
---|
139 | characterClass->m_matches.append(m_matches);
|
---|
140 | characterClass->m_ranges.append(m_ranges);
|
---|
141 | characterClass->m_matchesUnicode.append(m_matchesUnicode);
|
---|
142 | characterClass->m_rangesUnicode.append(m_rangesUnicode);
|
---|
143 |
|
---|
144 | reset();
|
---|
145 |
|
---|
146 | return characterClass;
|
---|
147 | }
|
---|
148 |
|
---|
149 | private:
|
---|
150 | void addSorted(Vector<UChar>& matches, UChar ch)
|
---|
151 | {
|
---|
152 | unsigned pos = 0;
|
---|
153 | unsigned range = matches.size();
|
---|
154 |
|
---|
155 | // binary chop, find position to insert char.
|
---|
156 | while (range) {
|
---|
157 | unsigned index = range >> 1;
|
---|
158 |
|
---|
159 | int val = matches[pos+index] - ch;
|
---|
160 | if (!val)
|
---|
161 | return;
|
---|
162 | else if (val > 0)
|
---|
163 | range = index;
|
---|
164 | else {
|
---|
165 | pos += (index+1);
|
---|
166 | range -= (index+1);
|
---|
167 | }
|
---|
168 | }
|
---|
169 |
|
---|
170 | if (pos == matches.size())
|
---|
171 | matches.append(ch);
|
---|
172 | else
|
---|
173 | matches.insert(pos, ch);
|
---|
174 | }
|
---|
175 |
|
---|
176 | void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
|
---|
177 | {
|
---|
178 | unsigned end = ranges.size();
|
---|
179 |
|
---|
180 | // Simple linear scan - I doubt there are that many ranges anyway...
|
---|
181 | // feel free to fix this with something faster (eg binary chop).
|
---|
182 | for (unsigned i = 0; i < end; ++i) {
|
---|
183 | // does the new range fall before the current position in the array
|
---|
184 | if (hi < ranges[i].begin) {
|
---|
185 | // optional optimization: concatenate appending ranges? - may not be worthwhile.
|
---|
186 | if (hi == (ranges[i].begin - 1)) {
|
---|
187 | ranges[i].begin = lo;
|
---|
188 | return;
|
---|
189 | }
|
---|
190 | ranges.insert(i, CharacterRange(lo, hi));
|
---|
191 | return;
|
---|
192 | }
|
---|
193 | // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
|
---|
194 | // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
|
---|
195 | // end of the last range they concatenate, which is just as good.
|
---|
196 | if (lo <= (ranges[i].end + 1)) {
|
---|
197 | // found an intersect! we'll replace this entry in the array.
|
---|
198 | ranges[i].begin = std::min(ranges[i].begin, lo);
|
---|
199 | ranges[i].end = std::max(ranges[i].end, hi);
|
---|
200 |
|
---|
201 | // now check if the new range can subsume any subsequent ranges.
|
---|
202 | unsigned next = i+1;
|
---|
203 | // each iteration of the loop we will either remove something from the list, or break the loop.
|
---|
204 | while (next < ranges.size()) {
|
---|
205 | if (ranges[next].begin <= (ranges[i].end + 1)) {
|
---|
206 | // the next entry now overlaps / concatenates this one.
|
---|
207 | ranges[i].end = std::max(ranges[i].end, ranges[next].end);
|
---|
208 | ranges.remove(next);
|
---|
209 | } else
|
---|
210 | break;
|
---|
211 | }
|
---|
212 |
|
---|
213 | return;
|
---|
214 | }
|
---|
215 | }
|
---|
216 |
|
---|
217 | // CharacterRange comes after all existing ranges.
|
---|
218 | ranges.append(CharacterRange(lo, hi));
|
---|
219 | }
|
---|
220 |
|
---|
221 | bool m_isCaseInsensitive;
|
---|
222 |
|
---|
223 | Vector<UChar> m_matches;
|
---|
224 | Vector<CharacterRange> m_ranges;
|
---|
225 | Vector<UChar> m_matchesUnicode;
|
---|
226 | Vector<CharacterRange> m_rangesUnicode;
|
---|
227 | };
|
---|
228 |
|
---|
229 |
|
---|
230 | CharacterClass* newlineCreate()
|
---|
231 | {
|
---|
232 | CharacterClass* characterClass = new CharacterClass();
|
---|
233 |
|
---|
234 | characterClass->m_matches.append('\n');
|
---|
235 | characterClass->m_matches.append('\r');
|
---|
236 | characterClass->m_matchesUnicode.append(0x2028);
|
---|
237 | characterClass->m_matchesUnicode.append(0x2029);
|
---|
238 |
|
---|
239 | return characterClass;
|
---|
240 | }
|
---|
241 |
|
---|
242 | CharacterClass* digitsCreate()
|
---|
243 | {
|
---|
244 | CharacterClass* characterClass = new CharacterClass();
|
---|
245 |
|
---|
246 | characterClass->m_ranges.append(CharacterRange('0', '9'));
|
---|
247 |
|
---|
248 | return characterClass;
|
---|
249 | }
|
---|
250 |
|
---|
251 | CharacterClass* spacesCreate()
|
---|
252 | {
|
---|
253 | CharacterClass* characterClass = new CharacterClass();
|
---|
254 |
|
---|
255 | characterClass->m_matches.append(' ');
|
---|
256 | characterClass->m_ranges.append(CharacterRange('\t', '\r'));
|
---|
257 | characterClass->m_matchesUnicode.append(0x00a0);
|
---|
258 | characterClass->m_matchesUnicode.append(0x1680);
|
---|
259 | characterClass->m_matchesUnicode.append(0x180e);
|
---|
260 | characterClass->m_matchesUnicode.append(0x2028);
|
---|
261 | characterClass->m_matchesUnicode.append(0x2029);
|
---|
262 | characterClass->m_matchesUnicode.append(0x202f);
|
---|
263 | characterClass->m_matchesUnicode.append(0x205f);
|
---|
264 | characterClass->m_matchesUnicode.append(0x3000);
|
---|
265 | characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a));
|
---|
266 |
|
---|
267 | return characterClass;
|
---|
268 | }
|
---|
269 |
|
---|
270 | CharacterClass* wordcharCreate()
|
---|
271 | {
|
---|
272 | CharacterClass* characterClass = new CharacterClass();
|
---|
273 |
|
---|
274 | characterClass->m_matches.append('_');
|
---|
275 | characterClass->m_ranges.append(CharacterRange('0', '9'));
|
---|
276 | characterClass->m_ranges.append(CharacterRange('A', 'Z'));
|
---|
277 | characterClass->m_ranges.append(CharacterRange('a', 'z'));
|
---|
278 |
|
---|
279 | return characterClass;
|
---|
280 | }
|
---|
281 |
|
---|
282 | CharacterClass* nondigitsCreate()
|
---|
283 | {
|
---|
284 | CharacterClass* characterClass = new CharacterClass();
|
---|
285 |
|
---|
286 | characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
|
---|
287 | characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f));
|
---|
288 | characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
|
---|
289 |
|
---|
290 | return characterClass;
|
---|
291 | }
|
---|
292 |
|
---|
293 | CharacterClass* nonspacesCreate()
|
---|
294 | {
|
---|
295 | CharacterClass* characterClass = new CharacterClass();
|
---|
296 |
|
---|
297 | characterClass->m_ranges.append(CharacterRange(0, '\t' - 1));
|
---|
298 | characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1));
|
---|
299 | characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f));
|
---|
300 | characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f));
|
---|
301 | characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f));
|
---|
302 | characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d));
|
---|
303 | characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff));
|
---|
304 | characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027));
|
---|
305 | characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e));
|
---|
306 | characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e));
|
---|
307 | characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff));
|
---|
308 | characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff));
|
---|
309 |
|
---|
310 | return characterClass;
|
---|
311 | }
|
---|
312 |
|
---|
313 | CharacterClass* nonwordcharCreate()
|
---|
314 | {
|
---|
315 | CharacterClass* characterClass = new CharacterClass();
|
---|
316 |
|
---|
317 | characterClass->m_matches.append('`');
|
---|
318 | characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
|
---|
319 | characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1));
|
---|
320 | characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1));
|
---|
321 | characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f));
|
---|
322 | characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
|
---|
323 |
|
---|
324 | return characterClass;
|
---|
325 | }
|
---|
326 |
|
---|
327 |
|
---|
328 | class RegexPatternConstructor {
|
---|
329 | public:
|
---|
330 | RegexPatternConstructor(RegexPattern& pattern)
|
---|
331 | : m_pattern(pattern)
|
---|
332 | , m_characterClassConstructor(pattern.m_ignoreCase)
|
---|
333 | {
|
---|
334 | }
|
---|
335 |
|
---|
336 | ~RegexPatternConstructor()
|
---|
337 | {
|
---|
338 | }
|
---|
339 |
|
---|
340 | void reset()
|
---|
341 | {
|
---|
342 | m_pattern.reset();
|
---|
343 | m_characterClassConstructor.reset();
|
---|
344 | }
|
---|
345 |
|
---|
346 | void assertionBOL()
|
---|
347 | {
|
---|
348 | m_alternative->m_terms.append(PatternTerm::BOL());
|
---|
349 | }
|
---|
350 | void assertionEOL()
|
---|
351 | {
|
---|
352 | m_alternative->m_terms.append(PatternTerm::EOL());
|
---|
353 | }
|
---|
354 | void assertionWordBoundary(bool invert)
|
---|
355 | {
|
---|
356 | m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
|
---|
357 | }
|
---|
358 |
|
---|
359 | void atomPatternCharacter(UChar ch)
|
---|
360 | {
|
---|
361 | // We handle case-insensitive checking of unicode characters which do have both
|
---|
362 | // cases by handling them as if they were defined using a CharacterClass.
|
---|
363 | if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
|
---|
364 | atomCharacterClassBegin();
|
---|
365 | atomCharacterClassAtom(ch);
|
---|
366 | atomCharacterClassEnd();
|
---|
367 | } else
|
---|
368 | m_alternative->m_terms.append(PatternTerm(ch));
|
---|
369 | }
|
---|
370 |
|
---|
371 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
|
---|
372 | {
|
---|
373 | switch (classID) {
|
---|
374 | case DigitClassID:
|
---|
375 | m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
|
---|
376 | break;
|
---|
377 | case SpaceClassID:
|
---|
378 | m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
|
---|
379 | break;
|
---|
380 | case WordClassID:
|
---|
381 | m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
|
---|
382 | break;
|
---|
383 | case NewlineClassID:
|
---|
384 | m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
|
---|
385 | break;
|
---|
386 | }
|
---|
387 | }
|
---|
388 |
|
---|
389 | void atomCharacterClassBegin(bool invert = false)
|
---|
390 | {
|
---|
391 | m_invertCharacterClass = invert;
|
---|
392 | }
|
---|
393 |
|
---|
394 | void atomCharacterClassAtom(UChar ch)
|
---|
395 | {
|
---|
396 | m_characterClassConstructor.putChar(ch);
|
---|
397 | }
|
---|
398 |
|
---|
399 | void atomCharacterClassRange(UChar begin, UChar end)
|
---|
400 | {
|
---|
401 | m_characterClassConstructor.putRange(begin, end);
|
---|
402 | }
|
---|
403 |
|
---|
404 | void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
|
---|
405 | {
|
---|
406 | ASSERT(classID != NewlineClassID);
|
---|
407 |
|
---|
408 | switch (classID) {
|
---|
409 | case DigitClassID:
|
---|
410 | m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
|
---|
411 | break;
|
---|
412 |
|
---|
413 | case SpaceClassID:
|
---|
414 | m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
|
---|
415 | break;
|
---|
416 |
|
---|
417 | case WordClassID:
|
---|
418 | m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
|
---|
419 | break;
|
---|
420 |
|
---|
421 | default:
|
---|
422 | ASSERT_NOT_REACHED();
|
---|
423 | }
|
---|
424 | }
|
---|
425 |
|
---|
426 | void atomCharacterClassEnd()
|
---|
427 | {
|
---|
428 | CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
|
---|
429 | m_pattern.m_userCharacterClasses.append(newCharacterClass);
|
---|
430 | m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
|
---|
431 | }
|
---|
432 |
|
---|
433 | void atomParenthesesSubpatternBegin(bool capture = true)
|
---|
434 | {
|
---|
435 | unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
|
---|
436 | if (capture)
|
---|
437 | m_pattern.m_numSubpatterns++;
|
---|
438 |
|
---|
439 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
|
---|
440 | m_pattern.m_disjunctions.append(parenthesesDisjunction);
|
---|
441 | m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture));
|
---|
442 | m_alternative = parenthesesDisjunction->addNewAlternative();
|
---|
443 | }
|
---|
444 |
|
---|
445 | void atomParentheticalAssertionBegin(bool invert = false)
|
---|
446 | {
|
---|
447 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
|
---|
448 | m_pattern.m_disjunctions.append(parenthesesDisjunction);
|
---|
449 | m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
|
---|
450 | m_alternative = parenthesesDisjunction->addNewAlternative();
|
---|
451 | }
|
---|
452 |
|
---|
453 | void atomParenthesesEnd()
|
---|
454 | {
|
---|
455 | ASSERT(m_alternative->m_parent);
|
---|
456 | ASSERT(m_alternative->m_parent->m_parent);
|
---|
457 | m_alternative = m_alternative->m_parent->m_parent;
|
---|
458 |
|
---|
459 | m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
|
---|
460 | }
|
---|
461 |
|
---|
462 | void atomBackReference(unsigned subpatternId)
|
---|
463 | {
|
---|
464 | ASSERT(subpatternId);
|
---|
465 | m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
|
---|
466 |
|
---|
467 | if (subpatternId > m_pattern.m_numSubpatterns)
|
---|
468 | return;
|
---|
469 |
|
---|
470 | PatternAlternative* currentAlternative = m_alternative;
|
---|
471 | ASSERT(currentAlternative);
|
---|
472 |
|
---|
473 | // Note to self: if we waited until the AST was baked, we could also remove forwards refs
|
---|
474 | while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
|
---|
475 | PatternTerm& term = currentAlternative->lastTerm();
|
---|
476 | ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
|
---|
477 |
|
---|
478 | if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId))
|
---|
479 | return;
|
---|
480 | }
|
---|
481 |
|
---|
482 | m_alternative->m_terms.append(PatternTerm(subpatternId));
|
---|
483 | }
|
---|
484 |
|
---|
485 | PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
|
---|
486 | {
|
---|
487 | PatternDisjunction* newDisjunction = new PatternDisjunction();
|
---|
488 |
|
---|
489 | newDisjunction->m_parent = disjunction->m_parent;
|
---|
490 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
|
---|
491 | PatternAlternative* alternative = disjunction->m_alternatives[alt];
|
---|
492 | PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
|
---|
493 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
|
---|
494 | newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
|
---|
495 | }
|
---|
496 |
|
---|
497 | return newDisjunction;
|
---|
498 | }
|
---|
499 |
|
---|
500 | PatternTerm copyTerm(PatternTerm& term)
|
---|
501 | {
|
---|
502 | if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
|
---|
503 | return PatternTerm(term);
|
---|
504 |
|
---|
505 | PatternTerm termCopy = term;
|
---|
506 | termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
|
---|
507 | return termCopy;
|
---|
508 | }
|
---|
509 |
|
---|
510 | void quantifyAtom(unsigned min, unsigned max, bool greedy)
|
---|
511 | {
|
---|
512 | ASSERT(min <= max);
|
---|
513 | ASSERT(m_alternative->m_terms.size());
|
---|
514 |
|
---|
515 | if (!max) {
|
---|
516 | m_alternative->removeLastTerm();
|
---|
517 | return;
|
---|
518 | }
|
---|
519 |
|
---|
520 | PatternTerm& term = m_alternative->lastTerm();
|
---|
521 | ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
|
---|
522 | ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
|
---|
523 |
|
---|
524 | // For any assertion with a zero minimum, not matching is valid and has no effect,
|
---|
525 | // remove it. Otherwise, we need to match as least once, but there is no point
|
---|
526 | // matching more than once, so remove the quantifier. It is not entirely clear
|
---|
527 | // from the spec whether or not this behavior is correct, but I believe this
|
---|
528 | // matches Firefox. :-/
|
---|
529 | if (term.type == PatternTerm::TypeParentheticalAssertion) {
|
---|
530 | if (!min)
|
---|
531 | m_alternative->removeLastTerm();
|
---|
532 | return;
|
---|
533 | }
|
---|
534 |
|
---|
535 | if (min == 0)
|
---|
536 | term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
|
---|
537 | else if (min == max)
|
---|
538 | term.quantify(min, QuantifierFixedCount);
|
---|
539 | else {
|
---|
540 | term.quantify(min, QuantifierFixedCount);
|
---|
541 | m_alternative->m_terms.append(copyTerm(term));
|
---|
542 | // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
|
---|
543 | m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
|
---|
544 | if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
|
---|
545 | m_alternative->lastTerm().parentheses.isCopy = true;
|
---|
546 | }
|
---|
547 | }
|
---|
548 |
|
---|
549 | void disjunction()
|
---|
550 | {
|
---|
551 | m_alternative = m_alternative->m_parent->addNewAlternative();
|
---|
552 | }
|
---|
553 |
|
---|
554 | void regexBegin()
|
---|
555 | {
|
---|
556 | m_pattern.m_body = new PatternDisjunction();
|
---|
557 | m_alternative = m_pattern.m_body->addNewAlternative();
|
---|
558 | m_pattern.m_disjunctions.append(m_pattern.m_body);
|
---|
559 | }
|
---|
560 | void regexEnd()
|
---|
561 | {
|
---|
562 | }
|
---|
563 | void regexError()
|
---|
564 | {
|
---|
565 | }
|
---|
566 |
|
---|
567 | unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
|
---|
568 | {
|
---|
569 | alternative->m_hasFixedSize = true;
|
---|
570 | unsigned currentInputPosition = initialInputPosition;
|
---|
571 |
|
---|
572 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
|
---|
573 | PatternTerm& term = alternative->m_terms[i];
|
---|
574 |
|
---|
575 | switch (term.type) {
|
---|
576 | case PatternTerm::TypeAssertionBOL:
|
---|
577 | case PatternTerm::TypeAssertionEOL:
|
---|
578 | case PatternTerm::TypeAssertionWordBoundary:
|
---|
579 | term.inputPosition = currentInputPosition;
|
---|
580 | break;
|
---|
581 |
|
---|
582 | case PatternTerm::TypeBackReference:
|
---|
583 | term.inputPosition = currentInputPosition;
|
---|
584 | term.frameLocation = currentCallFrameSize;
|
---|
585 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference;
|
---|
586 | alternative->m_hasFixedSize = false;
|
---|
587 | break;
|
---|
588 |
|
---|
589 | case PatternTerm::TypePatternCharacter:
|
---|
590 | term.inputPosition = currentInputPosition;
|
---|
591 | if (term.quantityType != QuantifierFixedCount) {
|
---|
592 | term.frameLocation = currentCallFrameSize;
|
---|
593 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter;
|
---|
594 | alternative->m_hasFixedSize = false;
|
---|
595 | } else
|
---|
596 | currentInputPosition += term.quantityCount;
|
---|
597 | break;
|
---|
598 |
|
---|
599 | case PatternTerm::TypeCharacterClass:
|
---|
600 | term.inputPosition = currentInputPosition;
|
---|
601 | if (term.quantityType != QuantifierFixedCount) {
|
---|
602 | term.frameLocation = currentCallFrameSize;
|
---|
603 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass;
|
---|
604 | alternative->m_hasFixedSize = false;
|
---|
605 | } else
|
---|
606 | currentInputPosition += term.quantityCount;
|
---|
607 | break;
|
---|
608 |
|
---|
609 | case PatternTerm::TypeParenthesesSubpattern:
|
---|
610 | // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
|
---|
611 | term.frameLocation = currentCallFrameSize;
|
---|
612 | if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
|
---|
613 | if (term.quantityType == QuantifierFixedCount) {
|
---|
614 | currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
|
---|
615 | currentInputPosition += term.parentheses.disjunction->m_minimumSize;
|
---|
616 | } else {
|
---|
617 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
|
---|
618 | currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
|
---|
619 | }
|
---|
620 | term.inputPosition = currentInputPosition;
|
---|
621 | } else {
|
---|
622 | term.inputPosition = currentInputPosition;
|
---|
623 | setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
|
---|
624 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses;
|
---|
625 | }
|
---|
626 | // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
|
---|
627 | alternative->m_hasFixedSize = false;
|
---|
628 | break;
|
---|
629 |
|
---|
630 | case PatternTerm::TypeParentheticalAssertion:
|
---|
631 | term.inputPosition = currentInputPosition;
|
---|
632 | term.frameLocation = currentCallFrameSize;
|
---|
633 | currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
|
---|
634 | break;
|
---|
635 | }
|
---|
636 | }
|
---|
637 |
|
---|
638 | alternative->m_minimumSize = currentInputPosition - initialInputPosition;
|
---|
639 | return currentCallFrameSize;
|
---|
640 | }
|
---|
641 |
|
---|
642 | unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
|
---|
643 | {
|
---|
644 | if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
|
---|
645 | initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative;
|
---|
646 |
|
---|
647 | unsigned minimumInputSize = UINT_MAX;
|
---|
648 | unsigned maximumCallFrameSize = 0;
|
---|
649 | bool hasFixedSize = true;
|
---|
650 |
|
---|
651 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
|
---|
652 | PatternAlternative* alternative = disjunction->m_alternatives[alt];
|
---|
653 | unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
|
---|
654 | minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
|
---|
655 | maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
|
---|
656 | hasFixedSize &= alternative->m_hasFixedSize;
|
---|
657 | }
|
---|
658 |
|
---|
659 | ASSERT(minimumInputSize != UINT_MAX);
|
---|
660 | ASSERT(maximumCallFrameSize >= initialCallFrameSize);
|
---|
661 |
|
---|
662 | disjunction->m_hasFixedSize = hasFixedSize;
|
---|
663 | disjunction->m_minimumSize = minimumInputSize;
|
---|
664 | disjunction->m_callFrameSize = maximumCallFrameSize;
|
---|
665 | return maximumCallFrameSize;
|
---|
666 | }
|
---|
667 |
|
---|
668 | void setupOffsets()
|
---|
669 | {
|
---|
670 | setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
|
---|
671 | }
|
---|
672 |
|
---|
673 | private:
|
---|
674 | RegexPattern& m_pattern;
|
---|
675 | PatternAlternative* m_alternative;
|
---|
676 | CharacterClassConstructor m_characterClassConstructor;
|
---|
677 | bool m_invertCharacterClass;
|
---|
678 | };
|
---|
679 |
|
---|
680 |
|
---|
681 | const char* compileRegex(const UString& patternString, RegexPattern& pattern)
|
---|
682 | {
|
---|
683 | RegexPatternConstructor constructor(pattern);
|
---|
684 |
|
---|
685 | if (const char* error = parse(constructor, patternString))
|
---|
686 | return error;
|
---|
687 |
|
---|
688 | // If the pattern contains illegal backreferences reset & reparse.
|
---|
689 | // Quoting Netscape's "What's new in JavaScript 1.2",
|
---|
690 | // "Note: if the number of left parentheses is less than the number specified
|
---|
691 | // in \#, the \# is taken as an octal escape as described in the next row."
|
---|
692 | if (pattern.containsIllegalBackReference()) {
|
---|
693 | unsigned numSubpatterns = pattern.m_numSubpatterns;
|
---|
694 |
|
---|
695 | constructor.reset();
|
---|
696 | #ifndef NDEBUG
|
---|
697 | const char* error =
|
---|
698 | #endif
|
---|
699 | parse(constructor, patternString, numSubpatterns);
|
---|
700 |
|
---|
701 | ASSERT(!error);
|
---|
702 | ASSERT(numSubpatterns == pattern.m_numSubpatterns);
|
---|
703 | }
|
---|
704 |
|
---|
705 | constructor.setupOffsets();
|
---|
706 |
|
---|
707 | return false;
|
---|
708 | };
|
---|
709 |
|
---|
710 |
|
---|
711 | } }
|
---|
712 |
|
---|
713 | #endif
|
---|