source: webkit/trunk/JavaScriptCore/yarr/RegexCompiler.cpp@ 42853

Last change on this file since 42853 was 42853, checked in by [email protected], 16 years ago

2009-04-24 Gavin Barraclough <[email protected]>

Reviewed by Sam "Wesley" Weinig.

Improve performance to YARR interpreter.
(From about 3x slower than PCRE on regex-dna to about 30% slower).

  • yarr/RegexCompiler.cpp: (JSC::Yarr::RegexPatternConstructor::setupAlternativeOffsets):
  • yarr/RegexInterpreter.cpp: (JSC::Yarr::Interpreter::checkCharacter): (JSC::Yarr::Interpreter::checkCasedCharacter): (JSC::Yarr::Interpreter::backtrackPatternCharacter): (JSC::Yarr::Interpreter::backtrackPatternCasedCharacter): (JSC::Yarr::Interpreter::matchParentheticalAssertionBegin): (JSC::Yarr::Interpreter::matchParentheticalAssertionEnd): (JSC::Yarr::Interpreter::backtrackParentheticalAssertionBegin): (JSC::Yarr::Interpreter::backtrackParentheticalAssertionEnd): (JSC::Yarr::Interpreter::matchDisjunction): (JSC::Yarr::Interpreter::interpret): (JSC::Yarr::ByteCompiler::atomPatternCharacter): (JSC::Yarr::ByteCompiler::atomParenthesesSubpatternBegin): (JSC::Yarr::ByteCompiler::atomParentheticalAssertionBegin): (JSC::Yarr::ByteCompiler::closeAlternative): (JSC::Yarr::ByteCompiler::closeBodyAlternative): (JSC::Yarr::ByteCompiler::atomParenthesesEnd): (JSC::Yarr::ByteCompiler::regexBegin): (JSC::Yarr::ByteCompiler::regexEnd): (JSC::Yarr::ByteCompiler::alterantiveBodyDisjunction): (JSC::Yarr::ByteCompiler::alterantiveDisjunction): (JSC::Yarr::ByteCompiler::emitDisjunction):
  • yarr/RegexInterpreter.h: (JSC::Yarr::ByteTerm::): (JSC::Yarr::ByteTerm::ByteTerm): (JSC::Yarr::ByteTerm::BodyAlternativeBegin): (JSC::Yarr::ByteTerm::BodyAlternativeDisjunction): (JSC::Yarr::ByteTerm::BodyAlternativeEnd): (JSC::Yarr::ByteTerm::AlternativeBegin): (JSC::Yarr::ByteTerm::AlternativeDisjunction): (JSC::Yarr::ByteTerm::AlternativeEnd): (JSC::Yarr::ByteTerm::SubpatternBegin): (JSC::Yarr::ByteTerm::SubpatternEnd):
  • yarr/RegexJIT.cpp: (JSC::Yarr::RegexGenerator::generateParentheticalAssertion):
  • yarr/RegexPattern.h:
File size: 27.0 KB
Line 
1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "RegexCompiler.h"
28
29#include "RegexInterpreter.h"
30#include "RegexPattern.h"
31#include <wtf/Vector.h>
32
33#if ENABLE(YARR)
34
35using namespace WTF;
36
37namespace JSC { namespace Yarr {
38
39class CharacterClassConstructor {
40public:
41 CharacterClassConstructor(bool isCaseInsensitive = false)
42 : m_isCaseInsensitive(isCaseInsensitive)
43 {
44 }
45
46 void reset()
47 {
48 m_matches.clear();
49 m_ranges.clear();
50 m_matchesUnicode.clear();
51 m_rangesUnicode.clear();
52 }
53
54 void append(const CharacterClass* other)
55 {
56 for (size_t i = 0; i < other->m_matches.size(); ++i)
57 addSorted(m_matches, other->m_matches[i]);
58 for (size_t i = 0; i < other->m_ranges.size(); ++i)
59 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
60 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
61 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
62 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
63 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
64 }
65
66 void putChar(UChar ch)
67 {
68 if (ch <= 0x7f) {
69 if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
70 addSorted(m_matches, toASCIIUpper(ch));
71 addSorted(m_matches, toASCIILower(ch));
72 } else
73 addSorted(m_matches, ch);
74 } else {
75 UChar upper, lower;
76 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
77 addSorted(m_matchesUnicode, upper);
78 addSorted(m_matchesUnicode, lower);
79 } else
80 addSorted(m_matchesUnicode, ch);
81 }
82 }
83
84 // returns true if this character has another case, and 'ch' is the upper case form.
85 static inline bool isUnicodeUpper(UChar ch)
86 {
87 return ch != Unicode::toLower(ch);
88 }
89
90 // returns true if this character has another case, and 'ch' is the lower case form.
91 static inline bool isUnicodeLower(UChar ch)
92 {
93 return ch != Unicode::toUpper(ch);
94 }
95
96 void putRange(UChar lo, UChar hi)
97 {
98 if (lo <= 0x7f) {
99 char asciiLo = lo;
100 char asciiHi = std::min(hi, (UChar)0x7f);
101 addSortedRange(m_ranges, lo, asciiHi);
102
103 if (m_isCaseInsensitive) {
104 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
105 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
106 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
107 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
108 }
109 }
110 if (hi >= 0x80) {
111 UChar unicodeCurr = std::max(lo, (UChar)0x80);
112 addSortedRange(m_rangesUnicode, unicodeCurr, hi);
113
114 if (m_isCaseInsensitive) {
115 while (unicodeCurr <= hi) {
116 if (isUnicodeUpper(unicodeCurr)) {
117 UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
118 UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
119 while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
120 lowerCaseRangeEnd++;
121 addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
122 } else if (isUnicodeLower(unicodeCurr)) {
123 UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
124 UChar upperCaseRangeEnd = upperCaseRangeBegin;
125 while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
126 upperCaseRangeEnd++;
127 addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
128 } else
129 ++unicodeCurr;
130 }
131 }
132 }
133 }
134
135 CharacterClass* charClass()
136 {
137 CharacterClass* characterClass = new CharacterClass();
138
139 characterClass->m_matches.append(m_matches);
140 characterClass->m_ranges.append(m_ranges);
141 characterClass->m_matchesUnicode.append(m_matchesUnicode);
142 characterClass->m_rangesUnicode.append(m_rangesUnicode);
143
144 reset();
145
146 return characterClass;
147 }
148
149private:
150 void addSorted(Vector<UChar>& matches, UChar ch)
151 {
152 unsigned pos = 0;
153 unsigned range = matches.size();
154
155 // binary chop, find position to insert char.
156 while (range) {
157 unsigned index = range >> 1;
158
159 int val = matches[pos+index] - ch;
160 if (!val)
161 return;
162 else if (val > 0)
163 range = index;
164 else {
165 pos += (index+1);
166 range -= (index+1);
167 }
168 }
169
170 if (pos == matches.size())
171 matches.append(ch);
172 else
173 matches.insert(pos, ch);
174 }
175
176 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
177 {
178 unsigned end = ranges.size();
179
180 // Simple linear scan - I doubt there are that many ranges anyway...
181 // feel free to fix this with something faster (eg binary chop).
182 for (unsigned i = 0; i < end; ++i) {
183 // does the new range fall before the current position in the array
184 if (hi < ranges[i].begin) {
185 // optional optimization: concatenate appending ranges? - may not be worthwhile.
186 if (hi == (ranges[i].begin - 1)) {
187 ranges[i].begin = lo;
188 return;
189 }
190 ranges.insert(i, CharacterRange(lo, hi));
191 return;
192 }
193 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
194 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
195 // end of the last range they concatenate, which is just as good.
196 if (lo <= (ranges[i].end + 1)) {
197 // found an intersect! we'll replace this entry in the array.
198 ranges[i].begin = std::min(ranges[i].begin, lo);
199 ranges[i].end = std::max(ranges[i].end, hi);
200
201 // now check if the new range can subsume any subsequent ranges.
202 unsigned next = i+1;
203 // each iteration of the loop we will either remove something from the list, or break the loop.
204 while (next < ranges.size()) {
205 if (ranges[next].begin <= (ranges[i].end + 1)) {
206 // the next entry now overlaps / concatenates this one.
207 ranges[i].end = std::max(ranges[i].end, ranges[next].end);
208 ranges.remove(next);
209 } else
210 break;
211 }
212
213 return;
214 }
215 }
216
217 // CharacterRange comes after all existing ranges.
218 ranges.append(CharacterRange(lo, hi));
219 }
220
221 bool m_isCaseInsensitive;
222
223 Vector<UChar> m_matches;
224 Vector<CharacterRange> m_ranges;
225 Vector<UChar> m_matchesUnicode;
226 Vector<CharacterRange> m_rangesUnicode;
227};
228
229
230CharacterClass* newlineCreate()
231{
232 CharacterClass* characterClass = new CharacterClass();
233
234 characterClass->m_matches.append('\n');
235 characterClass->m_matches.append('\r');
236 characterClass->m_matchesUnicode.append(0x2028);
237 characterClass->m_matchesUnicode.append(0x2029);
238
239 return characterClass;
240}
241
242CharacterClass* digitsCreate()
243{
244 CharacterClass* characterClass = new CharacterClass();
245
246 characterClass->m_ranges.append(CharacterRange('0', '9'));
247
248 return characterClass;
249}
250
251CharacterClass* spacesCreate()
252{
253 CharacterClass* characterClass = new CharacterClass();
254
255 characterClass->m_matches.append(' ');
256 characterClass->m_ranges.append(CharacterRange('\t', '\r'));
257 characterClass->m_matchesUnicode.append(0x00a0);
258 characterClass->m_matchesUnicode.append(0x1680);
259 characterClass->m_matchesUnicode.append(0x180e);
260 characterClass->m_matchesUnicode.append(0x2028);
261 characterClass->m_matchesUnicode.append(0x2029);
262 characterClass->m_matchesUnicode.append(0x202f);
263 characterClass->m_matchesUnicode.append(0x205f);
264 characterClass->m_matchesUnicode.append(0x3000);
265 characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a));
266
267 return characterClass;
268}
269
270CharacterClass* wordcharCreate()
271{
272 CharacterClass* characterClass = new CharacterClass();
273
274 characterClass->m_matches.append('_');
275 characterClass->m_ranges.append(CharacterRange('0', '9'));
276 characterClass->m_ranges.append(CharacterRange('A', 'Z'));
277 characterClass->m_ranges.append(CharacterRange('a', 'z'));
278
279 return characterClass;
280}
281
282CharacterClass* nondigitsCreate()
283{
284 CharacterClass* characterClass = new CharacterClass();
285
286 characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
287 characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f));
288 characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
289
290 return characterClass;
291}
292
293CharacterClass* nonspacesCreate()
294{
295 CharacterClass* characterClass = new CharacterClass();
296
297 characterClass->m_ranges.append(CharacterRange(0, '\t' - 1));
298 characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1));
299 characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f));
300 characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f));
301 characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f));
302 characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d));
303 characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff));
304 characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027));
305 characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e));
306 characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e));
307 characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff));
308 characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff));
309
310 return characterClass;
311}
312
313CharacterClass* nonwordcharCreate()
314{
315 CharacterClass* characterClass = new CharacterClass();
316
317 characterClass->m_matches.append('`');
318 characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
319 characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1));
320 characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1));
321 characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f));
322 characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
323
324 return characterClass;
325}
326
327
328class RegexPatternConstructor {
329public:
330 RegexPatternConstructor(RegexPattern& pattern)
331 : m_pattern(pattern)
332 , m_characterClassConstructor(pattern.m_ignoreCase)
333 {
334 }
335
336 ~RegexPatternConstructor()
337 {
338 }
339
340 void reset()
341 {
342 m_pattern.reset();
343 m_characterClassConstructor.reset();
344 }
345
346 void assertionBOL()
347 {
348 m_alternative->m_terms.append(PatternTerm::BOL());
349 }
350 void assertionEOL()
351 {
352 m_alternative->m_terms.append(PatternTerm::EOL());
353 }
354 void assertionWordBoundary(bool invert)
355 {
356 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
357 }
358
359 void atomPatternCharacter(UChar ch)
360 {
361 // We handle case-insensitive checking of unicode characters which do have both
362 // cases by handling them as if they were defined using a CharacterClass.
363 if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
364 atomCharacterClassBegin();
365 atomCharacterClassAtom(ch);
366 atomCharacterClassEnd();
367 } else
368 m_alternative->m_terms.append(PatternTerm(ch));
369 }
370
371 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
372 {
373 switch (classID) {
374 case DigitClassID:
375 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
376 break;
377 case SpaceClassID:
378 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
379 break;
380 case WordClassID:
381 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
382 break;
383 case NewlineClassID:
384 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
385 break;
386 }
387 }
388
389 void atomCharacterClassBegin(bool invert = false)
390 {
391 m_invertCharacterClass = invert;
392 }
393
394 void atomCharacterClassAtom(UChar ch)
395 {
396 m_characterClassConstructor.putChar(ch);
397 }
398
399 void atomCharacterClassRange(UChar begin, UChar end)
400 {
401 m_characterClassConstructor.putRange(begin, end);
402 }
403
404 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
405 {
406 ASSERT(classID != NewlineClassID);
407
408 switch (classID) {
409 case DigitClassID:
410 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
411 break;
412
413 case SpaceClassID:
414 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
415 break;
416
417 case WordClassID:
418 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
419 break;
420
421 default:
422 ASSERT_NOT_REACHED();
423 }
424 }
425
426 void atomCharacterClassEnd()
427 {
428 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
429 m_pattern.m_userCharacterClasses.append(newCharacterClass);
430 m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
431 }
432
433 void atomParenthesesSubpatternBegin(bool capture = true)
434 {
435 unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
436 if (capture)
437 m_pattern.m_numSubpatterns++;
438
439 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
440 m_pattern.m_disjunctions.append(parenthesesDisjunction);
441 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture));
442 m_alternative = parenthesesDisjunction->addNewAlternative();
443 }
444
445 void atomParentheticalAssertionBegin(bool invert = false)
446 {
447 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
448 m_pattern.m_disjunctions.append(parenthesesDisjunction);
449 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
450 m_alternative = parenthesesDisjunction->addNewAlternative();
451 }
452
453 void atomParenthesesEnd()
454 {
455 ASSERT(m_alternative->m_parent);
456 ASSERT(m_alternative->m_parent->m_parent);
457 m_alternative = m_alternative->m_parent->m_parent;
458
459 m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
460 }
461
462 void atomBackReference(unsigned subpatternId)
463 {
464 ASSERT(subpatternId);
465 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
466
467 if (subpatternId > m_pattern.m_numSubpatterns)
468 return;
469
470 PatternAlternative* currentAlternative = m_alternative;
471 ASSERT(currentAlternative);
472
473 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
474 while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
475 PatternTerm& term = currentAlternative->lastTerm();
476 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
477
478 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId))
479 return;
480 }
481
482 m_alternative->m_terms.append(PatternTerm(subpatternId));
483 }
484
485 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
486 {
487 PatternDisjunction* newDisjunction = new PatternDisjunction();
488
489 newDisjunction->m_parent = disjunction->m_parent;
490 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
491 PatternAlternative* alternative = disjunction->m_alternatives[alt];
492 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
493 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
494 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
495 }
496
497 return newDisjunction;
498 }
499
500 PatternTerm copyTerm(PatternTerm& term)
501 {
502 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
503 return PatternTerm(term);
504
505 PatternTerm termCopy = term;
506 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
507 return termCopy;
508 }
509
510 void quantifyAtom(unsigned min, unsigned max, bool greedy)
511 {
512 ASSERT(min <= max);
513 ASSERT(m_alternative->m_terms.size());
514
515 if (!max) {
516 m_alternative->removeLastTerm();
517 return;
518 }
519
520 PatternTerm& term = m_alternative->lastTerm();
521 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
522 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
523
524 // For any assertion with a zero minimum, not matching is valid and has no effect,
525 // remove it. Otherwise, we need to match as least once, but there is no point
526 // matching more than once, so remove the quantifier. It is not entirely clear
527 // from the spec whether or not this behavior is correct, but I believe this
528 // matches Firefox. :-/
529 if (term.type == PatternTerm::TypeParentheticalAssertion) {
530 if (!min)
531 m_alternative->removeLastTerm();
532 return;
533 }
534
535 if (min == 0)
536 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
537 else if (min == max)
538 term.quantify(min, QuantifierFixedCount);
539 else {
540 term.quantify(min, QuantifierFixedCount);
541 m_alternative->m_terms.append(copyTerm(term));
542 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
543 m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
544 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
545 m_alternative->lastTerm().parentheses.isCopy = true;
546 }
547 }
548
549 void disjunction()
550 {
551 m_alternative = m_alternative->m_parent->addNewAlternative();
552 }
553
554 void regexBegin()
555 {
556 m_pattern.m_body = new PatternDisjunction();
557 m_alternative = m_pattern.m_body->addNewAlternative();
558 m_pattern.m_disjunctions.append(m_pattern.m_body);
559 }
560 void regexEnd()
561 {
562 }
563 void regexError()
564 {
565 }
566
567 unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
568 {
569 alternative->m_hasFixedSize = true;
570 unsigned currentInputPosition = initialInputPosition;
571
572 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
573 PatternTerm& term = alternative->m_terms[i];
574
575 switch (term.type) {
576 case PatternTerm::TypeAssertionBOL:
577 case PatternTerm::TypeAssertionEOL:
578 case PatternTerm::TypeAssertionWordBoundary:
579 term.inputPosition = currentInputPosition;
580 break;
581
582 case PatternTerm::TypeBackReference:
583 term.inputPosition = currentInputPosition;
584 term.frameLocation = currentCallFrameSize;
585 currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference;
586 alternative->m_hasFixedSize = false;
587 break;
588
589 case PatternTerm::TypePatternCharacter:
590 term.inputPosition = currentInputPosition;
591 if (term.quantityType != QuantifierFixedCount) {
592 term.frameLocation = currentCallFrameSize;
593 currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter;
594 alternative->m_hasFixedSize = false;
595 } else
596 currentInputPosition += term.quantityCount;
597 break;
598
599 case PatternTerm::TypeCharacterClass:
600 term.inputPosition = currentInputPosition;
601 if (term.quantityType != QuantifierFixedCount) {
602 term.frameLocation = currentCallFrameSize;
603 currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass;
604 alternative->m_hasFixedSize = false;
605 } else
606 currentInputPosition += term.quantityCount;
607 break;
608
609 case PatternTerm::TypeParenthesesSubpattern:
610 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
611 term.frameLocation = currentCallFrameSize;
612 if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
613 if (term.quantityType == QuantifierFixedCount) {
614 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
615 currentInputPosition += term.parentheses.disjunction->m_minimumSize;
616 } else {
617 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
618 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
619 }
620 term.inputPosition = currentInputPosition;
621 } else {
622 term.inputPosition = currentInputPosition;
623 setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
624 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses;
625 }
626 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
627 alternative->m_hasFixedSize = false;
628 break;
629
630 case PatternTerm::TypeParentheticalAssertion:
631 term.inputPosition = currentInputPosition;
632 term.frameLocation = currentCallFrameSize;
633 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
634 break;
635 }
636 }
637
638 alternative->m_minimumSize = currentInputPosition - initialInputPosition;
639 return currentCallFrameSize;
640 }
641
642 unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
643 {
644 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
645 initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative;
646
647 unsigned minimumInputSize = UINT_MAX;
648 unsigned maximumCallFrameSize = 0;
649 bool hasFixedSize = true;
650
651 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
652 PatternAlternative* alternative = disjunction->m_alternatives[alt];
653 unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
654 minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
655 maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
656 hasFixedSize &= alternative->m_hasFixedSize;
657 }
658
659 ASSERT(minimumInputSize != UINT_MAX);
660 ASSERT(maximumCallFrameSize >= initialCallFrameSize);
661
662 disjunction->m_hasFixedSize = hasFixedSize;
663 disjunction->m_minimumSize = minimumInputSize;
664 disjunction->m_callFrameSize = maximumCallFrameSize;
665 return maximumCallFrameSize;
666 }
667
668 void setupOffsets()
669 {
670 setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
671 }
672
673private:
674 RegexPattern& m_pattern;
675 PatternAlternative* m_alternative;
676 CharacterClassConstructor m_characterClassConstructor;
677 bool m_invertCharacterClass;
678};
679
680
681const char* compileRegex(const UString& patternString, RegexPattern& pattern)
682{
683 RegexPatternConstructor constructor(pattern);
684
685 if (const char* error = parse(constructor, patternString))
686 return error;
687
688 // If the pattern contains illegal backreferences reset & reparse.
689 // Quoting Netscape's "What's new in JavaScript 1.2",
690 // "Note: if the number of left parentheses is less than the number specified
691 // in \#, the \# is taken as an octal escape as described in the next row."
692 if (pattern.containsIllegalBackReference()) {
693 unsigned numSubpatterns = pattern.m_numSubpatterns;
694
695 constructor.reset();
696#ifndef NDEBUG
697 const char* error =
698#endif
699 parse(constructor, patternString, numSubpatterns);
700
701 ASSERT(!error);
702 ASSERT(numSubpatterns == pattern.m_numSubpatterns);
703 }
704
705 constructor.setupOffsets();
706
707 return false;
708};
709
710
711} }
712
713#endif
Note: See TracBrowser for help on using the repository browser.