source: webkit/trunk/JavaScriptCore/yarr/RegexCompiler.cpp@ 43100

Last change on this file since 43100 was 43100, checked in by [email protected], 16 years ago

2009-04-30 Gavin Barraclough <[email protected]>

Reviewed by Oliver "Abandon Ship!" Hunt.

Fix a leak in Yarr.

All Disjunctions should be recorded in RegexPattern::m_disjunctions,
so that they can be freed at the end of compilation - copyDisjunction
is failing to do so.

  • yarr/RegexCompiler.cpp: (JSC::Yarr::RegexPatternConstructor::copyDisjunction):
File size: 27.1 KB
Line 
1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "RegexCompiler.h"
28
29#include "RegexInterpreter.h"
30#include "RegexPattern.h"
31#include <wtf/Vector.h>
32
33#if ENABLE(YARR)
34
35using namespace WTF;
36
37namespace JSC { namespace Yarr {
38
39class CharacterClassConstructor {
40public:
41 CharacterClassConstructor(bool isCaseInsensitive = false)
42 : m_isCaseInsensitive(isCaseInsensitive)
43 {
44 }
45
46 void reset()
47 {
48 m_matches.clear();
49 m_ranges.clear();
50 m_matchesUnicode.clear();
51 m_rangesUnicode.clear();
52 }
53
54 void append(const CharacterClass* other)
55 {
56 for (size_t i = 0; i < other->m_matches.size(); ++i)
57 addSorted(m_matches, other->m_matches[i]);
58 for (size_t i = 0; i < other->m_ranges.size(); ++i)
59 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
60 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
61 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
62 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
63 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
64 }
65
66 void putChar(UChar ch)
67 {
68 if (ch <= 0x7f) {
69 if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
70 addSorted(m_matches, toASCIIUpper(ch));
71 addSorted(m_matches, toASCIILower(ch));
72 } else
73 addSorted(m_matches, ch);
74 } else {
75 UChar upper, lower;
76 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
77 addSorted(m_matchesUnicode, upper);
78 addSorted(m_matchesUnicode, lower);
79 } else
80 addSorted(m_matchesUnicode, ch);
81 }
82 }
83
84 // returns true if this character has another case, and 'ch' is the upper case form.
85 static inline bool isUnicodeUpper(UChar ch)
86 {
87 return ch != Unicode::toLower(ch);
88 }
89
90 // returns true if this character has another case, and 'ch' is the lower case form.
91 static inline bool isUnicodeLower(UChar ch)
92 {
93 return ch != Unicode::toUpper(ch);
94 }
95
96 void putRange(UChar lo, UChar hi)
97 {
98 if (lo <= 0x7f) {
99 char asciiLo = lo;
100 char asciiHi = std::min(hi, (UChar)0x7f);
101 addSortedRange(m_ranges, lo, asciiHi);
102
103 if (m_isCaseInsensitive) {
104 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
105 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
106 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
107 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
108 }
109 }
110 if (hi >= 0x80) {
111 UChar unicodeCurr = std::max(lo, (UChar)0x80);
112 addSortedRange(m_rangesUnicode, unicodeCurr, hi);
113
114 if (m_isCaseInsensitive) {
115 while (unicodeCurr <= hi) {
116 if (isUnicodeUpper(unicodeCurr)) {
117 UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
118 UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
119 while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
120 lowerCaseRangeEnd++;
121 addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
122 } else if (isUnicodeLower(unicodeCurr)) {
123 UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
124 UChar upperCaseRangeEnd = upperCaseRangeBegin;
125 while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
126 upperCaseRangeEnd++;
127 addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
128 } else
129 ++unicodeCurr;
130 }
131 }
132 }
133 }
134
135 CharacterClass* charClass()
136 {
137 CharacterClass* characterClass = new CharacterClass();
138
139 characterClass->m_matches.append(m_matches);
140 characterClass->m_ranges.append(m_ranges);
141 characterClass->m_matchesUnicode.append(m_matchesUnicode);
142 characterClass->m_rangesUnicode.append(m_rangesUnicode);
143
144 reset();
145
146 return characterClass;
147 }
148
149private:
150 void addSorted(Vector<UChar>& matches, UChar ch)
151 {
152 unsigned pos = 0;
153 unsigned range = matches.size();
154
155 // binary chop, find position to insert char.
156 while (range) {
157 unsigned index = range >> 1;
158
159 int val = matches[pos+index] - ch;
160 if (!val)
161 return;
162 else if (val > 0)
163 range = index;
164 else {
165 pos += (index+1);
166 range -= (index+1);
167 }
168 }
169
170 if (pos == matches.size())
171 matches.append(ch);
172 else
173 matches.insert(pos, ch);
174 }
175
176 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
177 {
178 unsigned end = ranges.size();
179
180 // Simple linear scan - I doubt there are that many ranges anyway...
181 // feel free to fix this with something faster (eg binary chop).
182 for (unsigned i = 0; i < end; ++i) {
183 // does the new range fall before the current position in the array
184 if (hi < ranges[i].begin) {
185 // optional optimization: concatenate appending ranges? - may not be worthwhile.
186 if (hi == (ranges[i].begin - 1)) {
187 ranges[i].begin = lo;
188 return;
189 }
190 ranges.insert(i, CharacterRange(lo, hi));
191 return;
192 }
193 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
194 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
195 // end of the last range they concatenate, which is just as good.
196 if (lo <= (ranges[i].end + 1)) {
197 // found an intersect! we'll replace this entry in the array.
198 ranges[i].begin = std::min(ranges[i].begin, lo);
199 ranges[i].end = std::max(ranges[i].end, hi);
200
201 // now check if the new range can subsume any subsequent ranges.
202 unsigned next = i+1;
203 // each iteration of the loop we will either remove something from the list, or break the loop.
204 while (next < ranges.size()) {
205 if (ranges[next].begin <= (ranges[i].end + 1)) {
206 // the next entry now overlaps / concatenates this one.
207 ranges[i].end = std::max(ranges[i].end, ranges[next].end);
208 ranges.remove(next);
209 } else
210 break;
211 }
212
213 return;
214 }
215 }
216
217 // CharacterRange comes after all existing ranges.
218 ranges.append(CharacterRange(lo, hi));
219 }
220
221 bool m_isCaseInsensitive;
222
223 Vector<UChar> m_matches;
224 Vector<CharacterRange> m_ranges;
225 Vector<UChar> m_matchesUnicode;
226 Vector<CharacterRange> m_rangesUnicode;
227};
228
229
230CharacterClass* newlineCreate()
231{
232 CharacterClass* characterClass = new CharacterClass();
233
234 characterClass->m_matches.append('\n');
235 characterClass->m_matches.append('\r');
236 characterClass->m_matchesUnicode.append(0x2028);
237 characterClass->m_matchesUnicode.append(0x2029);
238
239 return characterClass;
240}
241
242CharacterClass* digitsCreate()
243{
244 CharacterClass* characterClass = new CharacterClass();
245
246 characterClass->m_ranges.append(CharacterRange('0', '9'));
247
248 return characterClass;
249}
250
251CharacterClass* spacesCreate()
252{
253 CharacterClass* characterClass = new CharacterClass();
254
255 characterClass->m_matches.append(' ');
256 characterClass->m_ranges.append(CharacterRange('\t', '\r'));
257 characterClass->m_matchesUnicode.append(0x00a0);
258 characterClass->m_matchesUnicode.append(0x1680);
259 characterClass->m_matchesUnicode.append(0x180e);
260 characterClass->m_matchesUnicode.append(0x2028);
261 characterClass->m_matchesUnicode.append(0x2029);
262 characterClass->m_matchesUnicode.append(0x202f);
263 characterClass->m_matchesUnicode.append(0x205f);
264 characterClass->m_matchesUnicode.append(0x3000);
265 characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a));
266
267 return characterClass;
268}
269
270CharacterClass* wordcharCreate()
271{
272 CharacterClass* characterClass = new CharacterClass();
273
274 characterClass->m_matches.append('_');
275 characterClass->m_ranges.append(CharacterRange('0', '9'));
276 characterClass->m_ranges.append(CharacterRange('A', 'Z'));
277 characterClass->m_ranges.append(CharacterRange('a', 'z'));
278
279 return characterClass;
280}
281
282CharacterClass* nondigitsCreate()
283{
284 CharacterClass* characterClass = new CharacterClass();
285
286 characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
287 characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f));
288 characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
289
290 return characterClass;
291}
292
293CharacterClass* nonspacesCreate()
294{
295 CharacterClass* characterClass = new CharacterClass();
296
297 characterClass->m_ranges.append(CharacterRange(0, '\t' - 1));
298 characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1));
299 characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f));
300 characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f));
301 characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f));
302 characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d));
303 characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff));
304 characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027));
305 characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e));
306 characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e));
307 characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff));
308 characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff));
309
310 return characterClass;
311}
312
313CharacterClass* nonwordcharCreate()
314{
315 CharacterClass* characterClass = new CharacterClass();
316
317 characterClass->m_matches.append('`');
318 characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
319 characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1));
320 characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1));
321 characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f));
322 characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
323
324 return characterClass;
325}
326
327
328class RegexPatternConstructor {
329public:
330 RegexPatternConstructor(RegexPattern& pattern)
331 : m_pattern(pattern)
332 , m_characterClassConstructor(pattern.m_ignoreCase)
333 {
334 }
335
336 ~RegexPatternConstructor()
337 {
338 }
339
340 void reset()
341 {
342 m_pattern.reset();
343 m_characterClassConstructor.reset();
344 }
345
346 void assertionBOL()
347 {
348 m_alternative->m_terms.append(PatternTerm::BOL());
349 }
350 void assertionEOL()
351 {
352 m_alternative->m_terms.append(PatternTerm::EOL());
353 }
354 void assertionWordBoundary(bool invert)
355 {
356 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
357 }
358
359 void atomPatternCharacter(UChar ch)
360 {
361 // We handle case-insensitive checking of unicode characters which do have both
362 // cases by handling them as if they were defined using a CharacterClass.
363 if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
364 atomCharacterClassBegin();
365 atomCharacterClassAtom(ch);
366 atomCharacterClassEnd();
367 } else
368 m_alternative->m_terms.append(PatternTerm(ch));
369 }
370
371 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
372 {
373 switch (classID) {
374 case DigitClassID:
375 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
376 break;
377 case SpaceClassID:
378 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
379 break;
380 case WordClassID:
381 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
382 break;
383 case NewlineClassID:
384 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
385 break;
386 }
387 }
388
389 void atomCharacterClassBegin(bool invert = false)
390 {
391 m_invertCharacterClass = invert;
392 }
393
394 void atomCharacterClassAtom(UChar ch)
395 {
396 m_characterClassConstructor.putChar(ch);
397 }
398
399 void atomCharacterClassRange(UChar begin, UChar end)
400 {
401 m_characterClassConstructor.putRange(begin, end);
402 }
403
404 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
405 {
406 ASSERT(classID != NewlineClassID);
407
408 switch (classID) {
409 case DigitClassID:
410 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
411 break;
412
413 case SpaceClassID:
414 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
415 break;
416
417 case WordClassID:
418 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
419 break;
420
421 default:
422 ASSERT_NOT_REACHED();
423 }
424 }
425
426 void atomCharacterClassEnd()
427 {
428 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
429 m_pattern.m_userCharacterClasses.append(newCharacterClass);
430 m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
431 }
432
433 void atomParenthesesSubpatternBegin(bool capture = true)
434 {
435 unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
436 if (capture)
437 m_pattern.m_numSubpatterns++;
438
439 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
440 m_pattern.m_disjunctions.append(parenthesesDisjunction);
441 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture));
442 m_alternative = parenthesesDisjunction->addNewAlternative();
443 }
444
445 void atomParentheticalAssertionBegin(bool invert = false)
446 {
447 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
448 m_pattern.m_disjunctions.append(parenthesesDisjunction);
449 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
450 m_alternative = parenthesesDisjunction->addNewAlternative();
451 }
452
453 void atomParenthesesEnd()
454 {
455 ASSERT(m_alternative->m_parent);
456 ASSERT(m_alternative->m_parent->m_parent);
457 m_alternative = m_alternative->m_parent->m_parent;
458
459 m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
460 }
461
462 void atomBackReference(unsigned subpatternId)
463 {
464 ASSERT(subpatternId);
465 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
466
467 if (subpatternId > m_pattern.m_numSubpatterns)
468 return;
469
470 PatternAlternative* currentAlternative = m_alternative;
471 ASSERT(currentAlternative);
472
473 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
474 while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
475 PatternTerm& term = currentAlternative->lastTerm();
476 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
477
478 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId))
479 return;
480 }
481
482 m_alternative->m_terms.append(PatternTerm(subpatternId));
483 }
484
485 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
486 {
487 PatternDisjunction* newDisjunction = new PatternDisjunction();
488
489 newDisjunction->m_parent = disjunction->m_parent;
490 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
491 PatternAlternative* alternative = disjunction->m_alternatives[alt];
492 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
493 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
494 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
495 }
496
497 m_pattern.m_disjunctions.append(newDisjunction);
498 return newDisjunction;
499 }
500
501 PatternTerm copyTerm(PatternTerm& term)
502 {
503 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
504 return PatternTerm(term);
505
506 PatternTerm termCopy = term;
507 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
508 return termCopy;
509 }
510
511 void quantifyAtom(unsigned min, unsigned max, bool greedy)
512 {
513 ASSERT(min <= max);
514 ASSERT(m_alternative->m_terms.size());
515
516 if (!max) {
517 m_alternative->removeLastTerm();
518 return;
519 }
520
521 PatternTerm& term = m_alternative->lastTerm();
522 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
523 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
524
525 // For any assertion with a zero minimum, not matching is valid and has no effect,
526 // remove it. Otherwise, we need to match as least once, but there is no point
527 // matching more than once, so remove the quantifier. It is not entirely clear
528 // from the spec whether or not this behavior is correct, but I believe this
529 // matches Firefox. :-/
530 if (term.type == PatternTerm::TypeParentheticalAssertion) {
531 if (!min)
532 m_alternative->removeLastTerm();
533 return;
534 }
535
536 if (min == 0)
537 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
538 else if (min == max)
539 term.quantify(min, QuantifierFixedCount);
540 else {
541 term.quantify(min, QuantifierFixedCount);
542 m_alternative->m_terms.append(copyTerm(term));
543 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
544 m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
545 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
546 m_alternative->lastTerm().parentheses.isCopy = true;
547 }
548 }
549
550 void disjunction()
551 {
552 m_alternative = m_alternative->m_parent->addNewAlternative();
553 }
554
555 void regexBegin()
556 {
557 m_pattern.m_body = new PatternDisjunction();
558 m_alternative = m_pattern.m_body->addNewAlternative();
559 m_pattern.m_disjunctions.append(m_pattern.m_body);
560 }
561 void regexEnd()
562 {
563 }
564 void regexError()
565 {
566 }
567
568 unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
569 {
570 alternative->m_hasFixedSize = true;
571 unsigned currentInputPosition = initialInputPosition;
572
573 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
574 PatternTerm& term = alternative->m_terms[i];
575
576 switch (term.type) {
577 case PatternTerm::TypeAssertionBOL:
578 case PatternTerm::TypeAssertionEOL:
579 case PatternTerm::TypeAssertionWordBoundary:
580 term.inputPosition = currentInputPosition;
581 break;
582
583 case PatternTerm::TypeBackReference:
584 term.inputPosition = currentInputPosition;
585 term.frameLocation = currentCallFrameSize;
586 currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference;
587 alternative->m_hasFixedSize = false;
588 break;
589
590 case PatternTerm::TypePatternCharacter:
591 term.inputPosition = currentInputPosition;
592 if (term.quantityType != QuantifierFixedCount) {
593 term.frameLocation = currentCallFrameSize;
594 currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter;
595 alternative->m_hasFixedSize = false;
596 } else
597 currentInputPosition += term.quantityCount;
598 break;
599
600 case PatternTerm::TypeCharacterClass:
601 term.inputPosition = currentInputPosition;
602 if (term.quantityType != QuantifierFixedCount) {
603 term.frameLocation = currentCallFrameSize;
604 currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass;
605 alternative->m_hasFixedSize = false;
606 } else
607 currentInputPosition += term.quantityCount;
608 break;
609
610 case PatternTerm::TypeParenthesesSubpattern:
611 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
612 term.frameLocation = currentCallFrameSize;
613 if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
614 if (term.quantityType == QuantifierFixedCount) {
615 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
616 currentInputPosition += term.parentheses.disjunction->m_minimumSize;
617 } else {
618 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
619 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
620 }
621 term.inputPosition = currentInputPosition;
622 } else {
623 term.inputPosition = currentInputPosition;
624 setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
625 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses;
626 }
627 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
628 alternative->m_hasFixedSize = false;
629 break;
630
631 case PatternTerm::TypeParentheticalAssertion:
632 term.inputPosition = currentInputPosition;
633 term.frameLocation = currentCallFrameSize;
634 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
635 break;
636 }
637 }
638
639 alternative->m_minimumSize = currentInputPosition - initialInputPosition;
640 return currentCallFrameSize;
641 }
642
643 unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
644 {
645 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
646 initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative;
647
648 unsigned minimumInputSize = UINT_MAX;
649 unsigned maximumCallFrameSize = 0;
650 bool hasFixedSize = true;
651
652 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
653 PatternAlternative* alternative = disjunction->m_alternatives[alt];
654 unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
655 minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
656 maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
657 hasFixedSize &= alternative->m_hasFixedSize;
658 }
659
660 ASSERT(minimumInputSize != UINT_MAX);
661 ASSERT(maximumCallFrameSize >= initialCallFrameSize);
662
663 disjunction->m_hasFixedSize = hasFixedSize;
664 disjunction->m_minimumSize = minimumInputSize;
665 disjunction->m_callFrameSize = maximumCallFrameSize;
666 return maximumCallFrameSize;
667 }
668
669 void setupOffsets()
670 {
671 setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
672 }
673
674private:
675 RegexPattern& m_pattern;
676 PatternAlternative* m_alternative;
677 CharacterClassConstructor m_characterClassConstructor;
678 bool m_invertCharacterClass;
679};
680
681
682const char* compileRegex(const UString& patternString, RegexPattern& pattern)
683{
684 RegexPatternConstructor constructor(pattern);
685
686 if (const char* error = parse(constructor, patternString))
687 return error;
688
689 // If the pattern contains illegal backreferences reset & reparse.
690 // Quoting Netscape's "What's new in JavaScript 1.2",
691 // "Note: if the number of left parentheses is less than the number specified
692 // in \#, the \# is taken as an octal escape as described in the next row."
693 if (pattern.containsIllegalBackReference()) {
694 unsigned numSubpatterns = pattern.m_numSubpatterns;
695
696 constructor.reset();
697#ifndef NDEBUG
698 const char* error =
699#endif
700 parse(constructor, patternString, numSubpatterns);
701
702 ASSERT(!error);
703 ASSERT(numSubpatterns == pattern.m_numSubpatterns);
704 }
705
706 constructor.setupOffsets();
707
708 return false;
709};
710
711
712} }
713
714#endif
Note: See TracBrowser for help on using the repository browser.