Context Navigation

RegexCompiler.cpp@ 42853

Visit:

Last change on this file since 42853 was 42853, checked in by [email protected], 16 years ago

2009-04-24 Gavin Barraclough <[email protected]>

Reviewed by Sam "Wesley" Weinig.

Improve performance to YARR interpreter.
(From about 3x slower than PCRE on regex-dna to about 30% slower).

yarr/RegexCompiler.cpp: (JSC::Yarr::RegexPatternConstructor::setupAlternativeOffsets):
yarr/RegexInterpreter.cpp: (JSC::Yarr::Interpreter::checkCharacter): (JSC::Yarr::Interpreter::checkCasedCharacter): (JSC::Yarr::Interpreter::backtrackPatternCharacter): (JSC::Yarr::Interpreter::backtrackPatternCasedCharacter): (JSC::Yarr::Interpreter::matchParentheticalAssertionBegin): (JSC::Yarr::Interpreter::matchParentheticalAssertionEnd): (JSC::Yarr::Interpreter::backtrackParentheticalAssertionBegin): (JSC::Yarr::Interpreter::backtrackParentheticalAssertionEnd): (JSC::Yarr::Interpreter::matchDisjunction): (JSC::Yarr::Interpreter::interpret): (JSC::Yarr::ByteCompiler::atomPatternCharacter): (JSC::Yarr::ByteCompiler::atomParenthesesSubpatternBegin): (JSC::Yarr::ByteCompiler::atomParentheticalAssertionBegin): (JSC::Yarr::ByteCompiler::closeAlternative): (JSC::Yarr::ByteCompiler::closeBodyAlternative): (JSC::Yarr::ByteCompiler::atomParenthesesEnd): (JSC::Yarr::ByteCompiler::regexBegin): (JSC::Yarr::ByteCompiler::regexEnd): (JSC::Yarr::ByteCompiler::alterantiveBodyDisjunction): (JSC::Yarr::ByteCompiler::alterantiveDisjunction): (JSC::Yarr::ByteCompiler::emitDisjunction):
yarr/RegexInterpreter.h: (JSC::Yarr::ByteTerm::): (JSC::Yarr::ByteTerm::ByteTerm): (JSC::Yarr::ByteTerm::BodyAlternativeBegin): (JSC::Yarr::ByteTerm::BodyAlternativeDisjunction): (JSC::Yarr::ByteTerm::BodyAlternativeEnd): (JSC::Yarr::ByteTerm::AlternativeBegin): (JSC::Yarr::ByteTerm::AlternativeDisjunction): (JSC::Yarr::ByteTerm::AlternativeEnd): (JSC::Yarr::ByteTerm::SubpatternBegin): (JSC::Yarr::ByteTerm::SubpatternEnd):
yarr/RegexJIT.cpp: (JSC::Yarr::RegexGenerator::generateParentheticalAssertion):
yarr/RegexPattern.h:

File size: 27.0 KB

Line
1	/*
2	* Copyright (C) 2009 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include "RegexCompiler.h"
28
29	#include "RegexInterpreter.h"
30	#include "RegexPattern.h"
31	#include <wtf/Vector.h>
32
33	#if ENABLE(YARR)
34
35	using namespace WTF;
36
37	namespace JSC { namespace Yarr {
38
39	class CharacterClassConstructor {
40	public:
41	CharacterClassConstructor(bool isCaseInsensitive = false)
42	: m_isCaseInsensitive(isCaseInsensitive)
43	{
44	}
45
46	void reset()
47	{
48	m_matches.clear();
49	m_ranges.clear();
50	m_matchesUnicode.clear();
51	m_rangesUnicode.clear();
52	}
53
54	void append(const CharacterClass* other)
55	{
56	for (size_t i = 0; i < other->m_matches.size(); ++i)
57	addSorted(m_matches, other->m_matches[i]);
58	for (size_t i = 0; i < other->m_ranges.size(); ++i)
59	addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
60	for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
61	addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
62	for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
63	addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
64	}
65
66	void putChar(UChar ch)
67	{
68	if (ch <= 0x7f) {
69	if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
70	addSorted(m_matches, toASCIIUpper(ch));
71	addSorted(m_matches, toASCIILower(ch));
72	} else
73	addSorted(m_matches, ch);
74	} else {
75	UChar upper, lower;
76	if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
77	addSorted(m_matchesUnicode, upper);
78	addSorted(m_matchesUnicode, lower);
79	} else
80	addSorted(m_matchesUnicode, ch);
81	}
82	}
83
84	// returns true if this character has another case, and 'ch' is the upper case form.
85	static inline bool isUnicodeUpper(UChar ch)
86	{
87	return ch != Unicode::toLower(ch);
88	}
89
90	// returns true if this character has another case, and 'ch' is the lower case form.
91	static inline bool isUnicodeLower(UChar ch)
92	{
93	return ch != Unicode::toUpper(ch);
94	}
95
96	void putRange(UChar lo, UChar hi)
97	{
98	if (lo <= 0x7f) {
99	char asciiLo = lo;
100	char asciiHi = std::min(hi, (UChar)0x7f);
101	addSortedRange(m_ranges, lo, asciiHi);
102
103	if (m_isCaseInsensitive) {
104	if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
105	addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
106	if ((asciiLo <= 'z') && (asciiHi >= 'a'))
107	addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
108	}
109	}
110	if (hi >= 0x80) {
111	UChar unicodeCurr = std::max(lo, (UChar)0x80);
112	addSortedRange(m_rangesUnicode, unicodeCurr, hi);
113
114	if (m_isCaseInsensitive) {
115	while (unicodeCurr <= hi) {
116	if (isUnicodeUpper(unicodeCurr)) {
117	UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
118	UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
119	while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
120	lowerCaseRangeEnd++;
121	addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
122	} else if (isUnicodeLower(unicodeCurr)) {
123	UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
124	UChar upperCaseRangeEnd = upperCaseRangeBegin;
125	while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
126	upperCaseRangeEnd++;
127	addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
128	} else
129	++unicodeCurr;
130	}
131	}
132	}
133	}
134
135	CharacterClass* charClass()
136	{
137	CharacterClass* characterClass = new CharacterClass();
138
139	characterClass->m_matches.append(m_matches);
140	characterClass->m_ranges.append(m_ranges);
141	characterClass->m_matchesUnicode.append(m_matchesUnicode);
142	characterClass->m_rangesUnicode.append(m_rangesUnicode);
143
144	reset();
145
146	return characterClass;
147	}
148
149	private:
150	void addSorted(Vector<UChar>& matches, UChar ch)
151	{
152	unsigned pos = 0;
153	unsigned range = matches.size();
154
155	// binary chop, find position to insert char.
156	while (range) {
157	unsigned index = range >> 1;
158
159	int val = matches[pos+index] - ch;
160	if (!val)
161	return;
162	else if (val > 0)
163	range = index;
164	else {
165	pos += (index+1);
166	range -= (index+1);
167	}
168	}
169
170	if (pos == matches.size())
171	matches.append(ch);
172	else
173	matches.insert(pos, ch);
174	}
175
176	void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
177	{
178	unsigned end = ranges.size();
179
180	// Simple linear scan - I doubt there are that many ranges anyway...
181	// feel free to fix this with something faster (eg binary chop).
182	for (unsigned i = 0; i < end; ++i) {
183	// does the new range fall before the current position in the array
184	if (hi < ranges[i].begin) {
185	// optional optimization: concatenate appending ranges? - may not be worthwhile.
186	if (hi == (ranges[i].begin - 1)) {
187	ranges[i].begin = lo;
188	return;
189	}
190	ranges.insert(i, CharacterRange(lo, hi));
191	return;
192	}
193	// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
194	// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
195	// end of the last range they concatenate, which is just as good.
196	if (lo <= (ranges[i].end + 1)) {
197	// found an intersect! we'll replace this entry in the array.
198	ranges[i].begin = std::min(ranges[i].begin, lo);
199	ranges[i].end = std::max(ranges[i].end, hi);
200
201	// now check if the new range can subsume any subsequent ranges.
202	unsigned next = i+1;
203	// each iteration of the loop we will either remove something from the list, or break the loop.
204	while (next < ranges.size()) {
205	if (ranges[next].begin <= (ranges[i].end + 1)) {
206	// the next entry now overlaps / concatenates this one.
207	ranges[i].end = std::max(ranges[i].end, ranges[next].end);
208	ranges.remove(next);
209	} else
210	break;
211	}
212
213	return;
214	}
215	}
216
217	// CharacterRange comes after all existing ranges.
218	ranges.append(CharacterRange(lo, hi));
219	}
220
221	bool m_isCaseInsensitive;
222
223	Vector<UChar> m_matches;
224	Vector<CharacterRange> m_ranges;
225	Vector<UChar> m_matchesUnicode;
226	Vector<CharacterRange> m_rangesUnicode;
227	};
228
229
230	CharacterClass* newlineCreate()
231	{
232	CharacterClass* characterClass = new CharacterClass();
233
234	characterClass->m_matches.append('\n');
235	characterClass->m_matches.append('\r');
236	characterClass->m_matchesUnicode.append(0x2028);
237	characterClass->m_matchesUnicode.append(0x2029);
238
239	return characterClass;
240	}
241
242	CharacterClass* digitsCreate()
243	{
244	CharacterClass* characterClass = new CharacterClass();
245
246	characterClass->m_ranges.append(CharacterRange('0', '9'));
247
248	return characterClass;
249	}
250
251	CharacterClass* spacesCreate()
252	{
253	CharacterClass* characterClass = new CharacterClass();
254
255	characterClass->m_matches.append(' ');
256	characterClass->m_ranges.append(CharacterRange('\t', '\r'));
257	characterClass->m_matchesUnicode.append(0x00a0);
258	characterClass->m_matchesUnicode.append(0x1680);
259	characterClass->m_matchesUnicode.append(0x180e);
260	characterClass->m_matchesUnicode.append(0x2028);
261	characterClass->m_matchesUnicode.append(0x2029);
262	characterClass->m_matchesUnicode.append(0x202f);
263	characterClass->m_matchesUnicode.append(0x205f);
264	characterClass->m_matchesUnicode.append(0x3000);
265	characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a));
266
267	return characterClass;
268	}
269
270	CharacterClass* wordcharCreate()
271	{
272	CharacterClass* characterClass = new CharacterClass();
273
274	characterClass->m_matches.append('_');
275	characterClass->m_ranges.append(CharacterRange('0', '9'));
276	characterClass->m_ranges.append(CharacterRange('A', 'Z'));
277	characterClass->m_ranges.append(CharacterRange('a', 'z'));
278
279	return characterClass;
280	}
281
282	CharacterClass* nondigitsCreate()
283	{
284	CharacterClass* characterClass = new CharacterClass();
285
286	characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
287	characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f));
288	characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
289
290	return characterClass;
291	}
292
293	CharacterClass* nonspacesCreate()
294	{
295	CharacterClass* characterClass = new CharacterClass();
296
297	characterClass->m_ranges.append(CharacterRange(0, '\t' - 1));
298	characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1));
299	characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f));
300	characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f));
301	characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f));
302	characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d));
303	characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff));
304	characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027));
305	characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e));
306	characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e));
307	characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff));
308	characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff));
309
310	return characterClass;
311	}
312
313	CharacterClass* nonwordcharCreate()
314	{
315	CharacterClass* characterClass = new CharacterClass();
316
317	characterClass->m_matches.append('`');
318	characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
319	characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1));
320	characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1));
321	characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f));
322	characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
323
324	return characterClass;
325	}
326
327
328	class RegexPatternConstructor {
329	public:
330	RegexPatternConstructor(RegexPattern& pattern)
331	: m_pattern(pattern)
332	, m_characterClassConstructor(pattern.m_ignoreCase)
333	{
334	}
335
336	~RegexPatternConstructor()
337	{
338	}
339
340	void reset()
341	{
342	m_pattern.reset();
343	m_characterClassConstructor.reset();
344	}
345
346	void assertionBOL()
347	{
348	m_alternative->m_terms.append(PatternTerm::BOL());
349	}
350	void assertionEOL()
351	{
352	m_alternative->m_terms.append(PatternTerm::EOL());
353	}
354	void assertionWordBoundary(bool invert)
355	{
356	m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
357	}
358
359	void atomPatternCharacter(UChar ch)
360	{
361	// We handle case-insensitive checking of unicode characters which do have both
362	// cases by handling them as if they were defined using a CharacterClass.
363	if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
364	atomCharacterClassBegin();
365	atomCharacterClassAtom(ch);
366	atomCharacterClassEnd();
367	} else
368	m_alternative->m_terms.append(PatternTerm(ch));
369	}
370
371	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
372	{
373	switch (classID) {
374	case DigitClassID:
375	m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
376	break;
377	case SpaceClassID:
378	m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
379	break;
380	case WordClassID:
381	m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
382	break;
383	case NewlineClassID:
384	m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
385	break;
386	}
387	}
388
389	void atomCharacterClassBegin(bool invert = false)
390	{
391	m_invertCharacterClass = invert;
392	}
393
394	void atomCharacterClassAtom(UChar ch)
395	{
396	m_characterClassConstructor.putChar(ch);
397	}
398
399	void atomCharacterClassRange(UChar begin, UChar end)
400	{
401	m_characterClassConstructor.putRange(begin, end);
402	}
403
404	void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
405	{
406	ASSERT(classID != NewlineClassID);
407
408	switch (classID) {
409	case DigitClassID:
410	m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
411	break;
412
413	case SpaceClassID:
414	m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
415	break;
416
417	case WordClassID:
418	m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
419	break;
420
421	default:
422	ASSERT_NOT_REACHED();
423	}
424	}
425
426	void atomCharacterClassEnd()
427	{
428	CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
429	m_pattern.m_userCharacterClasses.append(newCharacterClass);
430	m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
431	}
432
433	void atomParenthesesSubpatternBegin(bool capture = true)
434	{
435	unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
436	if (capture)
437	m_pattern.m_numSubpatterns++;
438
439	PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
440	m_pattern.m_disjunctions.append(parenthesesDisjunction);
441	m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture));
442	m_alternative = parenthesesDisjunction->addNewAlternative();
443	}
444
445	void atomParentheticalAssertionBegin(bool invert = false)
446	{
447	PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
448	m_pattern.m_disjunctions.append(parenthesesDisjunction);
449	m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
450	m_alternative = parenthesesDisjunction->addNewAlternative();
451	}
452
453	void atomParenthesesEnd()
454	{
455	ASSERT(m_alternative->m_parent);
456	ASSERT(m_alternative->m_parent->m_parent);
457	m_alternative = m_alternative->m_parent->m_parent;
458
459	m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
460	}
461
462	void atomBackReference(unsigned subpatternId)
463	{
464	ASSERT(subpatternId);
465	m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
466
467	if (subpatternId > m_pattern.m_numSubpatterns)
468	return;
469
470	PatternAlternative* currentAlternative = m_alternative;
471	ASSERT(currentAlternative);
472
473	// Note to self: if we waited until the AST was baked, we could also remove forwards refs
474	while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
475	PatternTerm& term = currentAlternative->lastTerm();
476	ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) \|\| (term.type == PatternTerm::TypeParentheticalAssertion));
477
478	if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId))
479	return;
480	}
481
482	m_alternative->m_terms.append(PatternTerm(subpatternId));
483	}
484
485	PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
486	{
487	PatternDisjunction* newDisjunction = new PatternDisjunction();
488
489	newDisjunction->m_parent = disjunction->m_parent;
490	for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
491	PatternAlternative* alternative = disjunction->m_alternatives[alt];
492	PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
493	for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
494	newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
495	}
496
497	return newDisjunction;
498	}
499
500	PatternTerm copyTerm(PatternTerm& term)
501	{
502	if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
503	return PatternTerm(term);
504
505	PatternTerm termCopy = term;
506	termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
507	return termCopy;
508	}
509
510	void quantifyAtom(unsigned min, unsigned max, bool greedy)
511	{
512	ASSERT(min <= max);
513	ASSERT(m_alternative->m_terms.size());
514
515	if (!max) {
516	m_alternative->removeLastTerm();
517	return;
518	}
519
520	PatternTerm& term = m_alternative->lastTerm();
521	ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
522	ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
523
524	// For any assertion with a zero minimum, not matching is valid and has no effect,
525	// remove it. Otherwise, we need to match as least once, but there is no point
526	// matching more than once, so remove the quantifier. It is not entirely clear
527	// from the spec whether or not this behavior is correct, but I believe this
528	// matches Firefox. :-/
529	if (term.type == PatternTerm::TypeParentheticalAssertion) {
530	if (!min)
531	m_alternative->removeLastTerm();
532	return;
533	}
534
535	if (min == 0)
536	term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
537	else if (min == max)
538	term.quantify(min, QuantifierFixedCount);
539	else {
540	term.quantify(min, QuantifierFixedCount);
541	m_alternative->m_terms.append(copyTerm(term));
542	// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
543	m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
544	if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
545	m_alternative->lastTerm().parentheses.isCopy = true;
546	}
547	}
548
549	void disjunction()
550	{
551	m_alternative = m_alternative->m_parent->addNewAlternative();
552	}
553
554	void regexBegin()
555	{
556	m_pattern.m_body = new PatternDisjunction();
557	m_alternative = m_pattern.m_body->addNewAlternative();
558	m_pattern.m_disjunctions.append(m_pattern.m_body);
559	}
560	void regexEnd()
561	{
562	}
563	void regexError()
564	{
565	}
566
567	unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
568	{
569	alternative->m_hasFixedSize = true;
570	unsigned currentInputPosition = initialInputPosition;
571
572	for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
573	PatternTerm& term = alternative->m_terms[i];
574
575	switch (term.type) {
576	case PatternTerm::TypeAssertionBOL:
577	case PatternTerm::TypeAssertionEOL:
578	case PatternTerm::TypeAssertionWordBoundary:
579	term.inputPosition = currentInputPosition;
580	break;
581
582	case PatternTerm::TypeBackReference:
583	term.inputPosition = currentInputPosition;
584	term.frameLocation = currentCallFrameSize;
585	currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference;
586	alternative->m_hasFixedSize = false;
587	break;
588
589	case PatternTerm::TypePatternCharacter:
590	term.inputPosition = currentInputPosition;
591	if (term.quantityType != QuantifierFixedCount) {
592	term.frameLocation = currentCallFrameSize;
593	currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter;
594	alternative->m_hasFixedSize = false;
595	} else
596	currentInputPosition += term.quantityCount;
597	break;
598
599	case PatternTerm::TypeCharacterClass:
600	term.inputPosition = currentInputPosition;
601	if (term.quantityType != QuantifierFixedCount) {
602	term.frameLocation = currentCallFrameSize;
603	currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass;
604	alternative->m_hasFixedSize = false;
605	} else
606	currentInputPosition += term.quantityCount;
607	break;
608
609	case PatternTerm::TypeParenthesesSubpattern:
610	// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
611	term.frameLocation = currentCallFrameSize;
612	if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
613	if (term.quantityType == QuantifierFixedCount) {
614	currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
615	currentInputPosition += term.parentheses.disjunction->m_minimumSize;
616	} else {
617	currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
618	currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
619	}
620	term.inputPosition = currentInputPosition;
621	} else {
622	term.inputPosition = currentInputPosition;
623	setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
624	currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses;
625	}
626	// Fixed count of 1 could be accepted, if they have a fixed size AND if all alternatives are of the same length.
627	alternative->m_hasFixedSize = false;
628	break;
629
630	case PatternTerm::TypeParentheticalAssertion:
631	term.inputPosition = currentInputPosition;
632	term.frameLocation = currentCallFrameSize;
633	currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
634	break;
635	}
636	}
637
638	alternative->m_minimumSize = currentInputPosition - initialInputPosition;
639	return currentCallFrameSize;
640	}
641
642	unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
643	{
644	if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
645	initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative;
646
647	unsigned minimumInputSize = UINT_MAX;
648	unsigned maximumCallFrameSize = 0;
649	bool hasFixedSize = true;
650
651	for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
652	PatternAlternative* alternative = disjunction->m_alternatives[alt];
653	unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
654	minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
655	maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
656	hasFixedSize &= alternative->m_hasFixedSize;
657	}
658
659	ASSERT(minimumInputSize != UINT_MAX);
660	ASSERT(maximumCallFrameSize >= initialCallFrameSize);
661
662	disjunction->m_hasFixedSize = hasFixedSize;
663	disjunction->m_minimumSize = minimumInputSize;
664	disjunction->m_callFrameSize = maximumCallFrameSize;
665	return maximumCallFrameSize;
666	}
667
668	void setupOffsets()
669	{
670	setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
671	}
672
673	private:
674	RegexPattern& m_pattern;
675	PatternAlternative* m_alternative;
676	CharacterClassConstructor m_characterClassConstructor;
677	bool m_invertCharacterClass;
678	};
679
680
681	const char* compileRegex(const UString& patternString, RegexPattern& pattern)
682	{
683	RegexPatternConstructor constructor(pattern);
684
685	if (const char* error = parse(constructor, patternString))
686	return error;
687
688	// If the pattern contains illegal backreferences reset & reparse.
689	// Quoting Netscape's "What's new in JavaScript 1.2",
690	// "Note: if the number of left parentheses is less than the number specified
691	// in \#, the \# is taken as an octal escape as described in the next row."
692	if (pattern.containsIllegalBackReference()) {
693	unsigned numSubpatterns = pattern.m_numSubpatterns;
694
695	constructor.reset();
696	#ifndef NDEBUG
697	const char* error =
698	#endif
699	parse(constructor, patternString, numSubpatterns);
700
701	ASSERT(!error);
702	ASSERT(numSubpatterns == pattern.m_numSubpatterns);
703	}
704
705	constructor.setupOffsets();
706
707	return false;
708	};
709
710
711	} }
712
713	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: webkit/trunk/JavaScriptCore/yarr/RegexCompiler.cpp@ 42853

Download in other formats: