1 | /*
|
---|
2 | * Copyright (C) 2009 Apple Inc. All rights reserved.
|
---|
3 | *
|
---|
4 | * Redistribution and use in source and binary forms, with or without
|
---|
5 | * modification, are permitted provided that the following conditions
|
---|
6 | * are met:
|
---|
7 | * 1. Redistributions of source code must retain the above copyright
|
---|
8 | * notice, this list of conditions and the following disclaimer.
|
---|
9 | * 2. Redistributions in binary form must reproduce the above copyright
|
---|
10 | * notice, this list of conditions and the following disclaimer in the
|
---|
11 | * documentation and/or other materials provided with the distribution.
|
---|
12 | *
|
---|
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
---|
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
---|
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
---|
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
---|
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
---|
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
---|
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
---|
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
---|
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
---|
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
---|
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
24 | */
|
---|
25 |
|
---|
26 | #ifndef RegexParser_h
|
---|
27 | #define RegexParser_h
|
---|
28 |
|
---|
29 | #include <wtf/Platform.h>
|
---|
30 |
|
---|
31 | #if ENABLE(YARR)
|
---|
32 |
|
---|
33 | #include <UString.h>
|
---|
34 | #include <wtf/ASCIICType.h>
|
---|
35 | #include <wtf/unicode/Unicode.h>
|
---|
36 | #include <limits.h>
|
---|
37 |
|
---|
38 | namespace JSC { namespace Yarr {
|
---|
39 |
|
---|
40 | enum BuiltInCharacterClassID {
|
---|
41 | DigitClassID,
|
---|
42 | SpaceClassID,
|
---|
43 | WordClassID,
|
---|
44 | NewlineClassID,
|
---|
45 | };
|
---|
46 |
|
---|
47 | // The Parser class should not be used directly - only via the Yarr::parse() method.
|
---|
48 | template<class Delegate>
|
---|
49 | class Parser {
|
---|
50 | private:
|
---|
51 | template<class FriendDelegate>
|
---|
52 | friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
|
---|
53 |
|
---|
54 | enum ErrorCode {
|
---|
55 | NoError,
|
---|
56 | PatternTooLarge,
|
---|
57 | QuantifierOutOfOrder,
|
---|
58 | QuantifierWithoutAtom,
|
---|
59 | MissingParentheses,
|
---|
60 | ParenthesesUnmatched,
|
---|
61 | ParenthesesTypeInvalid,
|
---|
62 | CharacterClassUnmatched,
|
---|
63 | CharacterClassOutOfOrder,
|
---|
64 | EscapeUnterminated,
|
---|
65 | NumberOfErrorCodes
|
---|
66 | };
|
---|
67 |
|
---|
68 | /*
|
---|
69 | * CharacterClassParserDelegate:
|
---|
70 | *
|
---|
71 | * The class CharacterClassParserDelegate is used in the parsing of character
|
---|
72 | * classes. This class handles detection of character ranges. This class
|
---|
73 | * implements enough of the delegate interface such that it can be passed to
|
---|
74 | * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
|
---|
75 | * to perform the parsing of escape characters in character sets.
|
---|
76 | */
|
---|
77 | class CharacterClassParserDelegate {
|
---|
78 | public:
|
---|
79 | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
|
---|
80 | : m_delegate(delegate)
|
---|
81 | , m_err(err)
|
---|
82 | , m_state(empty)
|
---|
83 | {
|
---|
84 | }
|
---|
85 |
|
---|
86 | /*
|
---|
87 | * begin():
|
---|
88 | *
|
---|
89 | * Called at beginning of construction.
|
---|
90 | */
|
---|
91 | void begin(bool invert)
|
---|
92 | {
|
---|
93 | m_delegate.atomCharacterClassBegin(invert);
|
---|
94 | }
|
---|
95 |
|
---|
96 | /*
|
---|
97 | * atomPatternCharacterUnescaped():
|
---|
98 | *
|
---|
99 | * This method is called directly from parseCharacterClass(), to report a new
|
---|
100 | * pattern character token. This method differs from atomPatternCharacter(),
|
---|
101 | * which will be called from parseEscape(), since a hypen provided via this
|
---|
102 | * method may be indicating a character range, but a hyphen parsed by
|
---|
103 | * parseEscape() cannot be interpreted as doing so.
|
---|
104 | */
|
---|
105 | void atomPatternCharacterUnescaped(UChar ch)
|
---|
106 | {
|
---|
107 | switch (m_state) {
|
---|
108 | case empty:
|
---|
109 | m_character = ch;
|
---|
110 | m_state = cachedCharacter;
|
---|
111 | break;
|
---|
112 |
|
---|
113 | case cachedCharacter:
|
---|
114 | if (ch == '-')
|
---|
115 | m_state = cachedCharacterHyphen;
|
---|
116 | else {
|
---|
117 | m_delegate.atomCharacterClassAtom(m_character);
|
---|
118 | m_character = ch;
|
---|
119 | }
|
---|
120 | break;
|
---|
121 |
|
---|
122 | case cachedCharacterHyphen:
|
---|
123 | if (ch >= m_character)
|
---|
124 | m_delegate.atomCharacterClassRange(m_character, ch);
|
---|
125 | else
|
---|
126 | m_err = CharacterClassOutOfOrder;
|
---|
127 | m_state = empty;
|
---|
128 | }
|
---|
129 | }
|
---|
130 |
|
---|
131 | /*
|
---|
132 | * atomPatternCharacter():
|
---|
133 | *
|
---|
134 | * Adds a pattern character, called by parseEscape(), as such will not
|
---|
135 | * interpret a hyphen as indicating a character range.
|
---|
136 | */
|
---|
137 | void atomPatternCharacter(UChar ch)
|
---|
138 | {
|
---|
139 | // Flush if a character is already pending to prevent the
|
---|
140 | // hyphen from begin interpreted as indicating a range.
|
---|
141 | if((ch == '-') && (m_state == cachedCharacter))
|
---|
142 | flush();
|
---|
143 |
|
---|
144 | atomPatternCharacterUnescaped(ch);
|
---|
145 | }
|
---|
146 |
|
---|
147 | /*
|
---|
148 | * atomBuiltInCharacterClass():
|
---|
149 | *
|
---|
150 | * Adds a built-in character class, called by parseEscape().
|
---|
151 | */
|
---|
152 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
|
---|
153 | {
|
---|
154 | flush();
|
---|
155 | m_delegate.atomCharacterClassBuiltIn(classID, invert);
|
---|
156 | }
|
---|
157 |
|
---|
158 | /*
|
---|
159 | * end():
|
---|
160 | *
|
---|
161 | * Called at end of construction.
|
---|
162 | */
|
---|
163 | void end()
|
---|
164 | {
|
---|
165 | flush();
|
---|
166 | m_delegate.atomCharacterClassEnd();
|
---|
167 | }
|
---|
168 |
|
---|
169 | // parseEscape() should never call these delegate methods when
|
---|
170 | // invoked with inCharacterClass set.
|
---|
171 | void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
|
---|
172 | void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
|
---|
173 |
|
---|
174 | private:
|
---|
175 | void flush()
|
---|
176 | {
|
---|
177 | if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
|
---|
178 | m_delegate.atomCharacterClassAtom(m_character);
|
---|
179 | if (m_state == cachedCharacterHyphen)
|
---|
180 | m_delegate.atomCharacterClassAtom('-');
|
---|
181 | m_state = empty;
|
---|
182 | }
|
---|
183 |
|
---|
184 | Delegate& m_delegate;
|
---|
185 | ErrorCode& m_err;
|
---|
186 | enum CharacterClassConstructionState {
|
---|
187 | empty,
|
---|
188 | cachedCharacter,
|
---|
189 | cachedCharacterHyphen,
|
---|
190 | } m_state;
|
---|
191 | UChar m_character;
|
---|
192 | };
|
---|
193 |
|
---|
194 | Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
|
---|
195 | : m_delegate(delegate)
|
---|
196 | , m_backReferenceLimit(backReferenceLimit)
|
---|
197 | , m_err(NoError)
|
---|
198 | , m_data(pattern.data())
|
---|
199 | , m_size(pattern.size())
|
---|
200 | , m_index(0)
|
---|
201 | , m_parenthesesNestingDepth(0)
|
---|
202 | {
|
---|
203 | }
|
---|
204 |
|
---|
205 | /*
|
---|
206 | * parseEscape():
|
---|
207 | *
|
---|
208 | * Helper for parseTokens() AND parseCharacterClass().
|
---|
209 | * Unlike the other parser methods, this function does not report tokens
|
---|
210 | * directly to the member delegate (m_delegate), instead tokens are
|
---|
211 | * emitted to the delegate provided as an argument. In the case of atom
|
---|
212 | * escapes, parseTokens() will call parseEscape() passing m_delegate as
|
---|
213 | * an argument, and as such the escape will be reported to the delegate.
|
---|
214 | *
|
---|
215 | * However this method may also be used by parseCharacterClass(), in which
|
---|
216 | * case a CharacterClassParserDelegate will be passed as the delegate that
|
---|
217 | * tokens should be added to. A boolean flag is also provided to indicate
|
---|
218 | * whether that an escape in a CharacterClass is being parsed (some parsing
|
---|
219 | * rules change in this context).
|
---|
220 | *
|
---|
221 | * The boolean value returned by this method indicates whether the token
|
---|
222 | * parsed was an atom (outside of a characted class \b and \B will be
|
---|
223 | * interpreted as assertions).
|
---|
224 | */
|
---|
225 | template<bool inCharacterClass, class EscapeDelegate>
|
---|
226 | bool parseEscape(EscapeDelegate& delegate)
|
---|
227 | {
|
---|
228 | ASSERT(!m_err);
|
---|
229 | ASSERT(peek() == '\\');
|
---|
230 | consume();
|
---|
231 |
|
---|
232 | if (atEndOfPattern()) {
|
---|
233 | m_err = EscapeUnterminated;
|
---|
234 | return false;
|
---|
235 | }
|
---|
236 |
|
---|
237 | switch (peek()) {
|
---|
238 | // Assertions
|
---|
239 | case 'b':
|
---|
240 | consume();
|
---|
241 | if (inCharacterClass)
|
---|
242 | delegate.atomPatternCharacter('\b');
|
---|
243 | else {
|
---|
244 | delegate.assertionWordBoundary(false);
|
---|
245 | return false;
|
---|
246 | }
|
---|
247 | break;
|
---|
248 | case 'B':
|
---|
249 | consume();
|
---|
250 | if (inCharacterClass)
|
---|
251 | delegate.atomPatternCharacter('B');
|
---|
252 | else {
|
---|
253 | delegate.assertionWordBoundary(true);
|
---|
254 | return false;
|
---|
255 | }
|
---|
256 | break;
|
---|
257 |
|
---|
258 | // CharacterClassEscape
|
---|
259 | case 'd':
|
---|
260 | consume();
|
---|
261 | delegate.atomBuiltInCharacterClass(DigitClassID, false);
|
---|
262 | break;
|
---|
263 | case 's':
|
---|
264 | consume();
|
---|
265 | delegate.atomBuiltInCharacterClass(SpaceClassID, false);
|
---|
266 | break;
|
---|
267 | case 'w':
|
---|
268 | consume();
|
---|
269 | delegate.atomBuiltInCharacterClass(WordClassID, false);
|
---|
270 | break;
|
---|
271 | case 'D':
|
---|
272 | consume();
|
---|
273 | delegate.atomBuiltInCharacterClass(DigitClassID, true);
|
---|
274 | break;
|
---|
275 | case 'S':
|
---|
276 | consume();
|
---|
277 | delegate.atomBuiltInCharacterClass(SpaceClassID, true);
|
---|
278 | break;
|
---|
279 | case 'W':
|
---|
280 | consume();
|
---|
281 | delegate.atomBuiltInCharacterClass(WordClassID, true);
|
---|
282 | break;
|
---|
283 |
|
---|
284 | // DecimalEscape
|
---|
285 | case '1':
|
---|
286 | case '2':
|
---|
287 | case '3':
|
---|
288 | case '4':
|
---|
289 | case '5':
|
---|
290 | case '6':
|
---|
291 | case '7':
|
---|
292 | case '8':
|
---|
293 | case '9': {
|
---|
294 | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
|
---|
295 | // First, try to parse this as backreference.
|
---|
296 | if (!inCharacterClass) {
|
---|
297 | ParseState state = saveState();
|
---|
298 |
|
---|
299 | unsigned backReference = consumeNumber();
|
---|
300 | if (backReference <= m_backReferenceLimit) {
|
---|
301 | delegate.atomBackReference(backReference);
|
---|
302 | break;
|
---|
303 | }
|
---|
304 |
|
---|
305 | restoreState(state);
|
---|
306 | }
|
---|
307 |
|
---|
308 | // Not a backreference, and not octal.
|
---|
309 | if (peek() >= '8') {
|
---|
310 | delegate.atomPatternCharacter('\\');
|
---|
311 | break;
|
---|
312 | }
|
---|
313 |
|
---|
314 | // Fall-through to handle this as an octal escape.
|
---|
315 | }
|
---|
316 |
|
---|
317 | // Octal escape
|
---|
318 | case '0':
|
---|
319 | delegate.atomPatternCharacter(consumeOctal());
|
---|
320 | break;
|
---|
321 |
|
---|
322 | // ControlEscape
|
---|
323 | case 'f':
|
---|
324 | consume();
|
---|
325 | delegate.atomPatternCharacter('\f');
|
---|
326 | break;
|
---|
327 | case 'n':
|
---|
328 | consume();
|
---|
329 | delegate.atomPatternCharacter('\n');
|
---|
330 | break;
|
---|
331 | case 'r':
|
---|
332 | consume();
|
---|
333 | delegate.atomPatternCharacter('\r');
|
---|
334 | break;
|
---|
335 | case 't':
|
---|
336 | consume();
|
---|
337 | delegate.atomPatternCharacter('\t');
|
---|
338 | break;
|
---|
339 | case 'v':
|
---|
340 | consume();
|
---|
341 | delegate.atomPatternCharacter('\v');
|
---|
342 | break;
|
---|
343 |
|
---|
344 | // ControlLetter
|
---|
345 | case 'c': {
|
---|
346 | ParseState state = saveState();
|
---|
347 | consume();
|
---|
348 | if (!atEndOfPattern()) {
|
---|
349 | int control = consume();
|
---|
350 |
|
---|
351 | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
|
---|
352 | if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
|
---|
353 | delegate.atomPatternCharacter(control & 0x1f);
|
---|
354 | break;
|
---|
355 | }
|
---|
356 | }
|
---|
357 | restoreState(state);
|
---|
358 | delegate.atomPatternCharacter('\\');
|
---|
359 | break;
|
---|
360 | }
|
---|
361 |
|
---|
362 | // HexEscape
|
---|
363 | case 'x': {
|
---|
364 | consume();
|
---|
365 | int x = tryConsumeHex(2);
|
---|
366 | if (x == -1)
|
---|
367 | delegate.atomPatternCharacter('x');
|
---|
368 | else
|
---|
369 | delegate.atomPatternCharacter(x);
|
---|
370 | break;
|
---|
371 | }
|
---|
372 |
|
---|
373 | // UnicodeEscape
|
---|
374 | case 'u': {
|
---|
375 | consume();
|
---|
376 | int u = tryConsumeHex(4);
|
---|
377 | if (u == -1)
|
---|
378 | delegate.atomPatternCharacter('u');
|
---|
379 | else
|
---|
380 | delegate.atomPatternCharacter(u);
|
---|
381 | break;
|
---|
382 | }
|
---|
383 |
|
---|
384 | // IdentityEscape
|
---|
385 | default:
|
---|
386 | delegate.atomPatternCharacter(consume());
|
---|
387 | }
|
---|
388 |
|
---|
389 | return true;
|
---|
390 | }
|
---|
391 |
|
---|
392 | /*
|
---|
393 | * parseAtomEscape(), parseCharacterClassEscape():
|
---|
394 | *
|
---|
395 | * These methods alias to parseEscape().
|
---|
396 | */
|
---|
397 | bool parseAtomEscape()
|
---|
398 | {
|
---|
399 | return parseEscape<false>(m_delegate);
|
---|
400 | }
|
---|
401 | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
|
---|
402 | {
|
---|
403 | parseEscape<true>(delegate);
|
---|
404 | }
|
---|
405 |
|
---|
406 | /*
|
---|
407 | * parseCharacterClass():
|
---|
408 | *
|
---|
409 | * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
|
---|
410 | * to an instance of CharacterClassParserDelegate, to describe the character class to the
|
---|
411 | * delegate.
|
---|
412 | */
|
---|
413 | void parseCharacterClass()
|
---|
414 | {
|
---|
415 | ASSERT(!m_err);
|
---|
416 | ASSERT(peek() == '[');
|
---|
417 | consume();
|
---|
418 |
|
---|
419 | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
|
---|
420 |
|
---|
421 | characterClassConstructor.begin(tryConsume('^'));
|
---|
422 |
|
---|
423 | while (!atEndOfPattern()) {
|
---|
424 | switch (peek()) {
|
---|
425 | case ']':
|
---|
426 | consume();
|
---|
427 | characterClassConstructor.end();
|
---|
428 | return;
|
---|
429 |
|
---|
430 | case '\\':
|
---|
431 | parseCharacterClassEscape(characterClassConstructor);
|
---|
432 | break;
|
---|
433 |
|
---|
434 | default:
|
---|
435 | characterClassConstructor.atomPatternCharacterUnescaped(consume());
|
---|
436 | }
|
---|
437 |
|
---|
438 | if (m_err)
|
---|
439 | return;
|
---|
440 | }
|
---|
441 |
|
---|
442 | m_err = CharacterClassUnmatched;
|
---|
443 | }
|
---|
444 |
|
---|
445 | /*
|
---|
446 | * parseParenthesesBegin():
|
---|
447 | *
|
---|
448 | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
|
---|
449 | */
|
---|
450 | void parseParenthesesBegin()
|
---|
451 | {
|
---|
452 | ASSERT(!m_err);
|
---|
453 | ASSERT(peek() == '(');
|
---|
454 | consume();
|
---|
455 |
|
---|
456 | if (tryConsume('?')) {
|
---|
457 | if (atEndOfPattern()) {
|
---|
458 | m_err = ParenthesesTypeInvalid;
|
---|
459 | return;
|
---|
460 | }
|
---|
461 |
|
---|
462 | switch (consume()) {
|
---|
463 | case ':':
|
---|
464 | m_delegate.atomParenthesesSubpatternBegin(false);
|
---|
465 | break;
|
---|
466 |
|
---|
467 | case '=':
|
---|
468 | m_delegate.atomParentheticalAssertionBegin();
|
---|
469 | break;
|
---|
470 |
|
---|
471 | case '!':
|
---|
472 | m_delegate.atomParentheticalAssertionBegin(true);
|
---|
473 | break;
|
---|
474 |
|
---|
475 | default:
|
---|
476 | m_err = ParenthesesTypeInvalid;
|
---|
477 | }
|
---|
478 | } else
|
---|
479 | m_delegate.atomParenthesesSubpatternBegin();
|
---|
480 |
|
---|
481 | ++m_parenthesesNestingDepth;
|
---|
482 | }
|
---|
483 |
|
---|
484 | /*
|
---|
485 | * parseParenthesesEnd():
|
---|
486 | *
|
---|
487 | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
|
---|
488 | */
|
---|
489 | void parseParenthesesEnd()
|
---|
490 | {
|
---|
491 | ASSERT(!m_err);
|
---|
492 | ASSERT(peek() == ')');
|
---|
493 | consume();
|
---|
494 |
|
---|
495 | if (m_parenthesesNestingDepth > 0)
|
---|
496 | m_delegate.atomParenthesesEnd();
|
---|
497 | else
|
---|
498 | m_err = ParenthesesUnmatched;
|
---|
499 |
|
---|
500 | --m_parenthesesNestingDepth;
|
---|
501 | }
|
---|
502 |
|
---|
503 | /*
|
---|
504 | * parseQuantifier():
|
---|
505 | *
|
---|
506 | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
|
---|
507 | */
|
---|
508 | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
|
---|
509 | {
|
---|
510 | ASSERT(!m_err);
|
---|
511 | ASSERT(min <= max);
|
---|
512 |
|
---|
513 | if (lastTokenWasAnAtom)
|
---|
514 | m_delegate.quantifyAtom(min, max, !tryConsume('?'));
|
---|
515 | else
|
---|
516 | m_err = QuantifierWithoutAtom;
|
---|
517 | }
|
---|
518 |
|
---|
519 | /*
|
---|
520 | * parseTokens():
|
---|
521 | *
|
---|
522 | * This method loops over the input pattern reporting tokens to the delegate.
|
---|
523 | * The method returns when a parse error is detected, or the end of the pattern
|
---|
524 | * is reached. One piece of state is tracked around the loop, which is whether
|
---|
525 | * the last token passed to the delegate was an atom (this is necessary to detect
|
---|
526 | * a parse error when a quantifier provided without an atom to quantify).
|
---|
527 | */
|
---|
528 | void parseTokens()
|
---|
529 | {
|
---|
530 | bool lastTokenWasAnAtom = false;
|
---|
531 |
|
---|
532 | while (!atEndOfPattern()) {
|
---|
533 | switch (peek()) {
|
---|
534 | case '|':
|
---|
535 | consume();
|
---|
536 | m_delegate.disjunction();
|
---|
537 | lastTokenWasAnAtom = false;
|
---|
538 | break;
|
---|
539 |
|
---|
540 | case '(':
|
---|
541 | parseParenthesesBegin();
|
---|
542 | lastTokenWasAnAtom = false;
|
---|
543 | break;
|
---|
544 |
|
---|
545 | case ')':
|
---|
546 | parseParenthesesEnd();
|
---|
547 | lastTokenWasAnAtom = true;
|
---|
548 | break;
|
---|
549 |
|
---|
550 | case '^':
|
---|
551 | consume();
|
---|
552 | m_delegate.assertionBOL();
|
---|
553 | lastTokenWasAnAtom = false;
|
---|
554 | break;
|
---|
555 |
|
---|
556 | case '$':
|
---|
557 | consume();
|
---|
558 | m_delegate.assertionEOL();
|
---|
559 | lastTokenWasAnAtom = false;
|
---|
560 | break;
|
---|
561 |
|
---|
562 | case '.':
|
---|
563 | consume();
|
---|
564 | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
|
---|
565 | lastTokenWasAnAtom = true;
|
---|
566 | break;
|
---|
567 |
|
---|
568 | case '[':
|
---|
569 | parseCharacterClass();
|
---|
570 | lastTokenWasAnAtom = true;
|
---|
571 | break;
|
---|
572 |
|
---|
573 | case '\\':
|
---|
574 | lastTokenWasAnAtom = parseAtomEscape();
|
---|
575 | break;
|
---|
576 |
|
---|
577 | case '*':
|
---|
578 | consume();
|
---|
579 | parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
|
---|
580 | lastTokenWasAnAtom = false;
|
---|
581 | break;
|
---|
582 |
|
---|
583 | case '+':
|
---|
584 | consume();
|
---|
585 | parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
|
---|
586 | lastTokenWasAnAtom = false;
|
---|
587 | break;
|
---|
588 |
|
---|
589 | case '?':
|
---|
590 | consume();
|
---|
591 | parseQuantifier(lastTokenWasAnAtom, 0, 1);
|
---|
592 | lastTokenWasAnAtom = false;
|
---|
593 | break;
|
---|
594 |
|
---|
595 | case '{': {
|
---|
596 | ParseState state = saveState();
|
---|
597 |
|
---|
598 | consume();
|
---|
599 | if (peekIsDigit()) {
|
---|
600 | unsigned min = consumeNumber();
|
---|
601 | unsigned max = min;
|
---|
602 |
|
---|
603 | if (tryConsume(','))
|
---|
604 | max = peekIsDigit() ? consumeNumber() : UINT_MAX;
|
---|
605 |
|
---|
606 | if (tryConsume('}')) {
|
---|
607 | if (min <= max)
|
---|
608 | parseQuantifier(lastTokenWasAnAtom, min, max);
|
---|
609 | else
|
---|
610 | m_err = QuantifierOutOfOrder;
|
---|
611 | lastTokenWasAnAtom = false;
|
---|
612 | break;
|
---|
613 | }
|
---|
614 | }
|
---|
615 |
|
---|
616 | restoreState(state);
|
---|
617 | } // if we did not find a complete quantifer, fall through to the default case.
|
---|
618 |
|
---|
619 | default:
|
---|
620 | m_delegate.atomPatternCharacter(consume());
|
---|
621 | lastTokenWasAnAtom = true;
|
---|
622 | }
|
---|
623 |
|
---|
624 | if (m_err)
|
---|
625 | return;
|
---|
626 | }
|
---|
627 |
|
---|
628 | if (m_parenthesesNestingDepth > 0)
|
---|
629 | m_err = MissingParentheses;
|
---|
630 | }
|
---|
631 |
|
---|
632 | /*
|
---|
633 | * parse():
|
---|
634 | *
|
---|
635 | * This method calls regexBegin(), calls parseTokens() to parse over the input
|
---|
636 | * patterns, calls regexEnd() or regexError() as appropriate, and converts any
|
---|
637 | * error code to a const char* for a result.
|
---|
638 | */
|
---|
639 | const char* parse()
|
---|
640 | {
|
---|
641 | m_delegate.regexBegin();
|
---|
642 |
|
---|
643 | if (m_size > MAX_PATTERN_SIZE)
|
---|
644 | m_err = PatternTooLarge;
|
---|
645 | else
|
---|
646 | parseTokens();
|
---|
647 | ASSERT(atEndOfPattern() || m_err);
|
---|
648 |
|
---|
649 | if (m_err)
|
---|
650 | m_delegate.regexError();
|
---|
651 | else
|
---|
652 | m_delegate.regexEnd();
|
---|
653 |
|
---|
654 | // The order of this array must match the ErrorCode enum.
|
---|
655 | static const char* errorMessages[NumberOfErrorCodes] = {
|
---|
656 | 0, // NoError
|
---|
657 | "regular expression too large",
|
---|
658 | "numbers out of order in {} quantifier",
|
---|
659 | "nothing to repeat",
|
---|
660 | "missing )",
|
---|
661 | "unmatched parentheses",
|
---|
662 | "unrecognized character after (?",
|
---|
663 | "missing terminating ] for character class",
|
---|
664 | "range out of order in character class",
|
---|
665 | "\\ at end of pattern"
|
---|
666 | };
|
---|
667 |
|
---|
668 | return errorMessages[m_err];
|
---|
669 | }
|
---|
670 |
|
---|
671 |
|
---|
672 | // Misc helper functions:
|
---|
673 |
|
---|
674 | typedef unsigned ParseState;
|
---|
675 |
|
---|
676 | ParseState saveState()
|
---|
677 | {
|
---|
678 | return m_index;
|
---|
679 | }
|
---|
680 |
|
---|
681 | void restoreState(ParseState state)
|
---|
682 | {
|
---|
683 | m_index = state;
|
---|
684 | }
|
---|
685 |
|
---|
686 | bool atEndOfPattern()
|
---|
687 | {
|
---|
688 | ASSERT(m_index <= m_size);
|
---|
689 | return m_index == m_size;
|
---|
690 | }
|
---|
691 |
|
---|
692 | int peek()
|
---|
693 | {
|
---|
694 | ASSERT(m_index < m_size);
|
---|
695 | return m_data[m_index];
|
---|
696 | }
|
---|
697 |
|
---|
698 | bool peekIsDigit()
|
---|
699 | {
|
---|
700 | return !atEndOfPattern() && WTF::isASCIIDigit(peek());
|
---|
701 | }
|
---|
702 |
|
---|
703 | unsigned peekDigit()
|
---|
704 | {
|
---|
705 | ASSERT(peekIsDigit());
|
---|
706 | return peek() - '0';
|
---|
707 | }
|
---|
708 |
|
---|
709 | int consume()
|
---|
710 | {
|
---|
711 | ASSERT(m_index < m_size);
|
---|
712 | return m_data[m_index++];
|
---|
713 | }
|
---|
714 |
|
---|
715 | unsigned consumeDigit()
|
---|
716 | {
|
---|
717 | ASSERT(peekIsDigit());
|
---|
718 | return consume() - '0';
|
---|
719 | }
|
---|
720 |
|
---|
721 | unsigned consumeNumber()
|
---|
722 | {
|
---|
723 | unsigned n = consumeDigit();
|
---|
724 | // check for overflow.
|
---|
725 | for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
|
---|
726 | n = newValue;
|
---|
727 | consume();
|
---|
728 | }
|
---|
729 | return n;
|
---|
730 | }
|
---|
731 |
|
---|
732 | unsigned consumeOctal()
|
---|
733 | {
|
---|
734 | ASSERT(WTF::isASCIIOctalDigit(peek()));
|
---|
735 |
|
---|
736 | unsigned n = consumeDigit();
|
---|
737 | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
|
---|
738 | n = n * 8 + consumeDigit();
|
---|
739 | return n;
|
---|
740 | }
|
---|
741 |
|
---|
742 | bool tryConsume(UChar ch)
|
---|
743 | {
|
---|
744 | if (atEndOfPattern() || (m_data[m_index] != ch))
|
---|
745 | return false;
|
---|
746 | ++m_index;
|
---|
747 | return true;
|
---|
748 | }
|
---|
749 |
|
---|
750 | int tryConsumeHex(int count)
|
---|
751 | {
|
---|
752 | ParseState state = saveState();
|
---|
753 |
|
---|
754 | int n = 0;
|
---|
755 | while (count--) {
|
---|
756 | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
|
---|
757 | restoreState(state);
|
---|
758 | return -1;
|
---|
759 | }
|
---|
760 | n = (n << 4) | WTF::toASCIIHexValue(consume());
|
---|
761 | }
|
---|
762 | return n;
|
---|
763 | }
|
---|
764 |
|
---|
765 | Delegate& m_delegate;
|
---|
766 | unsigned m_backReferenceLimit;
|
---|
767 | ErrorCode m_err;
|
---|
768 | const UChar* m_data;
|
---|
769 | unsigned m_size;
|
---|
770 | unsigned m_index;
|
---|
771 | unsigned m_parenthesesNestingDepth;
|
---|
772 |
|
---|
773 | // Derived by empirical testing of compile time in PCRE and WREC.
|
---|
774 | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
|
---|
775 | };
|
---|
776 |
|
---|
777 | /*
|
---|
778 | * Yarr::parse():
|
---|
779 | *
|
---|
780 | * The parse method is passed a pattern to be parsed and a delegate upon which
|
---|
781 | * callbacks will be made to record the parsed tokens forming the regex.
|
---|
782 | * Yarr::parse() returns null on success, or a const C string providing an error
|
---|
783 | * message where a parse error occurs.
|
---|
784 | *
|
---|
785 | * The Delegate must implement the following interface:
|
---|
786 | *
|
---|
787 | * void assertionBOL();
|
---|
788 | * void assertionEOL();
|
---|
789 | * void assertionWordBoundary(bool invert);
|
---|
790 | *
|
---|
791 | * void atomPatternCharacter(UChar ch);
|
---|
792 | * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
|
---|
793 | * void atomCharacterClassBegin(bool invert)
|
---|
794 | * void atomCharacterClassAtom(UChar ch)
|
---|
795 | * void atomCharacterClassRange(UChar begin, UChar end)
|
---|
796 | * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
|
---|
797 | * void atomCharacterClassEnd()
|
---|
798 | * void atomParenthesesSubpatternBegin(bool capture = true);
|
---|
799 | * void atomParentheticalAssertionBegin(bool invert = false);
|
---|
800 | * void atomParenthesesEnd();
|
---|
801 | * void atomBackReference(unsigned subpatternId);
|
---|
802 | *
|
---|
803 | * void quantifyAtom(unsigned min, unsigned max, bool greedy);
|
---|
804 | *
|
---|
805 | * void disjunction();
|
---|
806 | *
|
---|
807 | * void regexBegin();
|
---|
808 | * void regexEnd();
|
---|
809 | * void regexError();
|
---|
810 | *
|
---|
811 | * Before any call recording tokens are made, regexBegin() will be called on the
|
---|
812 | * delegate once. Once parsing is complete either regexEnd() or regexError() will
|
---|
813 | * be called, as appropriate.
|
---|
814 | *
|
---|
815 | * The regular expression is described by a sequence of assertion*() and atom*()
|
---|
816 | * callbacks to the delegate, describing the terms in the regular expression.
|
---|
817 | * Following an atom a quantifyAtom() call may occur to indicate that the previous
|
---|
818 | * atom should be quantified. In the case of atoms described across multiple
|
---|
819 | * calls (parentheses and character classes) the call to quantifyAtom() will come
|
---|
820 | * after the call to the atom*End() method, never after atom*Begin().
|
---|
821 | *
|
---|
822 | * Character classes may either be described by a single call to
|
---|
823 | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
|
---|
824 | * In the latter case, ...Begin() will be called, followed by a sequence of
|
---|
825 | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
|
---|
826 | *
|
---|
827 | * Sequences of atoms and assertions are broken into alternatives via calls to
|
---|
828 | * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
|
---|
829 | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
|
---|
830 | * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
|
---|
831 | * capturing subpattern, this will be the subpatternId associated with these
|
---|
832 | * parentheses, and will also by definition be the lowest subpatternId of these
|
---|
833 | * parentheses and of any nested paretheses. The atomParenthesesEnd() method
|
---|
834 | * is passed the subpatternId of the last capturing subexpression nested within
|
---|
835 | * these paretheses. In the case of a capturing subpattern with no nested
|
---|
836 | * capturing subpatterns, the same subpatternId will be passed to the begin and
|
---|
837 | * end functions. In the case of non-capturing subpatterns the subpatternId
|
---|
838 | * passed to the begin method is also the first possible subpatternId that might
|
---|
839 | * be nested within these paretheses. If a set of non-capturing parentheses does
|
---|
840 | * not contain any capturing subpatterns, then the subpatternId passed to begin
|
---|
841 | * will be greater than the subpatternId passed to end.
|
---|
842 | */
|
---|
843 |
|
---|
844 | template<class Delegate>
|
---|
845 | const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
|
---|
846 | {
|
---|
847 | return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
|
---|
848 | }
|
---|
849 |
|
---|
850 | } } // namespace JSC::Yarr
|
---|
851 |
|
---|
852 | #endif
|
---|
853 |
|
---|
854 | #endif // RegexParser_h
|
---|