source: webkit/trunk/JavaScriptCore/kjs/lexer.cpp@ 2772

Last change on this file since 2772 was 2772, checked in by darin, 23 years ago
  • a few more globals for often-used property names
  • conversion to Identifier from UString must now be explicit
  • kjs/error_object.cpp:
  • kjs/function.cpp:
  • kjs/function_object.cpp:
  • kjs/identifier.cpp:
  • kjs/identifier.h:
  • kjs/lexer.cpp:
  • kjs/nodes.cpp:
  • kjs/number_object.cpp:
  • kjs/object.cpp:
  • kjs/object.h:
  • kjs/string_object.cpp:
  • kjs/testkjs.cpp:
  • kjs/ustring.cpp:
  • kjs/ustring.h:
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1// -*- c-basic-offset: 2 -*-
2/*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2000 Harri Porten ([email protected])
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 */
22
23#ifdef HAVE_CONFIG_H
24#include <config.h>
25#endif
26
27#include <ctype.h>
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <assert.h>
32
33#include "value.h"
34#include "object.h"
35#include "types.h"
36#include "interpreter.h"
37#include "nodes.h"
38#include "lexer.h"
39#include "identifier.h"
40#include "lookup.h"
41#include "internal.h"
42
43// we can't specify the namespace in yacc's C output, so do it here
44using namespace KJS;
45
// File-local pointer to the current Lexer instance. Every Lexer constructor
// stores `this` here, and Lexer::curr() lazily allocates an instance when the
// pointer is still null.
46static Lexer *currLexer = 0;
47
48#ifndef KDE_USE_FINAL
49#include "grammar.h"
50#endif
51
52#include "lexer.lut.h"
53
// Defined outside this file; Lexer::lex() writes the current line number
// into its first_line and last_line fields before returning a token.
54extern YYLTYPE yylloc; // global bison variable holding token info
55
56// a bridge for yacc from the C world to C++
57int kjsyylex()
58{
59 return Lexer::curr()->lex();
60}
61
62Lexer::Lexer()
63 : yylineno(1),
64 size8(128), size16(128), restrKeyword(false),
65 eatNextIdentifier(false), stackToken(-1), lastToken(-1), pos(0),
66 code(0), length(0),
67#ifndef KJS_PURE_ECMA
68 bol(true),
69#endif
70 current(0), next1(0), next2(0), next3(0)
71{
72 // allocate space for read buffers
73 buffer8 = new char[size8];
74 buffer16 = new UChar[size16];
75 currLexer = this;
76
77}
78
79Lexer::~Lexer()
80{
81 delete [] buffer8;
82 delete [] buffer16;
83}
84
85Lexer *Lexer::curr()
86{
87 if (!currLexer) {
88 // create singleton instance
89 currLexer = new Lexer();
90 }
91 return currLexer;
92}
93
#ifdef KJS_DEBUG_MEM
// Destroys the current lexer (if any) and resets the global pointer so a
// later Lexer::curr() call starts from scratch. Only built for memory
// debugging.
void Lexer::globalClear()
{
  Lexer *doomed = currLexer;
  currLexer = 0L;
  delete doomed;
}
#endif
101
102void Lexer::setCode(const UChar *c, unsigned int len)
103{
104 yylineno = 1;
105 restrKeyword = false;
106 delimited = false;
107 eatNextIdentifier = false;
108 stackToken = -1;
109 lastToken = -1;
110 pos = 0;
111 code = c;
112 length = len;
113 skipLF = false;
114 skipCR = false;
115#ifndef KJS_PURE_ECMA
116 bol = true;
117#endif
118
119 // read first characters
120 current = (length > 0) ? code[0].unicode() : 0;
121 next1 = (length > 1) ? code[1].unicode() : 0;
122 next2 = (length > 2) ? code[2].unicode() : 0;
123 next3 = (length > 3) ? code[3].unicode() : 0;
124}
125
126void Lexer::shift(unsigned int p)
127{
128 while (p--) {
129 pos++;
130 current = next1;
131 next1 = next2;
132 next2 = next3;
133 next3 = (pos + 3 < length) ? code[pos+3].unicode() : 0;
134 }
135}
136
137// called on each new line
138void Lexer::nextLine()
139{
140 yylineno++;
141#ifndef KJS_PURE_ECMA
142 bol = true;
143#endif
144}
145
146void Lexer::setDone(State s)
147{
148 state = s;
149 done = true;
150}
151
152int Lexer::lex()
153{
154 int token = 0;
155 state = Start;
156 unsigned short stringType = 0; // either single or double quotes
157 pos8 = pos16 = 0;
158 done = false;
159 terminator = false;
160 skipLF = false;
161 skipCR = false;
162
163 // did we push a token on the stack previously ?
164 // (after an automatic semicolon insertion)
165 if (stackToken >= 0) {
166 setDone(Other);
167 token = stackToken;
168 stackToken = 0;
169 }
170
171 while (!done) {
172 if (skipLF && current != '\n') // found \r but not \n afterwards
173 skipLF = false;
174 if (skipCR && current != '\r') // found \n but not \r afterwards
175 skipCR = false;
176 if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one
177 {
178 skipLF = false;
179 skipCR = false;
180 shift(1);
181 }
182 switch (state) {
183 case Start:
184 if (isWhiteSpace()) {
185 // do nothing
186 } else if (current == '/' && next1 == '/') {
187 shift(1);
188 state = InSingleLineComment;
189 } else if (current == '/' && next1 == '*') {
190 shift(1);
191 state = InMultiLineComment;
192 } else if (current == 0) {
193 if (!terminator && !delimited) {
194 // automatic semicolon insertion if program incomplete
195 token = ';';
196 stackToken = 0;
197 setDone(Other);
198 } else
199 setDone(Eof);
200 } else if (isLineTerminator()) {
201 nextLine();
202 terminator = true;
203 if (restrKeyword) {
204 token = ';';
205 setDone(Other);
206 }
207 } else if (current == '"' || current == '\'') {
208 state = InString;
209 stringType = current;
210 } else if (isIdentLetter(current)) {
211 record16(current);
212 state = InIdentifier;
213 } else if (current == '0') {
214 record8(current);
215 state = InNum0;
216 } else if (isDecimalDigit(current)) {
217 record8(current);
218 state = InNum;
219 } else if (current == '.' && isDecimalDigit(next1)) {
220 record8(current);
221 state = InDecimal;
222#ifndef KJS_PURE_ECMA
223 // <!-- marks the beginning of a line comment (for www usage)
224 } else if (current == '<' && next1 == '!' &&
225 next2 == '-' && next3 == '-') {
226 shift(3);
227 state = InSingleLineComment;
228 // same for -->
229 } else if (bol && current == '-' && next1 == '-' && next2 == '>') {
230 shift(2);
231 state = InSingleLineComment;
232#endif
233 } else {
234 token = matchPunctuator(current, next1, next2, next3);
235 if (token != -1) {
236 setDone(Other);
237 } else {
238 // cerr << "encountered unknown character" << endl;
239 setDone(Bad);
240 }
241 }
242 break;
243 case InString:
244 if (current == stringType) {
245 shift(1);
246 setDone(String);
247 } else if (current == 0 || isLineTerminator()) {
248 setDone(Bad);
249 } else if (current == '\\') {
250 state = InEscapeSequence;
251 } else {
252 record16(current);
253 }
254 break;
255 // Escape Sequences inside of strings
256 case InEscapeSequence:
257 if (isOctalDigit(current)) {
258 if (current >= '0' && current <= '3' &&
259 isOctalDigit(next1) && isOctalDigit(next2)) {
260 record16(convertOctal(current, next1, next2));
261 shift(2);
262 state = InString;
263 } else if (isOctalDigit(current) && isOctalDigit(next1)) {
264 record16(convertOctal('0', current, next1));
265 shift(1);
266 state = InString;
267 } else if (isOctalDigit(current)) {
268 record16(convertOctal('0', '0', current));
269 state = InString;
270 } else {
271 setDone(Bad);
272 }
273 } else if (current == 'x')
274 state = InHexEscape;
275 else if (current == 'u')
276 state = InUnicodeEscape;
277 else {
278 record16(singleEscape(current));
279 state = InString;
280 }
281 break;
282 case InHexEscape:
283 if (isHexDigit(current) && isHexDigit(next1)) {
284 state = InString;
285 record16(convertHex(current, next1));
286 shift(1);
287 } else if (current == stringType) {
288 record16('x');
289 shift(1);
290 setDone(String);
291 } else {
292 record16('x');
293 record16(current);
294 state = InString;
295 }
296 break;
297 case InUnicodeEscape:
298 if (isHexDigit(current) && isHexDigit(next1) &&
299 isHexDigit(next2) && isHexDigit(next3)) {
300 record16(convertUnicode(current, next1, next2, next3));
301 shift(3);
302 state = InString;
303 } else if (current == stringType) {
304 record16('u');
305 shift(1);
306 setDone(String);
307 } else {
308 setDone(Bad);
309 }
310 break;
311 case InSingleLineComment:
312 if (isLineTerminator()) {
313 nextLine();
314 terminator = true;
315 if (restrKeyword) {
316 token = ';';
317 setDone(Other);
318 } else
319 state = Start;
320 } else if (current == 0) {
321 setDone(Eof);
322 }
323 break;
324 case InMultiLineComment:
325 if (current == 0) {
326 setDone(Bad);
327 } else if (isLineTerminator()) {
328 nextLine();
329 } else if (current == '*' && next1 == '/') {
330 state = Start;
331 shift(1);
332 }
333 break;
334 case InIdentifier:
335 if (isIdentLetter(current) || isDecimalDigit(current)) {
336 record16(current);
337 break;
338 }
339 setDone(Identifier);
340 break;
341 case InNum0:
342 if (current == 'x' || current == 'X') {
343 record8(current);
344 state = InHex;
345 } else if (current == '.') {
346 record8(current);
347 state = InDecimal;
348 } else if (current == 'e' || current == 'E') {
349 record8(current);
350 state = InExponentIndicator;
351 } else if (isOctalDigit(current)) {
352 record8(current);
353 state = InOctal;
354 } else if (isDecimalDigit(current)) {
355 record8(current);
356 state = InDecimal;
357 } else {
358 setDone(Number);
359 }
360 break;
361 case InHex:
362 if (isHexDigit(current)) {
363 record8(current);
364 } else {
365 setDone(Hex);
366 }
367 break;
368 case InOctal:
369 if (isOctalDigit(current)) {
370 record8(current);
371 }
372 else if (isDecimalDigit(current)) {
373 record8(current);
374 state = InDecimal;
375 } else
376 setDone(Octal);
377 break;
378 case InNum:
379 if (isDecimalDigit(current)) {
380 record8(current);
381 } else if (current == '.') {
382 record8(current);
383 state = InDecimal;
384 } else if (current == 'e' || current == 'E') {
385 record8(current);
386 state = InExponentIndicator;
387 } else
388 setDone(Number);
389 break;
390 case InDecimal:
391 if (isDecimalDigit(current)) {
392 record8(current);
393 } else if (current == 'e' || current == 'E') {
394 record8(current);
395 state = InExponentIndicator;
396 } else
397 setDone(Number);
398 break;
399 case InExponentIndicator:
400 if (current == '+' || current == '-') {
401 record8(current);
402 } else if (isDecimalDigit(current)) {
403 record8(current);
404 state = InExponent;
405 } else
406 setDone(Bad);
407 break;
408 case InExponent:
409 if (isDecimalDigit(current)) {
410 record8(current);
411 } else
412 setDone(Number);
413 break;
414 default:
415 assert(!"Unhandled state in switch statement");
416 }
417
418 // move on to the next character
419 if (!done)
420 shift(1);
421#ifndef KJS_PURE_ECMA
422 if (state != Start && state != InSingleLineComment)
423 bol = false;
424#endif
425 }
426
427 // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
428 if ((state == Number || state == Octal || state == Hex)
429 && isIdentLetter(current))
430 state = Bad;
431
432 // terminate string
433 buffer8[pos8] = '\0';
434
435#ifdef KJS_DEBUG_LEX
436 fprintf(stderr, "line: %d ", lineNo());
437 fprintf(stderr, "yytext (%x): ", buffer8[0]);
438 fprintf(stderr, "%s ", buffer8);
439#endif
440
441 double dval = 0;
442 if (state == Number) {
443 dval = strtod(buffer8, 0L);
444 } else if (state == Hex) { // scan hex numbers
445 // TODO: support long unsigned int
446 unsigned int i;
447 sscanf(buffer8, "%x", &i);
448 dval = i;
449 state = Number;
450 } else if (state == Octal) { // scan octal number
451 unsigned int ui;
452 sscanf(buffer8, "%o", &ui);
453 dval = ui;
454 state = Number;
455 }
456
457#ifdef KJS_DEBUG_LEX
458 switch (state) {
459 case Eof:
460 printf("(EOF)\n");
461 break;
462 case Other:
463 printf("(Other)\n");
464 break;
465 case Identifier:
466 printf("(Identifier)/(Keyword)\n");
467 break;
468 case String:
469 printf("(String)\n");
470 break;
471 case Number:
472 printf("(Number)\n");
473 break;
474 default:
475 printf("(unknown)");
476 }
477#endif
478
479 if (state != Identifier && eatNextIdentifier)
480 eatNextIdentifier = false;
481
482 restrKeyword = false;
483 delimited = false;
484 yylloc.first_line = yylineno; // ???
485 yylloc.last_line = yylineno;
486
487 switch (state) {
488 case Eof:
489 token = 0;
490 break;
491 case Other:
492 if(token == '}' || token == ';') {
493 delimited = true;
494 }
495 break;
496 case Identifier:
497 if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) {
498 // Lookup for keyword failed, means this is an identifier
499 // Apply anonymous-function hack below (eat the identifier)
500 if (eatNextIdentifier) {
501 eatNextIdentifier = false;
502 UString debugstr(buffer16, pos16); fprintf(stderr,"Anonymous function hack: eating identifier %s\n",debugstr.ascii());
503 token = lex();
504 break;
505 }
506 /* TODO: close leak on parse error. same holds true for String */
507 kjsyylval.ident = new KJS::Identifier(buffer16, pos16);
508 token = IDENT;
509 break;
510 }
511
512 eatNextIdentifier = false;
513 // Hack for "f = function somename() { ... }", too hard to get into the grammar
514 if (token == FUNCTION && lastToken == '=' )
515 eatNextIdentifier = true;
516
517 if (token == CONTINUE || token == BREAK ||
518 token == RETURN || token == THROW)
519 restrKeyword = true;
520 break;
521 case String:
522 kjsyylval.ustr = new UString(buffer16, pos16);
523 token = STRING;
524 break;
525 case Number:
526 kjsyylval.dval = dval;
527 token = NUMBER;
528 break;
529 case Bad:
530 fprintf(stderr, "yylex: ERROR.\n");
531 return -1;
532 default:
533 assert(!"unhandled numeration value in switch");
534 return -1;
535 }
536 lastToken = token;
537 return token;
538}
539
540bool Lexer::isWhiteSpace() const
541{
542 return (current == ' ' || current == '\t' ||
543 current == 0x0b || current == 0x0c);
544}
545
546bool Lexer::isLineTerminator()
547{
548 bool cr = (current == '\r');
549 bool lf = (current == '\n');
550 if (cr)
551 skipLF = true;
552 else if (lf)
553 skipCR = true;
554 return cr || lf;
555}
556
557bool Lexer::isIdentLetter(unsigned short c)
558{
559 /* TODO: allow other legitimate unicode chars */
560 return (c >= 'a' && c <= 'z' ||
561 c >= 'A' && c <= 'Z' ||
562 c == '$' || c == '_');
563}
564
565bool Lexer::isDecimalDigit(unsigned short c)
566{
567 return (c >= '0' && c <= '9');
568}
569
570bool Lexer::isHexDigit(unsigned short c) const
571{
572 return (c >= '0' && c <= '9' ||
573 c >= 'a' && c <= 'f' ||
574 c >= 'A' && c <= 'F');
575}
576
577bool Lexer::isOctalDigit(unsigned short c) const
578{
579 return (c >= '0' && c <= '7');
580}
581
582int Lexer::matchPunctuator(unsigned short c1, unsigned short c2,
583 unsigned short c3, unsigned short c4)
584{
585 if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
586 shift(4);
587 return URSHIFTEQUAL;
588 } else if (c1 == '=' && c2 == '=' && c3 == '=') {
589 shift(3);
590 return STREQ;
591 } else if (c1 == '!' && c2 == '=' && c3 == '=') {
592 shift(3);
593 return STRNEQ;
594 } else if (c1 == '>' && c2 == '>' && c3 == '>') {
595 shift(3);
596 return URSHIFT;
597 } else if (c1 == '<' && c2 == '<' && c3 == '=') {
598 shift(3);
599 return LSHIFTEQUAL;
600 } else if (c1 == '>' && c2 == '>' && c3 == '=') {
601 shift(3);
602 return RSHIFTEQUAL;
603 } else if (c1 == '<' && c2 == '=') {
604 shift(2);
605 return LE;
606 } else if (c1 == '>' && c2 == '=') {
607 shift(2);
608 return GE;
609 } else if (c1 == '!' && c2 == '=') {
610 shift(2);
611 return NE;
612 } else if (c1 == '+' && c2 == '+') {
613 shift(2);
614 if (terminator)
615 return AUTOPLUSPLUS;
616 else
617 return PLUSPLUS;
618 } else if (c1 == '-' && c2 == '-') {
619 shift(2);
620 if (terminator)
621 return AUTOMINUSMINUS;
622 else
623 return MINUSMINUS;
624 } else if (c1 == '=' && c2 == '=') {
625 shift(2);
626 return EQEQ;
627 } else if (c1 == '+' && c2 == '=') {
628 shift(2);
629 return PLUSEQUAL;
630 } else if (c1 == '-' && c2 == '=') {
631 shift(2);
632 return MINUSEQUAL;
633 } else if (c1 == '*' && c2 == '=') {
634 shift(2);
635 return MULTEQUAL;
636 } else if (c1 == '/' && c2 == '=') {
637 shift(2);
638 return DIVEQUAL;
639 } else if (c1 == '&' && c2 == '=') {
640 shift(2);
641 return ANDEQUAL;
642 } else if (c1 == '^' && c2 == '=') {
643 shift(2);
644 return XOREQUAL;
645 } else if (c1 == '%' && c2 == '=') {
646 shift(2);
647 return MODEQUAL;
648 } else if (c1 == '|' && c2 == '=') {
649 shift(2);
650 return OREQUAL;
651 } else if (c1 == '<' && c2 == '<') {
652 shift(2);
653 return LSHIFT;
654 } else if (c1 == '>' && c2 == '>') {
655 shift(2);
656 return RSHIFT;
657 } else if (c1 == '&' && c2 == '&') {
658 shift(2);
659 return AND;
660 } else if (c1 == '|' && c2 == '|') {
661 shift(2);
662 return OR;
663 }
664
665 switch(c1) {
666 case '=':
667 case '>':
668 case '<':
669 case ',':
670 case '!':
671 case '~':
672 case '?':
673 case ':':
674 case '.':
675 case '+':
676 case '-':
677 case '*':
678 case '/':
679 case '&':
680 case '|':
681 case '^':
682 case '%':
683 case '(':
684 case ')':
685 case '{':
686 case '}':
687 case '[':
688 case ']':
689 case ';':
690 shift(1);
691 return static_cast<int>(c1);
692 default:
693 return -1;
694 }
695}
696
697unsigned short Lexer::singleEscape(unsigned short c) const
698{
699 switch(c) {
700 case 'b':
701 return 0x08;
702 case 't':
703 return 0x09;
704 case 'n':
705 return 0x0A;
706 case 'v':
707 return 0x0B;
708 case 'f':
709 return 0x0C;
710 case 'r':
711 return 0x0D;
712 case '"':
713 return 0x22;
714 case '\'':
715 return 0x27;
716 case '\\':
717 return 0x5C;
718 default:
719 return c;
720 }
721}
722
723unsigned short Lexer::convertOctal(unsigned short c1, unsigned short c2,
724 unsigned short c3) const
725{
726 return ((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
727}
728
729unsigned char Lexer::convertHex(unsigned short c)
730{
731 if (c >= '0' && c <= '9')
732 return (c - '0');
733 else if (c >= 'a' && c <= 'f')
734 return (c - 'a' + 10);
735 else
736 return (c - 'A' + 10);
737}
738
739unsigned char Lexer::convertHex(unsigned short c1, unsigned short c2)
740{
741 return ((convertHex(c1) << 4) + convertHex(c2));
742}
743
744UChar Lexer::convertUnicode(unsigned short c1, unsigned short c2,
745 unsigned short c3, unsigned short c4)
746{
747 return UChar((convertHex(c1) << 4) + convertHex(c2),
748 (convertHex(c3) << 4) + convertHex(c4));
749}
750
751void Lexer::record8(unsigned short c)
752{
753 assert(c <= 0xff);
754
755 // enlarge buffer if full
756 if (pos8 >= size8 - 1) {
757 char *tmp = new char[2 * size8];
758 memcpy(tmp, buffer8, size8 * sizeof(char));
759 delete [] buffer8;
760 buffer8 = tmp;
761 size8 *= 2;
762 }
763
764 buffer8[pos8++] = (char) c;
765}
766
767void Lexer::record16(UChar c)
768{
769 // enlarge buffer if full
770 if (pos16 >= size16 - 1) {
771 UChar *tmp = new UChar[2 * size16];
772 memcpy(tmp, buffer16, size16 * sizeof(UChar));
773 delete [] buffer16;
774 buffer16 = tmp;
775 size16 *= 2;
776 }
777
778 buffer16[pos16++] = c;
779}
780
781bool Lexer::scanRegExp()
782{
783 pos16 = 0;
784 bool lastWasEscape = false;
785 bool inBrackets = false;
786
787 while (1) {
788 if (isLineTerminator() || current == 0)
789 return false;
790 else if (current != '/' || lastWasEscape == true || inBrackets == true)
791 {
792 // keep track of '[' and ']'
793 if ( !lastWasEscape ) {
794 if ( current == '[' && !inBrackets )
795 inBrackets = true;
796 if ( current == ']' && inBrackets )
797 inBrackets = false;
798 }
799 record16(current);
800 lastWasEscape =
801 !lastWasEscape && (current == '\\');
802 }
803 else { // end of regexp
804 pattern = UString(buffer16, pos16);
805 pos16 = 0;
806 shift(1);
807 break;
808 }
809 shift(1);
810 }
811
812 while (isIdentLetter(current)) {
813 record16(current);
814 shift(1);
815 }
816 flags = UString(buffer16, pos16);
817
818 return true;
819}
Note: See TracBrowser for help on using the repository browser.