Context Navigation

source: webkit/trunk/JavaScriptCore/kjs/lexer.cpp@ 2772

Visit:

Last change on this file since 2772 was 2772, checked in by darin, 23 years ago

a few more globals for often-used property names
conversion to Identifier from UString must now be explicit

kjs/error_object.cpp:
kjs/function.cpp:
kjs/function_object.cpp:
kjs/identifier.cpp:
kjs/identifier.h:
kjs/lexer.cpp:
kjs/nodes.cpp:
kjs/number_object.cpp:
kjs/object.cpp:
kjs/object.h:
kjs/string_object.cpp:
kjs/testkjs.cpp:
kjs/ustring.cpp:
kjs/ustring.h:

Property svn:eol-style set to native
Property svn:keywords set to Author Date Id Revision

File size: 19.0 KB

Line
1	// -*- c-basic-offset: 2 -*-
2	/*
3	* This file is part of the KDE libraries
4	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Library General Public
8	* License as published by the Free Software Foundation; either
9	* version 2 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Library General Public License for more details.
15	*
16	* You should have received a copy of the GNU Library General Public License
17	* along with this library; see the file COPYING.LIB. If not, write to
18	* the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19	* Boston, MA 02111-1307, USA.
20	*
21	*/
22
23	#ifdef HAVE_CONFIG_H
24	#include <config.h>
25	#endif
26
27	#include <ctype.h>
28	#include <stdlib.h>
29	#include <stdio.h>
30	#include <string.h>
31	#include <assert.h>
32
33	#include "value.h"
34	#include "object.h"
35	#include "types.h"
36	#include "interpreter.h"
37	#include "nodes.h"
38	#include "lexer.h"
39	#include "identifier.h"
40	#include "lookup.h"
41	#include "internal.h"
42
43	// we can't specify the namespace in yacc's C output, so do it here
44	using namespace KJS;
45
46	static Lexer *currLexer = 0;
47
48	#ifndef KDE_USE_FINAL
49	#include "grammar.h"
50	#endif
51
52	#include "lexer.lut.h"
53
54	extern YYLTYPE yylloc; // global bison variable holding token info
55
56	// a bridge for yacc from the C world to C++
57	int kjsyylex()
58	{
59	return Lexer::curr()->lex();
60	}
61
62	Lexer::Lexer()
63	: yylineno(1),
64	size8(128), size16(128), restrKeyword(false),
65	eatNextIdentifier(false), stackToken(-1), lastToken(-1), pos(0),
66	code(0), length(0),
67	#ifndef KJS_PURE_ECMA
68	bol(true),
69	#endif
70	current(0), next1(0), next2(0), next3(0)
71	{
72	// allocate space for read buffers
73	buffer8 = new char[size8];
74	buffer16 = new UChar[size16];
75	currLexer = this;
76
77	}
78
79	Lexer::~Lexer()
80	{
81	delete [] buffer8;
82	delete [] buffer16;
83	}
84
85	Lexer *Lexer::curr()
86	{
87	if (!currLexer) {
88	// create singleton instance
89	currLexer = new Lexer();
90	}
91	return currLexer;
92	}
93
#ifdef KJS_DEBUG_MEM
// Destroys the current lexer instance (if any) and forgets about it.
void Lexer::globalClear()
{
  delete currLexer;
  currLexer = 0;
}
#endif
101
102	void Lexer::setCode(const UChar *c, unsigned int len)
103	{
104	yylineno = 1;
105	restrKeyword = false;
106	delimited = false;
107	eatNextIdentifier = false;
108	stackToken = -1;
109	lastToken = -1;
110	pos = 0;
111	code = c;
112	length = len;
113	skipLF = false;
114	skipCR = false;
115	#ifndef KJS_PURE_ECMA
116	bol = true;
117	#endif
118
119	// read first characters
120	current = (length > 0) ? code[0].unicode() : 0;
121	next1 = (length > 1) ? code[1].unicode() : 0;
122	next2 = (length > 2) ? code[2].unicode() : 0;
123	next3 = (length > 3) ? code[3].unicode() : 0;
124	}
125
126	void Lexer::shift(unsigned int p)
127	{
128	while (p--) {
129	pos++;
130	current = next1;
131	next1 = next2;
132	next2 = next3;
133	next3 = (pos + 3 < length) ? code[pos+3].unicode() : 0;
134	}
135	}
136
137	// called on each new line
138	void Lexer::nextLine()
139	{
140	yylineno++;
141	#ifndef KJS_PURE_ECMA
142	bol = true;
143	#endif
144	}
145
146	void Lexer::setDone(State s)
147	{
148	state = s;
149	done = true;
150	}
151
152	int Lexer::lex()
153	{
154	int token = 0;
155	state = Start;
156	unsigned short stringType = 0; // either single or double quotes
157	pos8 = pos16 = 0;
158	done = false;
159	terminator = false;
160	skipLF = false;
161	skipCR = false;
162
163	// did we push a token on the stack previously ?
164	// (after an automatic semicolon insertion)
165	if (stackToken >= 0) {
166	setDone(Other);
167	token = stackToken;
168	stackToken = 0;
169	}
170
171	while (!done) {
172	if (skipLF && current != '\n') // found \r but not \n afterwards
173	skipLF = false;
174	if (skipCR && current != '\r') // found \n but not \r afterwards
175	skipCR = false;
176	if (skipLF \|\| skipCR) // found \r\n or \n\r -> eat the second one
177	{
178	skipLF = false;
179	skipCR = false;
180	shift(1);
181	}
182	switch (state) {
183	case Start:
184	if (isWhiteSpace()) {
185	// do nothing
186	} else if (current == '/' && next1 == '/') {
187	shift(1);
188	state = InSingleLineComment;
189	} else if (current == '/' && next1 == '*') {
190	shift(1);
191	state = InMultiLineComment;
192	} else if (current == 0) {
193	if (!terminator && !delimited) {
194	// automatic semicolon insertion if program incomplete
195	token = ';';
196	stackToken = 0;
197	setDone(Other);
198	} else
199	setDone(Eof);
200	} else if (isLineTerminator()) {
201	nextLine();
202	terminator = true;
203	if (restrKeyword) {
204	token = ';';
205	setDone(Other);
206	}
207	} else if (current == '"' \|\| current == '\'') {
208	state = InString;
209	stringType = current;
210	} else if (isIdentLetter(current)) {
211	record16(current);
212	state = InIdentifier;
213	} else if (current == '0') {
214	record8(current);
215	state = InNum0;
216	} else if (isDecimalDigit(current)) {
217	record8(current);
218	state = InNum;
219	} else if (current == '.' && isDecimalDigit(next1)) {
220	record8(current);
221	state = InDecimal;
222	#ifndef KJS_PURE_ECMA
223	// <!-- marks the beginning of a line comment (for www usage)
224	} else if (current == '<' && next1 == '!' &&
225	next2 == '-' && next3 == '-') {
226	shift(3);
227	state = InSingleLineComment;
228	// same for -->
229	} else if (bol && current == '-' && next1 == '-' && next2 == '>') {
230	shift(2);
231	state = InSingleLineComment;
232	#endif
233	} else {
234	token = matchPunctuator(current, next1, next2, next3);
235	if (token != -1) {
236	setDone(Other);
237	} else {
238	// cerr << "encountered unknown character" << endl;
239	setDone(Bad);
240	}
241	}
242	break;
243	case InString:
244	if (current == stringType) {
245	shift(1);
246	setDone(String);
247	} else if (current == 0 \|\| isLineTerminator()) {
248	setDone(Bad);
249	} else if (current == '\\') {
250	state = InEscapeSequence;
251	} else {
252	record16(current);
253	}
254	break;
255	// Escape Sequences inside of strings
256	case InEscapeSequence:
257	if (isOctalDigit(current)) {
258	if (current >= '0' && current <= '3' &&
259	isOctalDigit(next1) && isOctalDigit(next2)) {
260	record16(convertOctal(current, next1, next2));
261	shift(2);
262	state = InString;
263	} else if (isOctalDigit(current) && isOctalDigit(next1)) {
264	record16(convertOctal('0', current, next1));
265	shift(1);
266	state = InString;
267	} else if (isOctalDigit(current)) {
268	record16(convertOctal('0', '0', current));
269	state = InString;
270	} else {
271	setDone(Bad);
272	}
273	} else if (current == 'x')
274	state = InHexEscape;
275	else if (current == 'u')
276	state = InUnicodeEscape;
277	else {
278	record16(singleEscape(current));
279	state = InString;
280	}
281	break;
282	case InHexEscape:
283	if (isHexDigit(current) && isHexDigit(next1)) {
284	state = InString;
285	record16(convertHex(current, next1));
286	shift(1);
287	} else if (current == stringType) {
288	record16('x');
289	shift(1);
290	setDone(String);
291	} else {
292	record16('x');
293	record16(current);
294	state = InString;
295	}
296	break;
297	case InUnicodeEscape:
298	if (isHexDigit(current) && isHexDigit(next1) &&
299	isHexDigit(next2) && isHexDigit(next3)) {
300	record16(convertUnicode(current, next1, next2, next3));
301	shift(3);
302	state = InString;
303	} else if (current == stringType) {
304	record16('u');
305	shift(1);
306	setDone(String);
307	} else {
308	setDone(Bad);
309	}
310	break;
311	case InSingleLineComment:
312	if (isLineTerminator()) {
313	nextLine();
314	terminator = true;
315	if (restrKeyword) {
316	token = ';';
317	setDone(Other);
318	} else
319	state = Start;
320	} else if (current == 0) {
321	setDone(Eof);
322	}
323	break;
324	case InMultiLineComment:
325	if (current == 0) {
326	setDone(Bad);
327	} else if (isLineTerminator()) {
328	nextLine();
329	} else if (current == '*' && next1 == '/') {
330	state = Start;
331	shift(1);
332	}
333	break;
334	case InIdentifier:
335	if (isIdentLetter(current) \|\| isDecimalDigit(current)) {
336	record16(current);
337	break;
338	}
339	setDone(Identifier);
340	break;
341	case InNum0:
342	if (current == 'x' \|\| current == 'X') {
343	record8(current);
344	state = InHex;
345	} else if (current == '.') {
346	record8(current);
347	state = InDecimal;
348	} else if (current == 'e' \|\| current == 'E') {
349	record8(current);
350	state = InExponentIndicator;
351	} else if (isOctalDigit(current)) {
352	record8(current);
353	state = InOctal;
354	} else if (isDecimalDigit(current)) {
355	record8(current);
356	state = InDecimal;
357	} else {
358	setDone(Number);
359	}
360	break;
361	case InHex:
362	if (isHexDigit(current)) {
363	record8(current);
364	} else {
365	setDone(Hex);
366	}
367	break;
368	case InOctal:
369	if (isOctalDigit(current)) {
370	record8(current);
371	}
372	else if (isDecimalDigit(current)) {
373	record8(current);
374	state = InDecimal;
375	} else
376	setDone(Octal);
377	break;
378	case InNum:
379	if (isDecimalDigit(current)) {
380	record8(current);
381	} else if (current == '.') {
382	record8(current);
383	state = InDecimal;
384	} else if (current == 'e' \|\| current == 'E') {
385	record8(current);
386	state = InExponentIndicator;
387	} else
388	setDone(Number);
389	break;
390	case InDecimal:
391	if (isDecimalDigit(current)) {
392	record8(current);
393	} else if (current == 'e' \|\| current == 'E') {
394	record8(current);
395	state = InExponentIndicator;
396	} else
397	setDone(Number);
398	break;
399	case InExponentIndicator:
400	if (current == '+' \|\| current == '-') {
401	record8(current);
402	} else if (isDecimalDigit(current)) {
403	record8(current);
404	state = InExponent;
405	} else
406	setDone(Bad);
407	break;
408	case InExponent:
409	if (isDecimalDigit(current)) {
410	record8(current);
411	} else
412	setDone(Number);
413	break;
414	default:
415	assert(!"Unhandled state in switch statement");
416	}
417
418	// move on to the next character
419	if (!done)
420	shift(1);
421	#ifndef KJS_PURE_ECMA
422	if (state != Start && state != InSingleLineComment)
423	bol = false;
424	#endif
425	}
426
427	// no identifiers allowed directly after numeric literal, e.g. "3in" is bad
428	if ((state == Number \|\| state == Octal \|\| state == Hex)
429	&& isIdentLetter(current))
430	state = Bad;
431
432	// terminate string
433	buffer8[pos8] = '\0';
434
435	#ifdef KJS_DEBUG_LEX
436	fprintf(stderr, "line: %d ", lineNo());
437	fprintf(stderr, "yytext (%x): ", buffer8[0]);
438	fprintf(stderr, "%s ", buffer8);
439	#endif
440
441	double dval = 0;
442	if (state == Number) {
443	dval = strtod(buffer8, 0L);
444	} else if (state == Hex) { // scan hex numbers
445	// TODO: support long unsigned int
446	unsigned int i;
447	sscanf(buffer8, "%x", &i);
448	dval = i;
449	state = Number;
450	} else if (state == Octal) { // scan octal number
451	unsigned int ui;
452	sscanf(buffer8, "%o", &ui);
453	dval = ui;
454	state = Number;
455	}
456
457	#ifdef KJS_DEBUG_LEX
458	switch (state) {
459	case Eof:
460	printf("(EOF)\n");
461	break;
462	case Other:
463	printf("(Other)\n");
464	break;
465	case Identifier:
466	printf("(Identifier)/(Keyword)\n");
467	break;
468	case String:
469	printf("(String)\n");
470	break;
471	case Number:
472	printf("(Number)\n");
473	break;
474	default:
475	printf("(unknown)");
476	}
477	#endif
478
479	if (state != Identifier && eatNextIdentifier)
480	eatNextIdentifier = false;
481
482	restrKeyword = false;
483	delimited = false;
484	yylloc.first_line = yylineno; // ???
485	yylloc.last_line = yylineno;
486
487	switch (state) {
488	case Eof:
489	token = 0;
490	break;
491	case Other:
492	if(token == '}' \|\| token == ';') {
493	delimited = true;
494	}
495	break;
496	case Identifier:
497	if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) {
498	// Lookup for keyword failed, means this is an identifier
499	// Apply anonymous-function hack below (eat the identifier)
500	if (eatNextIdentifier) {
501	eatNextIdentifier = false;
502	UString debugstr(buffer16, pos16); fprintf(stderr,"Anonymous function hack: eating identifier %s\n",debugstr.ascii());
503	token = lex();
504	break;
505	}
506	/* TODO: close leak on parse error. same holds true for String */
507	kjsyylval.ident = new KJS::Identifier(buffer16, pos16);
508	token = IDENT;
509	break;
510	}
511
512	eatNextIdentifier = false;
513	// Hack for "f = function somename() { ... }", too hard to get into the grammar
514	if (token == FUNCTION && lastToken == '=' )
515	eatNextIdentifier = true;
516
517	if (token == CONTINUE \|\| token == BREAK \|\|
518	token == RETURN \|\| token == THROW)
519	restrKeyword = true;
520	break;
521	case String:
522	kjsyylval.ustr = new UString(buffer16, pos16);
523	token = STRING;
524	break;
525	case Number:
526	kjsyylval.dval = dval;
527	token = NUMBER;
528	break;
529	case Bad:
530	fprintf(stderr, "yylex: ERROR.\n");
531	return -1;
532	default:
533	assert(!"unhandled numeration value in switch");
534	return -1;
535	}
536	lastToken = token;
537	return token;
538	}
539
540	bool Lexer::isWhiteSpace() const
541	{
542	return (current == ' ' \|\| current == '\t' \|\|
543	current == 0x0b \|\| current == 0x0c);
544	}
545
546	bool Lexer::isLineTerminator()
547	{
548	bool cr = (current == '\r');
549	bool lf = (current == '\n');
550	if (cr)
551	skipLF = true;
552	else if (lf)
553	skipCR = true;
554	return cr \|\| lf;
555	}
556
557	bool Lexer::isIdentLetter(unsigned short c)
558	{
559	/* TODO: allow other legitimate unicode chars */
560	return (c >= 'a' && c <= 'z' \|\|
561	c >= 'A' && c <= 'Z' \|\|
562	c == '$' \|\| c == '_');
563	}
564
565	bool Lexer::isDecimalDigit(unsigned short c)
566	{
567	return (c >= '0' && c <= '9');
568	}
569
570	bool Lexer::isHexDigit(unsigned short c) const
571	{
572	return (c >= '0' && c <= '9' \|\|
573	c >= 'a' && c <= 'f' \|\|
574	c >= 'A' && c <= 'F');
575	}
576
577	bool Lexer::isOctalDigit(unsigned short c) const
578	{
579	return (c >= '0' && c <= '7');
580	}
581
582	int Lexer::matchPunctuator(unsigned short c1, unsigned short c2,
583	unsigned short c3, unsigned short c4)
584	{
585	if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
586	shift(4);
587	return URSHIFTEQUAL;
588	} else if (c1 == '=' && c2 == '=' && c3 == '=') {
589	shift(3);
590	return STREQ;
591	} else if (c1 == '!' && c2 == '=' && c3 == '=') {
592	shift(3);
593	return STRNEQ;
594	} else if (c1 == '>' && c2 == '>' && c3 == '>') {
595	shift(3);
596	return URSHIFT;
597	} else if (c1 == '<' && c2 == '<' && c3 == '=') {
598	shift(3);
599	return LSHIFTEQUAL;
600	} else if (c1 == '>' && c2 == '>' && c3 == '=') {
601	shift(3);
602	return RSHIFTEQUAL;
603	} else if (c1 == '<' && c2 == '=') {
604	shift(2);
605	return LE;
606	} else if (c1 == '>' && c2 == '=') {
607	shift(2);
608	return GE;
609	} else if (c1 == '!' && c2 == '=') {
610	shift(2);
611	return NE;
612	} else if (c1 == '+' && c2 == '+') {
613	shift(2);
614	if (terminator)
615	return AUTOPLUSPLUS;
616	else
617	return PLUSPLUS;
618	} else if (c1 == '-' && c2 == '-') {
619	shift(2);
620	if (terminator)
621	return AUTOMINUSMINUS;
622	else
623	return MINUSMINUS;
624	} else if (c1 == '=' && c2 == '=') {
625	shift(2);
626	return EQEQ;
627	} else if (c1 == '+' && c2 == '=') {
628	shift(2);
629	return PLUSEQUAL;
630	} else if (c1 == '-' && c2 == '=') {
631	shift(2);
632	return MINUSEQUAL;
633	} else if (c1 == '*' && c2 == '=') {
634	shift(2);
635	return MULTEQUAL;
636	} else if (c1 == '/' && c2 == '=') {
637	shift(2);
638	return DIVEQUAL;
639	} else if (c1 == '&' && c2 == '=') {
640	shift(2);
641	return ANDEQUAL;
642	} else if (c1 == '^' && c2 == '=') {
643	shift(2);
644	return XOREQUAL;
645	} else if (c1 == '%' && c2 == '=') {
646	shift(2);
647	return MODEQUAL;
648	} else if (c1 == '\|' && c2 == '=') {
649	shift(2);
650	return OREQUAL;
651	} else if (c1 == '<' && c2 == '<') {
652	shift(2);
653	return LSHIFT;
654	} else if (c1 == '>' && c2 == '>') {
655	shift(2);
656	return RSHIFT;
657	} else if (c1 == '&' && c2 == '&') {
658	shift(2);
659	return AND;
660	} else if (c1 == '\|' && c2 == '\|') {
661	shift(2);
662	return OR;
663	}
664
665	switch(c1) {
666	case '=':
667	case '>':
668	case '<':
669	case ',':
670	case '!':
671	case '~':
672	case '?':
673	case ':':
674	case '.':
675	case '+':
676	case '-':
677	case '*':
678	case '/':
679	case '&':
680	case '\|':
681	case '^':
682	case '%':
683	case '(':
684	case ')':
685	case '{':
686	case '}':
687	case '[':
688	case ']':
689	case ';':
690	shift(1);
691	return static_cast<int>(c1);
692	default:
693	return -1;
694	}
695	}
696
697	unsigned short Lexer::singleEscape(unsigned short c) const
698	{
699	switch(c) {
700	case 'b':
701	return 0x08;
702	case 't':
703	return 0x09;
704	case 'n':
705	return 0x0A;
706	case 'v':
707	return 0x0B;
708	case 'f':
709	return 0x0C;
710	case 'r':
711	return 0x0D;
712	case '"':
713	return 0x22;
714	case '\'':
715	return 0x27;
716	case '\\':
717	return 0x5C;
718	default:
719	return c;
720	}
721	}
722
723	unsigned short Lexer::convertOctal(unsigned short c1, unsigned short c2,
724	unsigned short c3) const
725	{
726	return ((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
727	}
728
729	unsigned char Lexer::convertHex(unsigned short c)
730	{
731	if (c >= '0' && c <= '9')
732	return (c - '0');
733	else if (c >= 'a' && c <= 'f')
734	return (c - 'a' + 10);
735	else
736	return (c - 'A' + 10);
737	}
738
739	unsigned char Lexer::convertHex(unsigned short c1, unsigned short c2)
740	{
741	return ((convertHex(c1) << 4) + convertHex(c2));
742	}
743
744	UChar Lexer::convertUnicode(unsigned short c1, unsigned short c2,
745	unsigned short c3, unsigned short c4)
746	{
747	return UChar((convertHex(c1) << 4) + convertHex(c2),
748	(convertHex(c3) << 4) + convertHex(c4));
749	}
750
751	void Lexer::record8(unsigned short c)
752	{
753	assert(c <= 0xff);
754
755	// enlarge buffer if full
756	if (pos8 >= size8 - 1) {
757	char tmp = new char[2 size8];
758	memcpy(tmp, buffer8, size8 * sizeof(char));
759	delete [] buffer8;
760	buffer8 = tmp;
761	size8 *= 2;
762	}
763
764	buffer8[pos8++] = (char) c;
765	}
766
767	void Lexer::record16(UChar c)
768	{
769	// enlarge buffer if full
770	if (pos16 >= size16 - 1) {
771	UChar tmp = new UChar[2 size16];
772	memcpy(tmp, buffer16, size16 * sizeof(UChar));
773	delete [] buffer16;
774	buffer16 = tmp;
775	size16 *= 2;
776	}
777
778	buffer16[pos16++] = c;
779	}
780
781	bool Lexer::scanRegExp()
782	{
783	pos16 = 0;
784	bool lastWasEscape = false;
785	bool inBrackets = false;
786
787	while (1) {
788	if (isLineTerminator() \|\| current == 0)
789	return false;
790	else if (current != '/' \|\| lastWasEscape == true \|\| inBrackets == true)
791	{
792	// keep track of '[' and ']'
793	if ( !lastWasEscape ) {
794	if ( current == '[' && !inBrackets )
795	inBrackets = true;
796	if ( current == ']' && inBrackets )
797	inBrackets = false;
798	}
799	record16(current);
800	lastWasEscape =
801	!lastWasEscape && (current == '\\');
802	}
803	else { // end of regexp
804	pattern = UString(buffer16, pos16);
805	pos16 = 0;
806	shift(1);
807	break;
808	}
809	shift(1);
810	}
811
812	while (isIdentLetter(current)) {
813	record16(current);
814	shift(1);
815	}
816	flags = UString(buffer16, pos16);
817
818	return true;
819	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: