Skip to content

Commit 6b2d267

Browse files
committed
Implement tokenizer
1 parent df204ff commit 6b2d267

File tree

6 files changed

+363
-10
lines changed

6 files changed

+363
-10
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0)
22

33
# Define the project and executable
44
project(EasyCodeIt C)
5-
add_executable(eci eci.c utils.c)
5+
add_executable(eci utils.c parse.c eci.c)
66

77
# Enable warnings
88
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
9-
add_compile_options(-Wall -Wno-maybe-uninitialized -Wno-parentheses -Wpedantic)
9+
add_compile_options(-Wall -Wpedantic -Wextra -Wshadow -Wno-maybe-uninitialized -Wno-parentheses)
1010
endif()

eci.c

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,7 @@
2121
#include <stdlib.h>
2222
#include <stdnoreturn.h>
2323
#include "utils.h"
24-
25-
noreturn void die(char *msg) {
26-
fputs(msg, stderr);
27-
if (*msg != '\0') fputs("\n", stderr);
28-
exit(EXIT_FAILURE);
29-
}
24+
#include "parse.h"
3025

3126
int main(int argc, char *argv[]) {
3227
if (argc < 2) die("No arguments!");
@@ -39,8 +34,8 @@ int main(int argc, char *argv[]) {
3934
char *code = readfile(source_file);
4035
if (!code) die("Failed to read from source file!");
4136

42-
// Output the code
43-
fputs(code, stdout);
37+
// Parse the code
38+
parse(code);
4439

4540
// Free the resources
4641
free(code);

parse.c

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
/*
2+
* This file is part of EasyCodeIt.
3+
*
4+
* Copyright (C) 2020 TheDcoder <[email protected]>
5+
*
6+
* EasyCodeIt is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU General Public License
17+
* along with this program. If not, see <https://www.gnu.org/licenses/>.
18+
*/
19+
20+
#include <ctype.h>
21+
#include <stdbool.h>
22+
#include <stdio.h>
23+
#include <strings.h>
24+
#include "parse.h"
25+
#include "utils.h"
26+
27+
/* Single characters that introduce a token or form one on their own. */
const char CHR_COMMENT = ';';   /* starts a comment that runs to the end of the line */
const char CHR_DIRECTIVE = '#'; /* starts a directive that runs to the end of the line */
const char CHR_MACRO = '@';     /* prefix of a macro name */
const char CHR_VARIABLE = '$';  /* prefix of a variable name */
const char CHR_DOT = '.';
const char CHR_COMMA = ',';

/*
 * Character sets, tested for membership together with their `sizeof`.
 * They are plain arrays rather than strings, so none of them is NUL-terminated.
 */

/* '\r' is included so that sources with CRLF line endings tokenize cleanly. */
char CHRSET_WHITESPACE[] = {' ', '\t', '\n', '\r'};
char CHRSET_QUOTE[] = {'\'', '"'};
char CHRSET_OPERATOR[] = {
	'+', '-', '*', '/', '^',
	'&',
	'=', '<', '>',
	'?', ':',
};
/* Operators that may be directly followed by `=` to form a two-character operator. */
char CHRSET_OPERATOR_EQUABLE[] = {'+', '-', '*', '/', '^', '&', '='};
char CHRSET_BRACKET[] = {'[', ']', '(', ')'};

/* Directive names (without the leading `#`) that open and close a multi-line comment. */
char STRING_CS[] = "cs";
char STRING_CE[] = "ce";
char STRING_COMMENT_START[] = "comments-start";
char STRING_COMMENT_END[] = "comments-end";
49+
50+
struct TokenCharMapElem {
51+
enum TokenType type;
52+
union {
53+
const char chr;
54+
const char *chr_arr;
55+
};
56+
};
57+
58+
static void print_token(struct Token *token) {
59+
puts("---### TOKEN ###---");
60+
char *token_type;
61+
switch (token->type) {
62+
case TOK_UNKNOWN:
63+
token_type = "Unknown";
64+
break;
65+
case TOK_WHITESPACE:
66+
token_type = "Whitespace";
67+
break;
68+
case TOK_COMMENT:
69+
token_type = "Comment";
70+
break;
71+
case TOK_DIRECTIVE:
72+
token_type = "Directive";
73+
break;
74+
case TOK_NUMBER:
75+
token_type = "Number";
76+
break;
77+
case TOK_STRING:
78+
token_type = "String";
79+
break;
80+
case TOK_WORD:
81+
token_type = "Word";
82+
break;
83+
case TOK_MACRO:
84+
token_type = "Macro";
85+
break;
86+
case TOK_VARIABLE:
87+
token_type = "Variable";
88+
break;
89+
case TOK_OPERATOR:
90+
token_type = "Operator";
91+
break;
92+
case TOK_BRACKET:
93+
token_type = "Bracket";
94+
break;
95+
case TOK_DOT:
96+
token_type = "Dot";
97+
break;
98+
case TOK_COMMA:
99+
token_type = "Comma";
100+
break;
101+
default:
102+
token_type = "Unnamed";
103+
break;
104+
}
105+
fputs("Type: ", stdout);
106+
puts(token_type);
107+
fputs("Data: ", stdout);
108+
for (size_t c = 0; c < token->data_len; c++) putchar(token->data[c]);
109+
putchar('\n');
110+
}
111+
112+
void parse(char *code) {
113+
while (true) {
114+
struct Token token = token_get(code, &code);
115+
if (!code) break;
116+
if (token.type != TOK_WHITESPACE) print_token(&token);
117+
if (token.type == TOK_UNKNOWN) die("!!! Unknown token encountered !!!");
118+
}
119+
return;
120+
}
121+
122+
struct Token token_get(char *code, char **next) {
123+
struct Token token = {
124+
.type = TOK_UNKNOWN,
125+
.data = NULL,
126+
.data_len = 0,
127+
};
128+
size_t length;
129+
char *next_code = NULL;
130+
131+
// Identify the token
132+
if (length = scan_string(code, char_is_whitespace)) {
133+
// Whitespace
134+
token.type = TOK_WHITESPACE;
135+
token.data = code;
136+
token.data_len = length;
137+
} else if (*code == CHR_COMMENT || *code == CHR_DIRECTIVE) {
138+
// Comment or Directive
139+
token.type = *code == CHR_COMMENT ? TOK_COMMENT : TOK_DIRECTIVE;
140+
token.data = ++code;
141+
token.data_len = scan_string(code, char_is_not_eol);
142+
143+
// Check if this is a multi-line comment
144+
bool multiline_comment = false;
145+
if (token.type == TOK_DIRECTIVE) {
146+
bool match_long, match_short;
147+
match_short = strncasecmp(STRING_CS, code, (sizeof STRING_CS) - 1) == 0;
148+
if (!match_short) match_long = strncasecmp(STRING_COMMENT_START, code, (sizeof STRING_COMMENT_START) - 1) == 0;
149+
// Make sure we have a whitespace after the directive
150+
char *comment_start;
151+
if (match_long || match_short) {
152+
comment_start = code + (match_long ? sizeof STRING_COMMENT_START : sizeof STRING_CS);
153+
multiline_comment = char_is_whitespace(comment_start[-1]);
154+
}
155+
if (multiline_comment) {
156+
token.type = TOK_COMMENT;
157+
token.data = code = comment_start;
158+
}
159+
}
160+
161+
if (multiline_comment) {
162+
// Scan for the ending directive token
163+
char *comment_end;
164+
while (true) {
165+
code += scan_string(code, char_is_not_eol) + 1;
166+
if (*code == '\0') break;
167+
if (*code != CHR_DIRECTIVE) continue;
168+
169+
bool match_long, match_short, match = false;
170+
++code;
171+
match_short = strncasecmp(STRING_CE, code, (sizeof STRING_CE) - 1) == 0;
172+
if (!match_short) match_long = strncasecmp(STRING_COMMENT_END, code, (sizeof STRING_COMMENT_END) - 1) == 0;
173+
// Make sure we have a whitespace after the directive
174+
if (match_long || match_short) {
175+
comment_end = code + ((match_long ? sizeof STRING_COMMENT_END : sizeof STRING_CE) - 1);
176+
match = char_is_whitespace(*comment_end);
177+
}
178+
if (match) break;
179+
}
180+
token.data_len = (code - token.data) - 1;
181+
next_code = comment_end;
182+
} else {
183+
token.data_len = scan_string(code, char_is_not_eol);
184+
}
185+
} else if (length = scan_string(code, char_is_num)){
186+
// Number
187+
token.type = TOK_NUMBER;
188+
token.data = code;
189+
token.data_len = length;
190+
} else if (chrcmp(*code, CHRSET_QUOTE, sizeof CHRSET_QUOTE)) {
191+
// String
192+
token.type = TOK_STRING;
193+
const char quote = *code;
194+
token.data = code + 1;
195+
for (token.data_len = 0; token.data[token.data_len] != quote; ++token.data_len);
196+
next_code = token.data + token.data_len + 1;
197+
} else if (length = scan_string(code, char_is_alphanum)){
198+
// Word
199+
token.type = TOK_WORD;
200+
token.data = code;
201+
token.data_len = length;
202+
} else if (*code == CHR_MACRO || *code == CHR_VARIABLE){
203+
// Macro or Variable
204+
token.type = *code == CHR_MACRO ? TOK_MACRO : TOK_VARIABLE;
205+
token.data = ++code;
206+
token.data_len = scan_string(code, char_is_alphanum);
207+
} else if (char_is_opsym(*code)) {
208+
// Operator
209+
token.type = TOK_OPERATOR;
210+
token.data = code;
211+
212+
// Include the trailing `=` if possible
213+
token.data_len = code[1] == '=' && chrcmp(*code, CHRSET_OPERATOR_EQUABLE, sizeof CHRSET_OPERATOR_EQUABLE) ? 2 : 1;
214+
} else if (char_is_bracket(*code)) {
215+
// Bracket (Parenthesis)
216+
token.type = TOK_BRACKET;
217+
token.data = code;
218+
token.data_len = 1;
219+
} else if (*code == CHR_DOT) {
220+
// Dot (Full Stop)
221+
token.type = TOK_DOT;
222+
token.data = code;
223+
token.data_len = 1;
224+
} else if (*code == CHR_COMMA) {
225+
// Comma
226+
token.type = TOK_COMMA;
227+
token.data = code;
228+
token.data_len = 1;
229+
} else {
230+
// Unknown
231+
token.data = code;
232+
token.data_len = 1;
233+
}
234+
235+
// Set the next code
236+
if (next_code) {
237+
*next = *next_code == '\0' ? NULL : next_code;
238+
} else {
239+
*next = *code == '\0' ? NULL : code + token.data_len;
240+
}
241+
242+
// Return the token
243+
return token;
244+
}
245+
246+
/*
 * Counts the leading characters of `str` that `cmpfunc` accepts.
 *
 * Scanning stops at the first rejected character. Nothing else bounds the
 * scan, so `cmpfunc` must reject the terminating NUL.
 */
size_t scan_string(char *str, bool (cmpfunc)(char)) {
	char *cursor = str;
	while (cmpfunc(*cursor)) ++cursor;
	return (size_t) (cursor - str);
}
254+
255+
bool char_is_whitespace(char chr) {
256+
return chrcmp(chr, CHRSET_WHITESPACE, sizeof CHRSET_WHITESPACE);
257+
}
258+
259+
/*
 * Tells whether `chr` is a letter according to isalpha().
 *
 * The cast matters: <ctype.h> functions require a value representable as
 * unsigned char (or EOF), and plain `char` may be negative for bytes
 * above 0x7F.
 */
bool char_is_alpha(char chr) {
	return isalpha((unsigned char) chr) != 0;
}
262+
263+
/*
 * Tells whether `chr` is a decimal digit according to isdigit().
 *
 * The cast matters: <ctype.h> functions require a value representable as
 * unsigned char (or EOF), and plain `char` may be negative for bytes
 * above 0x7F.
 */
bool char_is_num(char chr) {
	return isdigit((unsigned char) chr) != 0;
}
266+
267+
/*
 * Tells whether `chr` may appear in a word: a letter, a decimal digit or
 * an underscore.
 *
 * isalnum() accepts exactly the characters for which isalpha() or
 * isdigit() is true. The cast to unsigned char keeps the call defined for
 * bytes above 0x7F on platforms where plain `char` is signed.
 */
bool char_is_alphanum(char chr) {
	return chr == '_' || isalnum((unsigned char) chr) != 0;
}
270+
271+
bool char_is_opsym(char chr) {
272+
return chrcmp(chr, CHRSET_OPERATOR, sizeof CHRSET_OPERATOR);
273+
}
274+
275+
bool char_is_bracket(char chr) {
276+
return chrcmp(chr, CHRSET_BRACKET, sizeof CHRSET_BRACKET);
277+
}
278+
279+
/*
 * Tells whether `chr` belongs to the current line, i.e. it is neither a
 * newline nor the terminating NUL.
 */
bool char_is_not_eol(char chr) {
	const bool ends_line = chr == '\n' || chr == '\0';
	return !ends_line;
}

parse.h

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* This file is part of EasyCodeIt.
3+
*
4+
* Copyright (C) 2020 TheDcoder <[email protected]>
5+
*
6+
* EasyCodeIt is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* This program is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU General Public License
17+
* along with this program. If not, see <https://www.gnu.org/licenses/>.
18+
*/
19+
20+
#ifndef PARSE_H
#define PARSE_H

/* <stdbool.h> is required by the prototypes below; it keeps this header self-contained. */
#include <stdbool.h>
#include <stddef.h>

/* Kinds of token produced by token_get(). */
enum TokenType {
	TOK_UNKNOWN,
	TOK_WHITESPACE,
	TOK_COMMENT,
	TOK_DIRECTIVE,
	TOK_NUMBER,
	TOK_STRING,
	TOK_WORD,
	TOK_MACRO,
	TOK_VARIABLE,
	TOK_OPERATOR,
	TOK_BRACKET,
	TOK_DOT,
	TOK_COMMA,
};

/*
 * A token found in the source text.
 * `data` points into the scanned text and is not NUL-terminated; exactly
 * `data_len` characters belong to the token.
 */
struct Token {
	enum TokenType type;
	char *data;
	size_t data_len;
};

/* Tokenizes `code` and prints every non-whitespace token to stdout. */
void parse(char *code);

/*
 * Reads the token at the start of `code` and stores in `*next` the
 * position to resume from, or NULL once the end of the input is reached.
 */
struct Token token_get(char *code, char **next);

/* Counts the leading characters of `str` that `cmpfunc` accepts. */
size_t scan_string(char *str, bool (cmpfunc)(char));

/* Character classification predicates, usable as `cmpfunc` for scan_string(). */
bool char_is_whitespace(char chr);
bool char_is_alpha(char chr);
bool char_is_num(char chr);
bool char_is_alphanum(char chr);
bool char_is_opsym(char chr);
bool char_is_bracket(char chr);
bool char_is_not_eol(char chr);

#endif

0 commit comments

Comments
 (0)