// include/litehtml/css_tokenizer.h

#ifndef LH_CSS_TOKENIZER_H
#define LH_CSS_TOKENIZER_H

#include "types.h"
#include <cstdio>

namespace litehtml
{

// Token kinds produced by the tokenizer, see https://www.w3.org/TR/css-syntax-3/#tokenization
//  - punctuation :;,()[]{} and <delim-token>: type is the character itself
//  - end of input: type is EOF (-1)
//  - type may also be 0, which signals an error, see at()
enum css_token_type
{
	// EOF and a handful of characters get explicit names. This helps when debugging and silences
	// warning C4063: case '41' is not a valid value for switch of enum 'litehtml::css_token_type'
	_EOF          = EOF,
	WHITESPACE    = ' ',
	BANG          = '!',
	AMPERSAND     = '&',
	LEFT_PAREN    = '(',
	RIGHT_PAREN   = ')',
	COMMA         = ',',
	DOT           = '.',
	COLON         = ':',
	SEMICOLON     = ';',
	LEFT_BRACKET  = '[',
	RIGHT_BRACKET = ']',
	LEFT_BRACE    = '{',
	RIGHT_BRACE   = '}',

	// Multi-character tokens. Negative values, so they cannot collide with any unicode char.
	IDENT = -20,
	FUNCTION,       // calc(
	AT_KEYWORD,     // @media
	HASH,           // #foo
	STRING,         // "xxx" or 'xxx'
	BAD_STRING,
	URL,            // url(x.com)  - but not url("x.com"), which is function + string + ')'
	BAD_URL,
	NUMBER,         // 25
	PERCENTAGE,     // 25%
	DIMENSION,      // 25px
	CDO,            // <!--
	CDC,            // -->

	// Component values, see https://www.w3.org/TR/css-syntax-3/#component-value
	CV_FUNCTION  = -100,
	// Simple blocks: the opening bracket character is encoded as an offset below CV_FUNCTION.
	CURLY_BLOCK  = CV_FUNCTION - '{',
	ROUND_BLOCK  = CV_FUNCTION - '(',
	SQUARE_BLOCK = CV_FUNCTION - '['
};

// Kind of numeric value stored in a css_token (used for NUMBER and DIMENSION tokens).
enum css_number_type
{
	css_number_integer = 0,
	css_number_number  = 1
};

// Flavour of a HASH token stored in css_token::hash_type.
enum css_hash_type
{
	css_hash_unrestricted = 0,
	css_hash_id           = 1
};

// css_token: CSS token or component value ("fat" token)
// Tokens exist in uncomponentized form only a short time after tokenization, most of the time they are "fat".
// All functions in css_parser work regardless of whether tokens are fat or not, as per standard.
// All functions outside of css_parser that parse media queries, selectors, property values assume tokens are componentized.
struct css_token
{
	css_token(css_token_type type = css_token_type(),
		float number = 0, css_number_type number_type = css_number_integer, string str = "")
		: type(type), str(str), n{number, number_type}
	{
		if (is_component_value()) new(&value) vector<css_token>;
	}

	css_token(css_token_type type, const string& str)
		: type(type), str(str), n()
	{
		if (is_component_value()) new(&value) vector<css_token>;
	}

	css_token(const css_token& token) : type(token.type), str(token.str), repr(token.repr)
	{
		switch (type)
		{
		case HASH:
			hash_type = token.hash_type;
			break;

		case NUMBER:
		case PERCENTAGE:
		case DIMENSION:
			n = token.n;
			break;

		case CV_FUNCTION:
		case CURLY_BLOCK:
		case ROUND_BLOCK:
		case SQUARE_BLOCK:
			new(&value) vector(token.value);
			break;

		default:;
		}
	}

	css_token& operator=(const css_token& token)
	{
		this->~css_token();
		new(this) css_token(token);
		return *this;
	}

	~css_token()
	{
		str.~string();
		if (is_component_value()) value.~vector();
	}

	bool is_component_value() const
	{
		return type <= CV_FUNCTION;
	}

	string ident() const;
	string get_repr(bool insert_spaces = false) const;

	union {
		css_token_type type;
		int ch; // used for <delim-token> or :;,()[]{}
	};
	union {
		string str;  // STRING, URL
		string name; // HASH, IDENT, AT_KEYWORD, FUNCTION, CV_FUNCTION
		string unit; // DIMENSION
	};
	struct number {
		float number; // NUMBER, PERCENTAGE, DIMENSION
		css_number_type number_type; // NUMBER, DIMENSION
	};
	union {
		css_hash_type hash_type; // HASH
		number n;
		vector<css_token> value; // CV_FUNCTION, XXX_BLOCK
	};

	string repr; // https://www.w3.org/TR/css-syntax-3/#representation
};

// Sequence of tokens / component values.
using css_token_vector = vector<css_token>;
// Returns a string built from `tokens` (defined elsewhere).
// NOTE(review): presumably concatenates the representations of `count` tokens starting at `index`,
// with count == -1 meaning "through the end" and insert_spaces adding separators - confirm against the definition.
string get_repr(const css_token_vector& tokens, int index = 0, int count = -1, bool insert_spaces = false);

// Tokenizer over a CSS source string. It keeps its own copy of the input and exposes
// a single operation, tokenize(); the consume_xxx helpers are declared here and defined elsewhere.
class css_tokenizer
{
public:
	// Stores a copy of `input`; reading starts at its first char.
	css_tokenizer(const string& input) : str(input) {}

	css_token_vector tokenize();

private:
	// Result of looking ahead at the next three input chars.
	struct three_chars { int _1, _2, _3; };

	// Code point classification.
	static bool is_whitespace(int ch);
	static bool is_non_printable_code_point(int ch);
	static bool is_ident_start_code_point(int ch);
	static bool is_ident_code_point(int ch);

	// Low-level access to the input stream.
	int consume_char();
	void unconsume_char();
	int peek_char();
	three_chars peek_chars();

	// Comments, escapes and strings.
	void consume_comments();
	int consume_escaped_code_point();
	css_token consume_string_token(int ending_code_point);

	// Identifiers.
	static bool would_start_ident_sequence(three_chars chars);
	string consume_ident_sequence();

	// Numbers.
	static bool would_start_a_number(int x, int y, int z);
	static double convert_string_to_number(const string& str);
	double consume_number(css_number_type& number_type);
	css_token consume_numeric_token();

	// URLs.
	void consume_remnants_of_bad_url();
	css_token consume_url_token();

	// Top-level token consumers.
	css_token consume_ident_like_token();
	css_token consume_token();

private:
	// The input stream: valid UTF-8 without NUL bytes.
	// https://www.w3.org/TR/css-syntax-3/#input-stream
	string str;

	// Position of the next input char.
	// https://www.w3.org/TR/css-syntax-3/#next-input-code-point
	int index = 0;

	// https://www.w3.org/TR/css-syntax-3/#current-input-code-point
	// Remembered so that unconsume_char can behave correctly when index == str.size():
	// "the last char was just read" has to be told apart from "we were already at the end
	// and just read NUL". Without this the tokenizer would loop forever on input "a".
	int current_char = 0;
};

// Declared here, defined elsewhere. Takes the message by value.
// NOTE(review): presumably reports a CSS parse error described by `msg` - confirm against the definition.
void css_parse_error(string msg);
// Convenience wrapper: runs a one-off css_tokenizer over `str` and returns its tokens.
inline css_token_vector tokenize(const string& str)
{
	css_tokenizer tokenizer(str);
	return tokenizer.tokenize();
}

} // namespace litehtml

#endif // LH_CSS_TOKENIZER_H