Inja 3.4.0
A Template Engine for Modern C++
Loading...
Searching...
No Matches
lexer.hpp
1#ifndef INCLUDE_INJA_LEXER_HPP_
2#define INCLUDE_INJA_LEXER_HPP_
3
4#include <cctype>
5#include <locale>
6
7#include "config.hpp"
8#include "token.hpp"
9#include "utils.hpp"
10
11namespace inja {
12
16class Lexer {
// States of the lexer state machine; scan() dispatches on the current value.
// Each *Start state consumes its opening sequence, emits the matching open
// token and switches to the corresponding *Body state.
17 enum class State {
// Plain template text: scan() searches for the next opening sequence.
18 Text,
// Positioned at config.expression_open.
19 ExpressionStart,
// Positioned at config.expression_open_force_lstrip.
20 ExpressionStartForceLstrip,
// Inside an expression; tokens come from scan_body() until the expression close.
21 ExpressionBody,
// Positioned at config.line_statement.
22 LineStart,
// Inside a line statement; scan_body() closes it at "\n".
23 LineBody,
// Positioned at config.statement_open.
24 StatementStart,
// Positioned at config.statement_open_no_lstrip.
25 StatementStartNoLstrip,
// Positioned at config.statement_open_force_lstrip.
26 StatementStartForceLstrip,
// Inside a statement; tokens come from scan_body() until the statement close.
27 StatementBody,
// Positioned at config.comment_open.
28 CommentStart,
// Positioned at config.comment_open_force_lstrip.
29 CommentStartForceLstrip,
// Inside a comment; scan() skips to config.comment_close and emits one CommentClose token.
30 CommentBody,
31 };
32
// Controls how scan_body() interprets a '-' character.
33 enum class MinusState {
// '-' is emitted as a Minus token.
34 Operator,
// '-' begins a numeric literal and is handed to scan_number().
35 Number,
36 };
37
// Lexer configuration (delimiter sequences and strip/trim flags), held by reference.
38 const LexerConfig& config;
39
// Current state of the state machine driving scan().
40 State state;
// Whether the next '-' seen by scan_body() is an operator or starts a number.
41 MinusState minus_state;
// Input being lexed; assigned in start() (a leading UTF-8 BOM is removed there).
42 std::string_view m_in;
// Offset in m_in where the current token begins.
43 size_t tok_start;
// Offset in m_in one past the last consumed character; make_token() slices [tok_start, pos).
44 size_t pos;
45
46 Token scan_body(std::string_view close, Token::Kind closeKind, std::string_view close_trim = std::string_view(), bool trim = false) {
47 again:
48 // skip whitespace (except for \n as it might be a close)
49 if (tok_start >= m_in.size()) {
50 return make_token(Token::Kind::Eof);
51 }
52 const char ch = m_in[tok_start];
53 if (ch == ' ' || ch == '\t' || ch == '\r') {
54 tok_start += 1;
55 goto again;
56 }
57
58 // check for close
59 if (!close_trim.empty() && inja::string_view::starts_with(m_in.substr(tok_start), close_trim)) {
60 state = State::Text;
61 pos = tok_start + close_trim.size();
62 const Token tok = make_token(closeKind);
63 skip_whitespaces_and_newlines();
64 return tok;
65 }
66
67 if (inja::string_view::starts_with(m_in.substr(tok_start), close)) {
68 state = State::Text;
69 pos = tok_start + close.size();
70 const Token tok = make_token(closeKind);
71 if (trim) {
72 skip_whitespaces_and_first_newline();
73 }
74 return tok;
75 }
76
77 // skip \n
78 if (ch == '\n') {
79 tok_start += 1;
80 goto again;
81 }
82
83 pos = tok_start + 1;
84 if (std::isalpha(ch)) {
85 minus_state = MinusState::Operator;
86 return scan_id();
87 }
88
89 const MinusState current_minus_state = minus_state;
90 if (minus_state == MinusState::Operator) {
91 minus_state = MinusState::Number;
92 }
93
94 switch (ch) {
95 case '+':
96 return make_token(Token::Kind::Plus);
97 case '-':
98 if (current_minus_state == MinusState::Operator) {
99 return make_token(Token::Kind::Minus);
100 }
101 return scan_number();
102 case '*':
103 return make_token(Token::Kind::Times);
104 case '/':
105 return make_token(Token::Kind::Slash);
106 case '^':
107 return make_token(Token::Kind::Power);
108 case '%':
109 return make_token(Token::Kind::Percent);
110 case '.':
111 return make_token(Token::Kind::Dot);
112 case ',':
113 return make_token(Token::Kind::Comma);
114 case ':':
115 return make_token(Token::Kind::Colon);
116 case '(':
117 return make_token(Token::Kind::LeftParen);
118 case ')':
119 minus_state = MinusState::Operator;
120 return make_token(Token::Kind::RightParen);
121 case '[':
122 return make_token(Token::Kind::LeftBracket);
123 case ']':
124 minus_state = MinusState::Operator;
125 return make_token(Token::Kind::RightBracket);
126 case '{':
127 return make_token(Token::Kind::LeftBrace);
128 case '}':
129 minus_state = MinusState::Operator;
130 return make_token(Token::Kind::RightBrace);
131 case '>':
132 if (pos < m_in.size() && m_in[pos] == '=') {
133 pos += 1;
134 return make_token(Token::Kind::GreaterEqual);
135 }
136 return make_token(Token::Kind::GreaterThan);
137 case '<':
138 if (pos < m_in.size() && m_in[pos] == '=') {
139 pos += 1;
140 return make_token(Token::Kind::LessEqual);
141 }
142 return make_token(Token::Kind::LessThan);
143 case '=':
144 if (pos < m_in.size() && m_in[pos] == '=') {
145 pos += 1;
146 return make_token(Token::Kind::Equal);
147 }
148 return make_token(Token::Kind::Unknown);
149 case '!':
150 if (pos < m_in.size() && m_in[pos] == '=') {
151 pos += 1;
152 return make_token(Token::Kind::NotEqual);
153 }
154 return make_token(Token::Kind::Unknown);
155 case '\"':
156 return scan_string();
157 case '0':
158 case '1':
159 case '2':
160 case '3':
161 case '4':
162 case '5':
163 case '6':
164 case '7':
165 case '8':
166 case '9':
167 minus_state = MinusState::Operator;
168 return scan_number();
169 case '_':
170 case '@':
171 case '$':
172 minus_state = MinusState::Operator;
173 return scan_id();
174 default:
175 return make_token(Token::Kind::Unknown);
176 }
177 }
178
179 Token scan_id() {
180 for (;;) {
181 if (pos >= m_in.size()) {
182 break;
183 }
184 const char ch = m_in[pos];
185 if (!std::isalnum(ch) && ch != '.' && ch != '/' && ch != '_' && ch != '-') {
186 break;
187 }
188 pos += 1;
189 }
190 return make_token(Token::Kind::Id);
191 }
192
193 Token scan_number() {
194 for (;;) {
195 if (pos >= m_in.size()) {
196 break;
197 }
198 const char ch = m_in[pos];
199 // be very permissive in lexer (we'll catch errors when conversion happens)
200 if (!(std::isdigit(ch) || ch == '.' || ch == 'e' || ch == 'E' || (ch == '+' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')) || (ch == '-' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')))) {
201 break;
202 }
203 pos += 1;
204 }
205 return make_token(Token::Kind::Number);
206 }
207
208 Token scan_string() {
209 bool escape {false};
210 for (;;) {
211 if (pos >= m_in.size()) {
212 break;
213 }
214 const char ch = m_in[pos++];
215 if (ch == '\\') {
216 escape = true;
217 } else if (!escape && ch == m_in[tok_start]) {
218 break;
219 } else {
220 escape = false;
221 }
222 }
223 return make_token(Token::Kind::String);
224 }
225
226 Token make_token(Token::Kind kind) const {
227 return Token(kind, string_view::slice(m_in, tok_start, pos));
228 }
229
230 void skip_whitespaces_and_newlines() {
231 if (pos < m_in.size()) {
232 while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t' || m_in[pos] == '\n' || m_in[pos] == '\r')) {
233 pos += 1;
234 }
235 }
236 }
237
238 void skip_whitespaces_and_first_newline() {
239 if (pos < m_in.size()) {
240 while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t')) {
241 pos += 1;
242 }
243 }
244
245 if (pos < m_in.size()) {
246 const char ch = m_in[pos];
247 if (ch == '\n') {
248 pos += 1;
249 } else if (ch == '\r') {
250 pos += 1;
251 if (pos < m_in.size() && m_in[pos] == '\n') {
252 pos += 1;
253 }
254 }
255 }
256 }
257
// Removes the trailing run of blanks (' ' and '\t') from `text` when that run
// makes up the whole last line, i.e. it is preceded by a line break or reaches
// back to the start of `text`. Otherwise `text` is returned unchanged.
static std::string_view clear_final_line_if_whitespace(std::string_view text) {
  const std::size_t last_solid = text.find_last_not_of(" \t");
  if (last_solid == std::string_view::npos) {
    // nothing but blanks (or empty): everything is stripped
    return text.substr(0, 0);
  }
  const char boundary = text[last_solid];
  if (boundary == '\n' || boundary == '\r') {
    return text.substr(0, last_solid + 1);
  }
  return text;
}
272
273public:
274 explicit Lexer(const LexerConfig& config): config(config), state(State::Text), minus_state(MinusState::Number) {}
275
276 SourceLocation current_position() const {
277 return get_source_location(m_in, tok_start);
278 }
279
280 void start(std::string_view input) {
281 m_in = input;
282 tok_start = 0;
283 pos = 0;
284 state = State::Text;
285 minus_state = MinusState::Number;
286
287 // Consume byte order mark (BOM) for UTF-8
288 if (inja::string_view::starts_with(m_in, "\xEF\xBB\xBF")) {
289 m_in = m_in.substr(3);
290 }
291 }
292
293 Token scan() {
294 tok_start = pos;
295
296 again:
297 if (tok_start >= m_in.size()) {
298 return make_token(Token::Kind::Eof);
299 }
300
301 switch (state) {
302 default:
303 case State::Text: {
304 // fast-scan to first open character
305 const size_t open_start = m_in.substr(pos).find_first_of(config.open_chars);
306 if (open_start == std::string_view::npos) {
307 // didn't find open, return remaining text as text token
308 pos = m_in.size();
309 return make_token(Token::Kind::Text);
310 }
311 pos += open_start;
312
313 // try to match one of the opening sequences, and get the close
314 std::string_view open_str = m_in.substr(pos);
315 bool must_lstrip = false;
316 if (inja::string_view::starts_with(open_str, config.expression_open)) {
317 if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) {
318 state = State::ExpressionStartForceLstrip;
319 must_lstrip = true;
320 } else {
321 state = State::ExpressionStart;
322 }
323 } else if (inja::string_view::starts_with(open_str, config.statement_open)) {
324 if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) {
325 state = State::StatementStartNoLstrip;
326 } else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip)) {
327 state = State::StatementStartForceLstrip;
328 must_lstrip = true;
329 } else {
330 state = State::StatementStart;
331 must_lstrip = config.lstrip_blocks;
332 }
333 } else if (inja::string_view::starts_with(open_str, config.comment_open)) {
334 if (inja::string_view::starts_with(open_str, config.comment_open_force_lstrip)) {
335 state = State::CommentStartForceLstrip;
336 must_lstrip = true;
337 } else {
338 state = State::CommentStart;
339 must_lstrip = config.lstrip_blocks;
340 }
341 } else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) {
342 state = State::LineStart;
343 } else {
344 pos += 1; // wasn't actually an opening sequence
345 goto again;
346 }
347
348 std::string_view text = string_view::slice(m_in, tok_start, pos);
349 if (must_lstrip) {
350 text = clear_final_line_if_whitespace(text);
351 }
352
353 if (text.empty()) {
354 goto again; // don't generate empty token
355 }
356 return Token(Token::Kind::Text, text);
357 }
358 case State::ExpressionStart: {
359 state = State::ExpressionBody;
360 pos += config.expression_open.size();
361 return make_token(Token::Kind::ExpressionOpen);
362 }
363 case State::ExpressionStartForceLstrip: {
364 state = State::ExpressionBody;
365 pos += config.expression_open_force_lstrip.size();
366 return make_token(Token::Kind::ExpressionOpen);
367 }
368 case State::LineStart: {
369 state = State::LineBody;
370 pos += config.line_statement.size();
371 return make_token(Token::Kind::LineStatementOpen);
372 }
373 case State::StatementStart: {
374 state = State::StatementBody;
375 pos += config.statement_open.size();
376 return make_token(Token::Kind::StatementOpen);
377 }
378 case State::StatementStartNoLstrip: {
379 state = State::StatementBody;
380 pos += config.statement_open_no_lstrip.size();
381 return make_token(Token::Kind::StatementOpen);
382 }
383 case State::StatementStartForceLstrip: {
384 state = State::StatementBody;
385 pos += config.statement_open_force_lstrip.size();
386 return make_token(Token::Kind::StatementOpen);
387 }
388 case State::CommentStart: {
389 state = State::CommentBody;
390 pos += config.comment_open.size();
391 return make_token(Token::Kind::CommentOpen);
392 }
393 case State::CommentStartForceLstrip: {
394 state = State::CommentBody;
395 pos += config.comment_open_force_lstrip.size();
396 return make_token(Token::Kind::CommentOpen);
397 }
398 case State::ExpressionBody:
399 return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip);
400 case State::LineBody:
401 return scan_body("\n", Token::Kind::LineStatementClose);
402 case State::StatementBody:
403 return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks);
404 case State::CommentBody: {
405 // fast-scan to comment close
406 const size_t end = m_in.substr(pos).find(config.comment_close);
407 if (end == std::string_view::npos) {
408 pos = m_in.size();
409 return make_token(Token::Kind::Eof);
410 }
411
412 // Check for trim pattern
413 const bool must_rstrip = inja::string_view::starts_with(m_in.substr(pos + end - 1), config.comment_close_force_rstrip);
414
415 // return the entire comment in the close token
416 state = State::Text;
417 pos += end + config.comment_close.size();
418 Token tok = make_token(Token::Kind::CommentClose);
419
420 if (must_rstrip || config.trim_blocks) {
421 skip_whitespaces_and_first_newline();
422 }
423 return tok;
424 }
425 }
426 }
427
428 const LexerConfig& get_config() const {
429 return config;
430 }
431};
432
433} // namespace inja
434
435#endif // INCLUDE_INJA_LEXER_HPP_
Class for lexing an inja Template.
Definition: lexer.hpp:16
Class for lexer configuration.
Definition: config.hpp:14
Definition: exceptions.hpp:9
Helper-class for the inja Lexer.
Definition: token.hpp:12