1#[cfg(test)]
6#[cfg(feature = "std")]
7mod test;
8mod token;
9
10#[cfg(feature = "alloc")]
11use alloc::vec::Vec;
12
13use winnow::stream::AsBStr as _;
14use winnow::stream::ContainsToken as _;
15use winnow::stream::FindSlice as _;
16use winnow::stream::Location;
17use winnow::stream::Stream as _;
18
19use crate::Span;
20
21pub use token::Token;
22pub use token::TokenKind;
23
/// A tokenizer over a TOML document, yielding [`Token`]s via [`Iterator`].
///
/// Emits a final zero-width [`TokenKind::Eof`] token once the input is
/// exhausted, then terminates.
pub struct Lexer<'i> {
    // Remaining input; tracks byte offsets so tokens carry accurate spans.
    stream: Stream<'i>,
    // Whether the trailing `Eof` token has already been emitted.
    eof: bool,
}
31
32impl<'i> Lexer<'i> {
33 pub(crate) fn new(input: &'i str) -> Self {
34 let mut stream = Stream::new(input);
35 if input.as_bytes().starts_with(BOM) {
36 let offset = BOM.len();
37 #[cfg(feature = "unsafe")] unsafe {
39 stream.next_slice_unchecked(offset)
40 };
41 #[cfg(not(feature = "unsafe"))]
42 stream.next_slice(offset);
43 }
44 Lexer { stream, eof: false }
45 }
46
47 #[cfg(feature = "alloc")]
48 pub fn into_vec(self) -> Vec<Token> {
49 #![allow(unused_qualifications)] let capacity = core::cmp::min(
51 self.stream.len(),
52 usize::MAX / core::mem::size_of::<Token>(),
53 );
54 let mut vec = Vec::with_capacity(capacity);
55 vec.extend(self);
56 vec
57 }
58}
59
60impl Iterator for Lexer<'_> {
61 type Item = Token;
62
63 fn next(&mut self) -> Option<Self::Item> {
64 let Some(peek_byte) = self.stream.as_bstr().first() else {
65 if self.eof {
66 return None;
67 } else {
68 self.eof = true;
69 let start = self.stream.current_token_start();
70 let span = Span::new_unchecked(start, start);
71 return Some(Token::new(TokenKind::Eof, span));
72 }
73 };
74 Some(process_token(*peek_byte, &mut self.stream))
75 }
76}
77
/// UTF-8 byte-order mark; skipped by `Lexer::new` when it prefixes the input.
const BOM: &[u8] = b"\xEF\xBB\xBF";

/// Input stream: a `&str` wrapper that tracks byte offsets for span reporting.
pub(crate) type Stream<'i> = winnow::stream::LocatingSlice<&'i str>;
81
82fn process_token(peek_byte: u8, stream: &mut Stream<'_>) -> Token {
83 let token = match peek_byte {
84 b'.' => lex_ascii_char(stream, TokenKind::Dot),
85 b'=' => lex_ascii_char(stream, TokenKind::Equals),
86 b',' => lex_ascii_char(stream, TokenKind::Comma),
87 b'[' => lex_ascii_char(stream, TokenKind::LeftSquareBracket),
88 b']' => lex_ascii_char(stream, TokenKind::RightSquareBracket),
89 b'{' => lex_ascii_char(stream, TokenKind::LeftCurlyBracket),
90 b'}' => lex_ascii_char(stream, TokenKind::RightCurlyBracket),
91 b' ' => lex_whitespace(stream),
92 b'\t' => lex_whitespace(stream),
93 b'#' => lex_comment(stream),
94 b'\r' => lex_crlf(stream),
95 b'\n' => lex_ascii_char(stream, TokenKind::Newline),
96 b'\'' => {
97 if stream.starts_with(ML_LITERAL_STRING_DELIM) {
98 lex_ml_literal_string(stream)
99 } else {
100 lex_literal_string(stream)
101 }
102 }
103 b'"' => {
104 if stream.starts_with(ML_BASIC_STRING_DELIM) {
105 lex_ml_basic_string(stream)
106 } else {
107 lex_basic_string(stream)
108 }
109 }
110 _ => lex_atom(stream),
111 };
112 token
113}
114
115fn lex_ascii_char(stream: &mut Stream<'_>, kind: TokenKind) -> Token {
123 debug_assert!(!stream.is_empty());
124 let start = stream.current_token_start();
125
126 let offset = 1; #[cfg(feature = "unsafe")] unsafe {
129 stream.next_slice_unchecked(offset)
130 };
131 #[cfg(not(feature = "unsafe"))]
132 stream.next_slice(offset);
133
134 let end = stream.previous_token_end();
135 let span = Span::new_unchecked(start, end);
136 Token::new(kind, span)
137}
138
139fn lex_whitespace(stream: &mut Stream<'_>) -> Token {
154 debug_assert!(!stream.is_empty());
155 let start = stream.current_token_start();
156
157 let offset = stream
158 .as_bstr()
159 .offset_for(|b| !WSCHAR.contains_token(b))
160 .unwrap_or(stream.eof_offset());
161 #[cfg(feature = "unsafe")] unsafe {
163 stream.next_slice_unchecked(offset)
164 };
165 #[cfg(not(feature = "unsafe"))]
166 stream.next_slice(offset);
167
168 let end = stream.previous_token_end();
169 let span = Span::new_unchecked(start, end);
170 Token::new(TokenKind::Whitespace, span)
171}
172
173pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');
178
179fn lex_comment(stream: &mut Stream<'_>) -> Token {
196 let start = stream.current_token_start();
197
198 let offset = stream
199 .as_bytes()
200 .find_slice((b'\r', b'\n'))
201 .map(|s| s.start)
202 .unwrap_or_else(|| stream.eof_offset());
203 #[cfg(feature = "unsafe")] unsafe {
205 stream.next_slice_unchecked(offset)
206 };
207 #[cfg(not(feature = "unsafe"))]
208 stream.next_slice(offset);
209
210 let end = stream.previous_token_end();
211 let span = Span::new_unchecked(start, end);
212 Token::new(TokenKind::Comment, span)
213}
214
215pub(crate) const COMMENT_START_SYMBOL: u8 = b'#';
217
218fn lex_crlf(stream: &mut Stream<'_>) -> Token {
232 let start = stream.current_token_start();
233
234 let mut offset = '\r'.len_utf8();
235 let has_lf = stream.as_bstr().get(1) == Some(&b'\n');
236 if has_lf {
237 offset += '\n'.len_utf8();
238 }
239
240 #[cfg(feature = "unsafe")] unsafe {
242 stream.next_slice_unchecked(offset)
243 };
244 #[cfg(not(feature = "unsafe"))]
245 stream.next_slice(offset);
246 let end = stream.previous_token_end();
247 let span = Span::new_unchecked(start, end);
248
249 Token::new(TokenKind::Newline, span)
250}
251
252fn lex_literal_string(stream: &mut Stream<'_>) -> Token {
269 let start = stream.current_token_start();
270
271 let offset = 1; #[cfg(feature = "unsafe")] unsafe {
274 stream.next_slice_unchecked(offset)
275 };
276 #[cfg(not(feature = "unsafe"))]
277 stream.next_slice(offset);
278
279 let offset = match stream.as_bstr().find_slice((APOSTROPHE, b'\n')) {
280 Some(span) => {
281 if stream.as_bstr()[span.start] == APOSTROPHE {
282 span.end
283 } else {
284 span.start
285 }
286 }
287 None => stream.eof_offset(),
288 };
289 #[cfg(feature = "unsafe")]
290 unsafe {
292 stream.next_slice_unchecked(offset)
293 };
294 #[cfg(not(feature = "unsafe"))]
295 stream.next_slice(offset);
296
297 let end = stream.previous_token_end();
298 let span = Span::new_unchecked(start, end);
299 Token::new(TokenKind::LiteralString, span)
300}
301
302pub(crate) const APOSTROPHE: u8 = b'\'';
304
305fn lex_ml_literal_string(stream: &mut Stream<'_>) -> Token {
325 let start = stream.current_token_start();
326
327 let offset = ML_LITERAL_STRING_DELIM.len();
328 #[cfg(feature = "unsafe")] unsafe {
330 stream.next_slice_unchecked(offset)
331 };
332 #[cfg(not(feature = "unsafe"))]
333 stream.next_slice(offset);
334
335 let offset = match stream.as_bstr().find_slice(ML_LITERAL_STRING_DELIM) {
336 Some(span) => span.end,
337 None => stream.eof_offset(),
338 };
339 #[cfg(feature = "unsafe")]
340 unsafe {
342 stream.next_slice_unchecked(offset)
343 };
344 #[cfg(not(feature = "unsafe"))]
345 stream.next_slice(offset);
346
347 if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
348 let offset = 1;
349 #[cfg(feature = "unsafe")] unsafe {
351 stream.next_slice_unchecked(offset)
352 };
353 #[cfg(not(feature = "unsafe"))]
354 stream.next_slice(offset);
355
356 if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
357 let offset = 1;
358 #[cfg(feature = "unsafe")]
359 unsafe {
361 stream.next_slice_unchecked(offset)
362 };
363 #[cfg(not(feature = "unsafe"))]
364 stream.next_slice(offset);
365 }
366 }
367
368 let end = stream.previous_token_end();
369 let span = Span::new_unchecked(start, end);
370 Token::new(TokenKind::MlLiteralString, span)
371}
372
373pub(crate) const ML_LITERAL_STRING_DELIM: &str = "'''";
375
376fn lex_basic_string(stream: &mut Stream<'_>) -> Token {
406 let start = stream.current_token_start();
407
408 let offset = 1; #[cfg(feature = "unsafe")] unsafe {
411 stream.next_slice_unchecked(offset)
412 };
413 #[cfg(not(feature = "unsafe"))]
414 stream.next_slice(offset);
415
416 loop {
417 match stream.as_bstr().find_slice((QUOTATION_MARK, ESCAPE, b'\n')) {
419 Some(span) => {
420 let found = stream.as_bstr()[span.start];
421 if found == QUOTATION_MARK {
422 let offset = span.end;
423 #[cfg(feature = "unsafe")]
424 unsafe {
426 stream.next_slice_unchecked(offset)
427 };
428 #[cfg(not(feature = "unsafe"))]
429 stream.next_slice(offset);
430 break;
431 } else if found == ESCAPE {
432 let offset = span.end;
433 #[cfg(feature = "unsafe")]
434 unsafe {
436 stream.next_slice_unchecked(offset)
437 };
438 #[cfg(not(feature = "unsafe"))]
439 stream.next_slice(offset);
440
441 let peek = stream.as_bstr().peek_token();
442 match peek {
443 Some(ESCAPE) | Some(QUOTATION_MARK) => {
444 let offset = 1; #[cfg(feature = "unsafe")]
446 #[cfg(feature = "unsafe")]
447 unsafe {
449 stream.next_slice_unchecked(offset)
450 };
451 #[cfg(not(feature = "unsafe"))]
452 stream.next_slice(offset);
453 }
454 _ => {}
455 }
456 continue;
457 } else if found == b'\n' {
458 let offset = span.start;
459 #[cfg(feature = "unsafe")]
460 unsafe {
462 stream.next_slice_unchecked(offset)
463 };
464 #[cfg(not(feature = "unsafe"))]
465 stream.next_slice(offset);
466 break;
467 } else {
468 unreachable!("found `{found}`");
469 }
470 }
471 None => {
472 stream.finish();
473 break;
474 }
475 }
476 }
477
478 let end = stream.previous_token_end();
479 let span = Span::new_unchecked(start, end);
480 Token::new(TokenKind::BasicString, span)
481}
482
483pub(crate) const QUOTATION_MARK: u8 = b'"';
485
486pub(crate) const ESCAPE: u8 = b'\\';
488
/// Lexes a multi-line basic string, `"""…"""`.
///
/// Consumes through the closing `"""` (plus up to two trailing quotes for
/// bodies ending in `"` or `""`), skipping over escaped `\\` and `\"` pairs.
/// Runs to end of input when unterminated. Escape validity is checked later.
fn lex_ml_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    // Step over the opening `"""` delimiter.
    let offset = ML_BASIC_STRING_DELIM.len();
    #[cfg(feature = "unsafe")] unsafe {
        // SAFETY: the stream starts with the ASCII delimiter, so advancing
        // its length is in bounds and on a char boundary.
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    loop {
        // Find whichever comes first: the closing delimiter or an escape.
        match stream.as_bstr().find_slice((ML_BASIC_STRING_DELIM, "\\")) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    // Closing delimiter: consume through it and finish.
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    // Consume through the escape byte, then also consume an
                    // escaped `\` or `"` so it cannot be misread as part of
                    // the closing delimiter.
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);

                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1; #[cfg(feature = "unsafe")]
                            unsafe {
                                stream.next_slice_unchecked(offset)
                            };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else {
                    unreachable!("found `{found}`");
                }
            }
            None => {
                // Unterminated: consume the rest of the input.
                stream.finish();
                break;
            }
        }
    }
    // The body may itself end in one or two `"` immediately before the
    // delimiter (`""""` / `"""""`); fold them into this token.
    if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
        let offset = 1;
        #[cfg(feature = "unsafe")]
        unsafe {
            stream.next_slice_unchecked(offset)
        };
        #[cfg(not(feature = "unsafe"))]
        stream.next_slice(offset);
        if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
            let offset = 1;
            #[cfg(feature = "unsafe")]
            unsafe {
                stream.next_slice_unchecked(offset)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::MlBasicString, span)
}
596
597pub(crate) const ML_BASIC_STRING_DELIM: &str = "\"\"\"";
599
600fn lex_atom(stream: &mut Stream<'_>) -> Token {
609 let start = stream.current_token_start();
610
611 const TOKEN_START: &[u8] = b".=,[]{} \t#\r\n)'\"";
612 let offset = stream
613 .as_bstr()
614 .offset_for(|b| TOKEN_START.contains_token(b))
615 .unwrap_or_else(|| stream.eof_offset());
616 #[cfg(feature = "unsafe")] unsafe {
618 stream.next_slice_unchecked(offset)
619 };
620 #[cfg(not(feature = "unsafe"))]
621 stream.next_slice(offset);
622
623 let end = stream.previous_token_end();
624 let span = Span::new_unchecked(start, end);
625 Token::new(TokenKind::Atom, span)
626}