toml_parser/lexer/mod.rs

//! Lex TOML tokens
//!
//! To get started, see [`Source::lex`][crate::Source::lex]

#[cfg(test)]
#[cfg(feature = "std")]
mod test;
mod token;

#[cfg(feature = "alloc")]
use alloc::vec::Vec;

use winnow::stream::AsBStr as _;
use winnow::stream::ContainsToken as _;
use winnow::stream::FindSlice as _;
use winnow::stream::Location;
use winnow::stream::Stream as _;

use crate::Span;

pub use token::Token;
pub use token::TokenKind;
/// Lex TOML [tokens][Token]
///
/// To get started, see [`Source::lex`][crate::Source::lex]
pub struct Lexer<'i> {
    stream: Stream<'i>,
    eof: bool,
}

impl<'i> Lexer<'i> {
    pub(crate) fn new(input: &'i str) -> Self {
        let mut stream = Stream::new(input);
        if input.as_bytes().starts_with(BOM) {
            let offset = BOM.len();
            #[cfg(feature = "unsafe")] // SAFETY: `BOM` is a complete UTF-8 sequence, so `offset` lands on a UTF-8 boundary
            unsafe {
                stream.next_slice_unchecked(offset)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
        Lexer { stream, eof: false }
    }

    #[cfg(feature = "alloc")]
    pub fn into_vec(self) -> Vec<Token> {
        #![allow(unused_qualifications)] // due to MSRV of 1.66
        // Estimate one token per input byte, capped so `capacity * size_of::<Token>()`
        // cannot overflow `usize`
        let capacity = core::cmp::min(
            self.stream.len(),
            usize::MAX / core::mem::size_of::<Token>(),
        );
        let mut vec = Vec::with_capacity(capacity);
        vec.extend(self);
        vec
    }
}

impl Iterator for Lexer<'_> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        let Some(peek_byte) = self.stream.as_bstr().first() else {
            if self.eof {
                return None;
            } else {
                self.eof = true;
                let start = self.stream.current_token_start();
                let span = Span::new_unchecked(start, start);
                return Some(Token::new(TokenKind::Eof, span));
            }
        };
        Some(process_token(*peek_byte, &mut self.stream))
    }
}
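
// Illustrative usage sketch added for documentation: it drives the lexer purely
// through items defined in this module (`Lexer::new` and the `Iterator` impl). The
// expected token counts are inferred from `process_token` below; this is a sketch,
// not part of the crate's original test suite (which lives in `mod test`).
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn lexer_iterates_tokens_then_eof() {
    // `key = 1\n` should lex as: Atom("key"), Whitespace, Equals, Whitespace,
    // Atom("1"), Newline, and finally a zero-width Eof token.
    assert_eq!(Lexer::new("key = 1\n").count(), 7);

    // An empty input still yields the trailing Eof token exactly once.
    assert_eq!(Lexer::new("").count(), 1);
}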

const BOM: &[u8] = b"\xEF\xBB\xBF";

pub(crate) type Stream<'i> = winnow::stream::LocatingSlice<&'i str>;

fn process_token(peek_byte: u8, stream: &mut Stream<'_>) -> Token {
    let token = match peek_byte {
        b'.' => lex_ascii_char(stream, TokenKind::Dot),
        b'=' => lex_ascii_char(stream, TokenKind::Equals),
        b',' => lex_ascii_char(stream, TokenKind::Comma),
        b'[' => lex_ascii_char(stream, TokenKind::LeftSquareBracket),
        b']' => lex_ascii_char(stream, TokenKind::RightSquareBracket),
        b'{' => lex_ascii_char(stream, TokenKind::LeftCurlyBracket),
        b'}' => lex_ascii_char(stream, TokenKind::RightCurlyBracket),
        b' ' => lex_whitespace(stream),
        b'\t' => lex_whitespace(stream),
        b'#' => lex_comment(stream),
        b'\r' => lex_crlf(stream),
        b'\n' => lex_ascii_char(stream, TokenKind::Newline),
        b'\'' => {
            if stream.starts_with(ML_LITERAL_STRING_DELIM) {
                lex_ml_literal_string(stream)
            } else {
                lex_literal_string(stream)
            }
        }
        b'"' => {
            if stream.starts_with(ML_BASIC_STRING_DELIM) {
                lex_ml_basic_string(stream)
            } else {
                lex_basic_string(stream)
            }
        }
        _ => lex_atom(stream),
    };
    token
}

/// Process an ASCII character token
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream` must be non-empty
/// - `stream[0]` must be ASCII
fn lex_ascii_char(stream: &mut Stream<'_>, kind: TokenKind) -> Token {
    debug_assert!(!stream.is_empty());
    let start = stream.current_token_start();

    let offset = 1; // an ascii character
    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(kind, span)
}

/// Process Whitespace
///
/// ```bnf
/// ;; Whitespace
///
/// ws = *wschar
/// wschar =  %x20  ; Space
/// wschar =/ %x09  ; Horizontal tab
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream` must be non-empty
fn lex_whitespace(stream: &mut Stream<'_>) -> Token {
    debug_assert!(!stream.is_empty());
    let start = stream.current_token_start();

    let offset = stream
        .as_bstr()
        .offset_for(|b| !WSCHAR.contains_token(b))
        .unwrap_or(stream.eof_offset());
    #[cfg(feature = "unsafe")] // SAFETY: WSCHAR ensures `offset` will be at UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Whitespace, span)
}
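
// Illustrative sketch: a run of spaces and tabs is folded into a single
// `Whitespace` token, so the only other token for this input is the trailing
// `Eof`. Added for documentation; not part of the original `mod test` suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn whitespace_run_is_a_single_token() {
    assert_eq!(Lexer::new(" \t \t ").count(), 2); // Whitespace + Eof
}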

/// ```bnf
/// wschar =  %x20  ; Space
/// wschar =/ %x09  ; Horizontal tab
/// ```
pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');

/// Process Comment
///
/// ```bnf
/// ;; Comment
///
/// comment-start-symbol = %x23 ; #
/// non-ascii = %x80-D7FF / %xE000-10FFFF
/// non-eol = %x09 / %x20-7F / non-ascii
///
/// comment = comment-start-symbol *non-eol
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'#'`
fn lex_comment(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = stream
        .as_bstr()
        .find_slice((b'\r', b'\n'))
        .map(|s| s.start)
        .unwrap_or_else(|| stream.eof_offset());
    #[cfg(feature = "unsafe")] // SAFETY: newlines ensure `offset` is along UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Comment, span)
}
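
// Illustrative sketch: a comment runs up to (but does not include) the line
// ending, and a comment at end-of-input simply runs to the end of the stream.
// Added for documentation; not part of the original `mod test` suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn comment_stops_before_the_newline() {
    assert_eq!(Lexer::new("# note\n").count(), 3); // Comment + Newline + Eof
    assert_eq!(Lexer::new("# note").count(), 2); // Comment + Eof
}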

/// `comment-start-symbol = %x23 ; #`
pub(crate) const COMMENT_START_SYMBOL: u8 = b'#';

/// Process Newline
///
/// ```bnf
/// ;; Newline
///
/// newline =  %x0A     ; LF
/// newline =/ %x0D.0A  ; CRLF
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'\r'`
fn lex_crlf(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let mut offset = '\r'.len_utf8();
    let has_lf = stream.as_bstr().get(1) == Some(&b'\n');
    if has_lf {
        offset += '\n'.len_utf8();
    }

    #[cfg(feature = "unsafe")] // SAFETY: newlines ensure `offset` is along UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);
    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);

    Token::new(TokenKind::Newline, span)
}
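
// Illustrative sketch: a CRLF pair is consumed as one `Newline` token, and a bare
// CR is also consumed as a single `Newline` token here (whether a lone CR is valid
// TOML is presumably decided by later stages). Added for documentation; not part
// of the original `mod test` suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn crlf_is_one_newline_token() {
    assert_eq!(Lexer::new("\r\n").count(), 2); // Newline + Eof
    assert_eq!(Lexer::new("\r").count(), 2); // Newline (bare CR) + Eof
}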

/// Process literal string
///
/// ```bnf
/// ;; Literal String
///
/// literal-string = apostrophe *literal-char apostrophe
///
/// apostrophe = %x27 ; ' apostrophe
///
/// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'\''`
fn lex_literal_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = 1; // APOSTROPHE
    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let offset = match stream.as_bstr().find_slice((APOSTROPHE, b'\n')) {
        Some(span) => {
            if stream.as_bstr()[span.start] == APOSTROPHE {
                span.end
            } else {
                span.start
            }
        }
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    // SAFETY: `APOSTROPHE`/newline ensure `offset` is along UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::LiteralString, span)
}
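
// Illustrative sketch: a terminated literal string is one token including both
// apostrophes, while an unterminated one stops before the newline so later stages
// can report the error and keep going. Added for documentation; not part of the
// original `mod test` suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn literal_string_stops_at_quote_or_newline() {
    assert_eq!(Lexer::new("'abc'").count(), 2); // LiteralString + Eof
    // LiteralString("'abc") + Newline + Atom("x") + Eof
    assert_eq!(Lexer::new("'abc\nx").count(), 4);
}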

/// `apostrophe = %x27 ; ' apostrophe`
pub(crate) const APOSTROPHE: u8 = b'\'';

/// Process multi-line literal string
///
/// ```bnf
/// ;; Multiline Literal String
///
/// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
///                     ml-literal-string-delim
/// ml-literal-string-delim = 3apostrophe
/// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
///
/// mll-content = mll-char / newline
/// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// mll-quotes = 1*2apostrophe
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream.starts_with(ML_LITERAL_STRING_DELIM)`
fn lex_ml_literal_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = ML_LITERAL_STRING_DELIM.len();
    #[cfg(feature = "unsafe")] // SAFETY: the delimiter is all ASCII, so `offset` is along a UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let offset = match stream.as_bstr().find_slice(ML_LITERAL_STRING_DELIM) {
        Some(span) => span.end,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    // SAFETY: `ML_LITERAL_STRING_DELIM` ensures `offset` is along UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
        let offset = 1;
        #[cfg(feature = "unsafe")] // SAFETY: `APOSTROPHE` ensures `offset` is along UTF-8 boundary
        unsafe {
            stream.next_slice_unchecked(offset)
        };
        #[cfg(not(feature = "unsafe"))]
        stream.next_slice(offset);

        if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
            let offset = 1;
            #[cfg(feature = "unsafe")]
            // SAFETY: `APOSTROPHE` ensures `offset` is along UTF-8 boundary
            unsafe {
                stream.next_slice_unchecked(offset)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::MlLiteralString, span)
}
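
// Illustrative sketch: the whole multi-line literal string, including both `'''`
// delimiters and up to two quotes immediately following the closing delimiter, is
// one token. Added for documentation; not part of the original `mod test` suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn ml_literal_string_is_one_token() {
    assert_eq!(Lexer::new("'''a'''").count(), 2); // MlLiteralString + Eof
    assert_eq!(Lexer::new("'''a''''").count(), 2); // trailing quote folded in + Eof
}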

/// `ml-literal-string-delim = 3apostrophe`
pub(crate) const ML_LITERAL_STRING_DELIM: &str = "'''";

/// Process basic string
///
/// ```bnf
/// ;; Basic String
///
/// basic-string = quotation-mark *basic-char quotation-mark
///
/// quotation-mark = %x22            ; "
///
/// basic-char = basic-unescaped / escaped
/// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// escaped = escape escape-seq-char
///
/// escape = %x5C                   ; \
/// escape-seq-char =  %x22         ; "    quotation mark  U+0022
/// escape-seq-char =/ %x5C         ; \    reverse solidus U+005C
/// escape-seq-char =/ %x62         ; b    backspace       U+0008
/// escape-seq-char =/ %x66         ; f    form feed       U+000C
/// escape-seq-char =/ %x6E         ; n    line feed       U+000A
/// escape-seq-char =/ %x72         ; r    carriage return U+000D
/// escape-seq-char =/ %x74         ; t    tab             U+0009
/// escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
/// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'"'`
fn lex_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = 1; // QUOTATION_MARK
    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    loop {
        // newline is present for error recovery
        match stream.as_bstr().find_slice((QUOTATION_MARK, ESCAPE, b'\n')) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `ESCAPE` ensures `offset` is along UTF-8 boundary
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);

                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1; // ESCAPE / QUOTATION_MARK
                            #[cfg(feature = "unsafe")]
                            // SAFETY: `ESCAPE` / `QUOTATION_MARK` ensure `offset` is along UTF-8 boundary
                            unsafe {
                                stream.next_slice_unchecked(offset)
                            };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else if found == b'\n' {
                    let offset = span.start;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: newline ensures `offset` is along UTF-8 boundary
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else {
                    unreachable!("found `{found}`");
                }
            }
            None => {
                stream.finish();
                break;
            }
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::BasicString, span)
}
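
// Illustrative sketch: escaped quotes stay inside a single `BasicString` token,
// and an unterminated basic string stops before the newline for error recovery.
// Added for documentation; not part of the original `mod test` suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn basic_string_handles_escapes_and_recovery() {
    assert_eq!(Lexer::new(r#""a\"b""#).count(), 2); // BasicString + Eof
    // BasicString("\"abc") + Newline + Atom("x") + Eof
    assert_eq!(Lexer::new("\"abc\nx").count(), 4);
}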

/// `quotation-mark = %x22            ; "`
pub(crate) const QUOTATION_MARK: u8 = b'"';

/// `escape = %x5C                   ; \`
pub(crate) const ESCAPE: u8 = b'\\';

/// Process multi-line basic string
///
/// ```bnf
/// ;; Multiline Basic String
///
/// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
///                   ml-basic-string-delim
/// ml-basic-string-delim = 3quotation-mark
/// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
///
/// mlb-content = mlb-char / newline / mlb-escaped-nl
/// mlb-char = mlb-unescaped / escaped
/// mlb-quotes = 1*2quotation-mark
/// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// mlb-escaped-nl = escape ws newline *( wschar / newline )
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream.starts_with(ML_BASIC_STRING_DELIM)`
fn lex_ml_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = ML_BASIC_STRING_DELIM.len();
    #[cfg(feature = "unsafe")] // SAFETY: the delimiter is all ASCII, so `offset` is along a UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    loop {
        // `ESCAPE` is matched so escaped quotes are not mistaken for the closing delimiter
        match stream.as_bstr().find_slice((ML_BASIC_STRING_DELIM, "\\")) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `ESCAPE` ensures `offset` is along UTF-8 boundary
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);

                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1; // ESCAPE / QUOTATION_MARK
                            #[cfg(feature = "unsafe")]
                            // SAFETY: `QUOTATION_MARK`/`ESCAPE` ensure `offset` is along UTF-8 boundary
                            unsafe {
                                stream.next_slice_unchecked(offset)
                            };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else {
                    unreachable!("found `{found}`");
                }
            }
            None => {
                stream.finish();
                break;
            }
        }
    }
    if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
        let offset = 1;
        #[cfg(feature = "unsafe")]
        // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
        unsafe {
            stream.next_slice_unchecked(offset)
        };
        #[cfg(not(feature = "unsafe"))]
        stream.next_slice(offset);
        if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
            let offset = 1;
            #[cfg(feature = "unsafe")]
            // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
            unsafe {
                stream.next_slice_unchecked(offset)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::MlBasicString, span)
}
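
// Illustrative sketch: the multi-line basic string, including both `"""`
// delimiters, is one token, and an escaped quote inside the body does not end
// the scan early. Added for documentation; not part of the original `mod test`
// suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn ml_basic_string_is_one_token() {
    assert_eq!(Lexer::new("\"\"\"a\"\"\"").count(), 2); // MlBasicString + Eof
    assert_eq!(Lexer::new("\"\"\"a\\\"b\"\"\"").count(), 2); // escaped quote inside + Eof
}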

/// `ml-basic-string-delim = 3quotation-mark`
pub(crate) const ML_BASIC_STRING_DELIM: &str = "\"\"\"";

/// Process Atom
///
/// This is everything else
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream` must be non-empty
fn lex_atom(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    const TOKEN_START: &[u8] = b".=,[]{} \t#\r\n)'\"";
    let offset = stream
        .as_bstr()
        .offset_for(|b| TOKEN_START.contains_token(b))
        .unwrap_or_else(|| stream.eof_offset());
    #[cfg(feature = "unsafe")] // SAFETY: `TOKEN_START` ensures `offset` is along UTF-8 boundary
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Atom, span)
}
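
// Illustrative sketch: everything that is not punctuation, whitespace, a comment,
// a newline, or a string start is swept up into a single `Atom`, so bare keys,
// numbers, and date-times each lex as one token. Added for documentation; not
// part of the original `mod test` suite.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn atoms_are_lexed_greedily() {
    assert_eq!(Lexer::new("true").count(), 2); // Atom + Eof
    assert_eq!(Lexer::new("1979-05-27T07:32:00Z").count(), 2); // Atom + Eof
}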