iri_string/parser/str/
maybe_pct_encoded.rs

1//! Processor for possibly- or invalidly-percent-encoded strings.
2
3use core::fmt::{self, Write as _};
4use core::marker::PhantomData;
5use core::num::NonZeroU8;
6use core::ops::ControlFlow;
7
8use crate::parser::str::find_split;
9use crate::parser::trusted::hexdigits_to_byte;
10
/// Fragment in a possibly percent-encoded (and possibly broken) string.
///
/// Values of this type are passed to the callback given to
/// `process_percent_encoded_best_effort`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum PctEncodedFragments<'a> {
    /// String fragment without percent-encoded triplets.
    ///
    /// This is also used for a hexdigit that follows a stray `%`.
    NoPctStr(&'a str),
    /// Stray `%` (percent) character.
    StrayPercent,
    /// Valid percent-encoded triplets for a character.
    // `.0`: The percent-encoded triplets.
    // `.1`: The character decoded from the triplets.
    Char(&'a str, char),
    /// Percent-encoded triplets that do not form a valid UTF-8 sequence.
    ///
    /// The string can be empty when this fragment is reported right before
    /// a stray `%`.
    InvalidUtf8PctTriplets(&'a str),
}
23
24/// Processes characters in a string which may contain (possibly invalid) percent-encoded triplets.
25pub(crate) fn process_percent_encoded_best_effort<T, F, B>(
26    v: T,
27    mut f: F,
28) -> Result<ControlFlow<B>, fmt::Error>
29where
30    T: fmt::Display,
31    F: FnMut(PctEncodedFragments<'_>) -> ControlFlow<B>,
32{
33    let mut buf = [0_u8; 12];
34    let mut writer = DecomposeWriter {
35        f: &mut f,
36        decoder: Default::default(),
37        buf: &mut buf,
38        result: ControlFlow::Continue(()),
39        _r: PhantomData,
40    };
41
42    if write!(writer, "{v}").is_err() {
43        match writer.result {
44            ControlFlow::Continue(_) => return Err(fmt::Error),
45            ControlFlow::Break(v) => return Ok(ControlFlow::Break(v)),
46        }
47    }
48
49    // Flush the internal buffer of the decoder.
50    if let Some(len) = writer.decoder.flush(&mut buf).map(|v| usize::from(v.get())) {
51        let len_suffix = len % 3;
52        let triplets_end = len - len_suffix;
53        let triplets = core::str::from_utf8(&buf[..triplets_end])
54            .expect("[validity] percent-encoded triplets consist of ASCII characters");
55        if let ControlFlow::Break(v) = f(PctEncodedFragments::InvalidUtf8PctTriplets(triplets)) {
56            return Ok(ControlFlow::Break(v));
57        }
58
59        if len_suffix > 0 {
60            if let ControlFlow::Break(v) = f(PctEncodedFragments::StrayPercent) {
61                return Ok(ControlFlow::Break(v));
62            }
63        }
64        if len_suffix > 1 {
65            let after_percent = core::str::from_utf8(
66                &buf[(triplets_end + 1)..(triplets_end + len_suffix)],
67            )
68            .expect("[consistency] percent-encoded triplets contains only ASCII characters");
69            if let ControlFlow::Break(v) = f(PctEncodedFragments::NoPctStr(after_percent)) {
70                return Ok(ControlFlow::Break(v));
71            }
72        }
73    }
74
75    Ok(ControlFlow::Continue(()))
76}
77
/// Writer to decompose the input into fragments.
///
/// The text written through `fmt::Write` is fed to the decoder, and the
/// resulting fragments are passed to the output function `f`.
struct DecomposeWriter<'a, F, B> {
    /// Output function.
    f: &'a mut F,
    /// Decoder.
    decoder: DecoderBuffer,
    /// Buffer.
    ///
    /// The decoder writes the processed percent-encoded data here, and the
    /// fragments passed to the output function borrow from it.
    buf: &'a mut [u8],
    /// Result of the last output function call.
    result: ControlFlow<B>,
    /// Dummy field for the type parameter of the return type of the function `f`.
    _r: PhantomData<fn() -> B>,
}
91impl<F, B> DecomposeWriter<'_, F, B>
92where
93    F: FnMut(PctEncodedFragments<'_>) -> ControlFlow<B>,
94{
95    /// Returns `Ok(_)` if the stored result is `Continue`, and `Err(_)` otherwise.
96    #[inline(always)]
97    fn result_continue_or_err(&self) -> fmt::Result {
98        if self.result.is_break() {
99            return Err(fmt::Error);
100        }
101        Ok(())
102    }
103
104    /// Calls the output functions with the undecodable fragments.
105    fn output_as_undecodable(&mut self, len_undecodable: u8) -> fmt::Result {
106        let len_written = usize::from(len_undecodable);
107        let frag = core::str::from_utf8(&self.buf[..len_written])
108            .expect("[validity] `DecoderBuffer` writes a valid ASCII string");
109        let len_incomplete = len_written % 3;
110        let len_complete = len_written - len_incomplete;
111        self.result = (self.f)(PctEncodedFragments::InvalidUtf8PctTriplets(
112            &frag[..len_complete],
113        ));
114        self.result_continue_or_err()?;
115        if len_incomplete > 0 {
116            // At least the first `%` exists.
117            self.result = (self.f)(PctEncodedFragments::StrayPercent);
118            if self.result.is_break() {
119                return Err(fmt::Error);
120            }
121            if len_incomplete > 1 {
122                // A following hexdigit is available.
123                debug_assert_eq!(
124                    len_incomplete, 2,
125                    "[consistency] the length of incomplete percent-encoded \
126                         triplet must be less than 2 bytes"
127                );
128                self.result = (self.f)(PctEncodedFragments::NoPctStr(
129                    &frag[(len_complete + 1)..len_written],
130                ));
131                self.result_continue_or_err()?;
132            }
133        }
134        Ok(())
135    }
136}
137
138impl<F, B> fmt::Write for DecomposeWriter<'_, F, B>
139where
140    F: FnMut(PctEncodedFragments<'_>) -> ControlFlow<B>,
141{
142    fn write_str(&mut self, s: &str) -> fmt::Result {
143        self.result_continue_or_err()?;
144        let mut rest = s;
145        while !rest.is_empty() {
146            let (len_consumed, result) = self.decoder.push_encoded(self.buf, rest);
147            if len_consumed == 0 {
148                // `rest` does not start with the percent-encoded triplets.
149                // Flush the decoder before attempting to decode more data.
150                if let Some(len_written) = self.decoder.flush(self.buf).map(NonZeroU8::get) {
151                    self.output_as_undecodable(len_written)?;
152                    rest = &rest[usize::from(len_written)..];
153                }
154
155                // Write plain string prefix (if found).
156                let (plain_prefix, suffix) = find_split(rest, b'%').unwrap_or((rest, ""));
157                debug_assert!(
158                    !plain_prefix.is_empty(),
159                    "[consistency] `len_consumed == 0` indicates non-empty \
160                     `rest` not starting with `%`"
161                );
162                self.result = (self.f)(PctEncodedFragments::NoPctStr(plain_prefix));
163                self.result_continue_or_err()?;
164                rest = suffix;
165                continue;
166            }
167
168            // Process decoding result.
169            match result {
170                PushResult::Decoded(len_written, c) => {
171                    let len_written = usize::from(len_written.get());
172                    let frag = core::str::from_utf8(&self.buf[..len_written])
173                        .expect("[validity] `DecoderBuffer` writes a valid ASCII string");
174                    self.result = (self.f)(PctEncodedFragments::Char(frag, c));
175                    self.result_continue_or_err()?;
176                }
177                PushResult::Undecodable(len_written) => {
178                    self.output_as_undecodable(len_written)?;
179                }
180                PushResult::NeedMoreBytes => {
181                    // Nothing to write at this time.
182                }
183            }
184            rest = &rest[len_consumed..];
185        }
186        Ok(())
187    }
188}
189
/// A type for result of feeding data to [`DecoderBuffer`].
#[derive(Debug, Clone, Copy)]
enum PushResult {
    /// Input is still incomplete, needs more bytes to get the decoding result.
    NeedMoreBytes,
    /// Bytes decodable to valid UTF-8 sequence.
    // `.0`: Length of decodable fragment.
    // `.1`: Decoded character.
    Decoded(NonZeroU8, char),
    /// Valid percent-encoded triplets but not decodable to valid UTF-8 sequence.
    ///
    /// An incomplete triplet (`%`, or `%` and a hexdigit) can follow the
    /// triplets, so the length is not always a multiple of 3. The length is
    /// zero when the decoder had nothing buffered.
    // `.0`: Length of undecodable fragment.
    Undecodable(u8),
}
203
/// Buffer to contain (and to decode) incomplete percent-encoded triplets.
#[derive(Default, Debug, Clone, Copy)]
struct DecoderBuffer {
    /// Percent-encoded triplets that possibly form a valid UTF-8 sequence after decoded.
    //
    // `3 * 4`: 3 ASCII characters for single percent-encoded triplet, and
    // 4 triplets at most for single Unicode codepoint in UTF-8.
    encoded: [u8; 12],
    /// Decoded bytes.
    decoded: [u8; 4],
    /// Number of bytes available in `encoded` buffer.
    ///
    /// `len_encoded / 3` also indicates the length of data in `decoded`.
    len_encoded: u8,
}
219
220impl DecoderBuffer {
221    /// Writes the data of the given length to the destination, and remove that part from buffer.
222    fn write_and_pop(&mut self, dest: &mut [u8], remove_len: u8) {
223        let new_len = self.len_encoded - remove_len;
224        let remove_len = usize::from(remove_len);
225        let src_range = remove_len..usize::from(self.len_encoded);
226        dest[..remove_len].copy_from_slice(&self.encoded[..remove_len]);
227
228        if new_len == 0 {
229            *self = Self::default();
230            return;
231        }
232        self.encoded.copy_within(src_range, 0);
233        self.decoded
234            .copy_within((remove_len / 3)..usize::from(self.len_encoded / 3), 0);
235        self.len_encoded = new_len;
236    }
237
238    /// Pushes a byte of a (possible) percent-encoded tripet to the buffer.
239    fn push_single_encoded_byte(&mut self, byte: u8) {
240        debug_assert!(
241            self.len_encoded < 12,
242            "[consistency] four percent-encoded triplets are enough for a unicode code point"
243        );
244        let pos_enc = usize::from(self.len_encoded);
245        self.len_encoded += 1;
246        self.encoded[pos_enc] = byte;
247        if self.len_encoded % 3 == 0 {
248            // A new percent-encoded triplet is read. Decode and remember.
249            let pos_dec = usize::from(self.len_encoded / 3 - 1);
250            let upper = self.encoded[pos_enc - 1];
251            let lower = byte;
252            debug_assert!(
253                upper.is_ascii_hexdigit() && lower.is_ascii_hexdigit(),
254                "[consistency] the `encoded` buffer should contain valid percent-encoded triplets"
255            );
256            self.decoded[pos_dec] = hexdigits_to_byte([upper, lower]);
257        }
258    }
259
260    /// Pushes the (possibly) encoded string to the buffer.
261    ///
262    /// When the push result is not `PctTripletPushResult::NeedMoreBytes`, the
263    /// caller should call `Self::clear()` before pushing more bytes.
264    ///
265    /// # Preconditions
266    ///
267    /// * `buf` should be more than 12 bytes. If not, this method may panic.
268    #[must_use]
269    pub(crate) fn push_encoded(&mut self, buf: &mut [u8], s: &str) -> (usize, PushResult) {
270        debug_assert!(
271            buf.len() >= 12,
272            "[internal precondition] destination buffer should be at least 12 bytes"
273        );
274        let mut chars = s.chars();
275        let mut len_triplet_incomplete = self.len_encoded % 3;
276        for c in &mut chars {
277            if len_triplet_incomplete == 0 {
278                // Expect `%`.
279                if c != '%' {
280                    // Undecodable.
281                    // `-1`: the last byte is peeked but not consumed.
282                    let len_consumed = s.len() - chars.as_str().len() - 1;
283                    let len_result = self.len_encoded;
284                    self.write_and_pop(buf, len_result);
285                    return (len_consumed, PushResult::Undecodable(len_result));
286                }
287                self.push_single_encoded_byte(b'%');
288                len_triplet_incomplete = 1;
289                continue;
290            }
291
292            // Expect a nibble.
293            if !c.is_ascii_hexdigit() {
294                // Undecodable.
295                // `-1`: the last byte is peeked but not consumed.
296                let len_consumed = s.len() - chars.as_str().len() - 1;
297                let len_result = self.len_encoded;
298                self.write_and_pop(buf, len_result);
299                return (len_consumed, PushResult::Undecodable(len_result));
300            }
301            self.push_single_encoded_byte(c as u8);
302            if len_triplet_incomplete == 1 {
303                len_triplet_incomplete = 2;
304                continue;
305            } else {
306                // Now a new percent-encoded triplet is read!
307                debug_assert_eq!(len_triplet_incomplete, 2);
308                len_triplet_incomplete = 0;
309            }
310
311            // Now a new percent-encoded triplet is read.
312            // Check if the buffer contains a valid decodable content.
313            let len_decoded = usize::from(self.len_encoded) / 3;
314            match core::str::from_utf8(&self.decoded[..len_decoded]) {
315                Ok(decoded_str) => {
316                    // Successfully decoded.
317                    let len_consumed = s.len() - chars.as_str().len();
318                    let c = decoded_str
319                        .chars()
320                        .next()
321                        .expect("[validity] `decoded` buffer is nonempty");
322                    let len_result = NonZeroU8::new(self.len_encoded).expect(
323                        "[consistency] `encoded` buffer is nonempty since \
324                         `push_single_encoded_byte()` was called",
325                    );
326                    self.write_and_pop(buf, len_result.get());
327                    return (len_consumed, PushResult::Decoded(len_result, c));
328                }
329                Err(e) => {
330                    // Undecodable.
331                    assert_eq!(
332                        e.valid_up_to(),
333                        0,
334                        "[consistency] `decoded` buffer contains at most one character"
335                    );
336                    let skip_len_decoded = match e.error_len() {
337                        // Unexpected EOF. Wait for remaining input.
338                        None => continue,
339                        // Skip invalid bytes.
340                        Some(v) => v,
341                    };
342                    let len_consumed = s.len() - chars.as_str().len();
343                    let len_result = skip_len_decoded as u8 * 3;
344                    assert_ne!(
345                        skip_len_decoded, 0,
346                        "[consistency] empty bytes cannot be invalid"
347                    );
348                    self.write_and_pop(buf, len_result);
349                    return (len_consumed, PushResult::Undecodable(len_result));
350                }
351            };
352        }
353        let len_consumed = s.len() - chars.as_str().len();
354        (len_consumed, PushResult::NeedMoreBytes)
355    }
356
357    /// Writes the incomplete data completely to the destination, and clears the internal buffer.
358    #[must_use]
359    pub(crate) fn flush(&mut self, buf: &mut [u8]) -> Option<NonZeroU8> {
360        let len_result = NonZeroU8::new(self.len_encoded)?;
361        // Emit the current (undecodable) buffer as is.
362        self.write_and_pop(buf, len_result.get());
363        debug_assert_eq!(
364            self.len_encoded, 0,
365            "[consistency] the buffer should be cleared after flushed"
366        );
367        Some(len_result)
368    }
369}