jiff/util/
utf8.rs

1use core::cmp::Ordering;
2
3/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
4///
5/// If no valid encoding of a codepoint exists at the beginning of the given
6/// byte slice, then the first byte is returned instead.
7///
8/// This returns `None` if and only if `bytes` is empty.
9///
10/// This never panics.
11///
12/// *WARNING*: This is not designed for performance. If you're looking for a
13/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
14/// crate, then please file an issue and discuss your use case.
15pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
16    if bytes.is_empty() {
17        return None;
18    }
19    let len = match utf8_len(bytes[0]) {
20        None => return Some(Err(bytes[0])),
21        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
22        Some(1) => return Some(Ok(char::from(bytes[0]))),
23        Some(len) => len,
24    };
25    match core::str::from_utf8(&bytes[..len]) {
26        Ok(s) => Some(Ok(s.chars().next().unwrap())),
27        Err(_) => Some(Err(bytes[0])),
28    }
29}
30
31/// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`.
32#[inline]
33pub(crate) fn cmp_ignore_ascii_case(s1: &str, s2: &str) -> Ordering {
34    cmp_ignore_ascii_case_bytes(s1.as_bytes(), s2.as_bytes())
35}
36
/// Compares two byte slices while ignoring ASCII case, yielding a complete
/// `Ordering` rather than just the equality answer that std's
/// `eq_ignore_ascii_case` gives.
///
/// Bytes are compared pairwise after ASCII lowercasing. The first unequal
/// pair decides the result. If one slice is a (case insensitive) prefix of
/// the other, then the shorter slice orders first.
#[inline]
pub(crate) fn cmp_ignore_ascii_case_bytes(s1: &[u8], s2: &[u8]) -> Ordering {
    // An earlier version of this function lowercased both sides lazily and
    // used `Iterator::cmp`:
    //
    //     let it1 = s1.iter().map(|&b| b.to_ascii_lowercase());
    //     let it2 = s2.iter().map(|&b| b.to_ascii_lowercase());
    //     it1.cmp(it2)
    //
    // A hand-written loop was observed to do better in microbenchmarks, so
    // this remains an explicit loop.
    // NOTE(review): re-measure if the exact shape of this loop matters.
    for (&lhs, &rhs) in s1.iter().zip(s2) {
        let (lhs, rhs) = (lhs.to_ascii_lowercase(), rhs.to_ascii_lowercase());
        if lhs != rhs {
            return lhs.cmp(&rhs);
        }
    }
    // The common prefix is equal, so the lengths break the tie.
    s1.len().cmp(&s2.len())
}
61
/// Given a UTF-8 leading byte, this returns the total number of code units
/// (bytes) in the encoded codepoint that it begins.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`. The bytes that are never valid leading bytes are:
///
/// * `0x80..=0xBF`, which are continuation bytes.
/// * `0xC0` and `0xC1`, which could only begin overlong two byte encodings.
/// * `0xF5..=0xFF`, which would begin encodings above `U+10FFFF` or are not
///   used by UTF-8 at all.
///
/// Note that a `Some` result only classifies the leading byte. It does not
/// imply that the bytes following it form a valid encoding, so callers must
/// still validate the complete sequence.
fn utf8_len(byte: u8) -> Option<usize> {
    match byte {
        0x00..=0x7F => Some(1),
        0xC2..=0xDF => Some(2),
        0xE0..=0xEF => Some(3),
        0xF0..=0xF4 => Some(4),
        _ => None,
    }
}