iri_string/normalize/pct_case.rs
1//! Percent-encoding normalization and case normalization.
2
3use core::cmp::Ordering;
4use core::fmt::{self, Write as _};
5use core::marker::PhantomData;
6
7use crate::format::eq_str_display;
8use crate::parser::char::{is_ascii_unreserved, is_unreserved, is_utf8_byte_continue};
9use crate::parser::str::{find_split_hole, take_first_char};
10use crate::parser::trusted::take_xdigits2;
11use crate::spec::Spec;
12
13/// Returns true if the given string is percent-encoding normalized and case
14/// normalized.
15///
16/// Note that normalization of ASCII-only host requires additional case
17/// normalization, so checking by this function is not sufficient for that case.
18pub(crate) fn is_pct_case_normalized<S: Spec>(s: &str) -> bool {
19 eq_str_display(s, &PctCaseNormalized::<S>::new(s))
20}
21
22/// Returns a character for the slice.
23///
24/// Essentially equivalent to `core::str::from_utf8(bytes).unwrap().and_then(|s| s.get(0))`,
25/// but this function fully trusts that the input is a valid UTF-8 string with
26/// only one character.
27fn into_char_trusted(bytes: &[u8]) -> Result<char, ()> {
28 /// The bit mask to get the content part in a continue byte.
29 const CONTINUE_BYTE_MASK: u8 = 0b_0011_1111;
30 /// Minimum valid values for a code point in a UTF-8 sequence of 2, 3, and 4 bytes.
31 const MIN: [u32; 3] = [0x80, 0x800, 0x1_0000];
32
33 let len = bytes.len();
34 let c: u32 = match len {
35 2 => (u32::from(bytes[0] & 0b_0001_1111) << 6) | u32::from(bytes[1] & CONTINUE_BYTE_MASK),
36 3 => {
37 (u32::from(bytes[0] & 0b_0000_1111) << 12)
38 | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 6)
39 | u32::from(bytes[2] & CONTINUE_BYTE_MASK)
40 }
41 4 => {
42 (u32::from(bytes[0] & 0b_0000_0111) << 18)
43 | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 12)
44 | (u32::from(bytes[2] & CONTINUE_BYTE_MASK) << 6)
45 | u32::from(bytes[3] & CONTINUE_BYTE_MASK)
46 }
47 len => unreachable!(
48 "[consistency] expected 2, 3, or 4 bytes for a character, but got {len} as the length"
49 ),
50 };
51 if c < MIN[len - 2] {
52 // Redundant UTF-8 encoding.
53 return Err(());
54 }
55 // Can be an invalid Unicode code point.
56 char::from_u32(c).ok_or(())
57}
58
59/// Writable as a normalized path segment percent-encoding IRI.
60///
61/// This wrapper does the things below when being formatted:
62///
63/// * Decode unnecessarily percent-encoded characters.
64/// * Convert alphabetic characters uppercase in percent-encoded triplets.
65///
66/// Note that this does not newly encode raw characters.
67///
68/// # Safety
69///
70/// The given string should be the valid path segment.
71#[derive(Debug, Clone, Copy)]
72pub(crate) struct PctCaseNormalized<'a, S> {
73 /// Valid segment name to normalize.
74 segname: &'a str,
75 /// Spec.
76 _spec: PhantomData<fn() -> S>,
77}
78
79impl<'a, S: Spec> PctCaseNormalized<'a, S> {
80 /// Creates a new `PctCaseNormalized` value.
81 #[inline]
82 #[must_use]
83 pub(crate) fn new(source: &'a str) -> Self {
84 Self {
85 segname: source,
86 _spec: PhantomData,
87 }
88 }
89}
90
91impl<S: Spec> fmt::Display for PctCaseNormalized<'_, S> {
92 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
93 let mut rest = self.segname;
94
95 'outer_loop: while !rest.is_empty() {
96 // Scan the next percent-encoded triplet.
97 let (prefix, after_percent) = match find_split_hole(rest, b'%') {
98 Some(v) => v,
99 None => return f.write_str(rest),
100 };
101 // Write the string before the percent-encoded triplet.
102 f.write_str(prefix)?;
103 // Decode the percent-encoded triplet.
104 let (first_decoded, after_first_triplet) = take_xdigits2(after_percent);
105 rest = after_first_triplet;
106
107 if first_decoded.is_ascii() {
108 if is_ascii_unreserved(first_decoded) {
109 // Unreserved. Print the decoded.
110 f.write_char(char::from(first_decoded))?;
111 } else {
112 write!(f, "%{:02X}", first_decoded)?;
113 }
114 continue 'outer_loop;
115 }
116
117 // Continue byte cannot be the first byte of a character.
118 if is_utf8_byte_continue(first_decoded) {
119 write!(f, "%{:02X}", first_decoded)?;
120 continue 'outer_loop;
121 }
122
123 // Get the expected length of decoded char.
124 let expected_char_len = match (first_decoded & 0xf0).cmp(&0b1110_0000) {
125 Ordering::Less => 2,
126 Ordering::Equal => 3,
127 Ordering::Greater => 4,
128 };
129
130 // Get continue bytes.
131 let c_buf = &mut [first_decoded, 0, 0, 0][..expected_char_len];
132 for (i, buf_dest) in c_buf[1..].iter_mut().enumerate() {
133 match take_first_char(rest) {
134 Some(('%', after_percent)) => {
135 let (byte, after_triplet) = take_xdigits2(after_percent);
136 if !is_utf8_byte_continue(byte) {
137 // Note that `byte` can start the new string.
138 // Leave the byte in the `rest` for next try (i.e.
139 // don't update `rest` in this case).
140 c_buf[..=i]
141 .iter()
142 .try_for_each(|b| write!(f, "%{:02X}", b))?;
143 continue 'outer_loop;
144 }
145 *buf_dest = byte;
146 rest = after_triplet;
147 }
148 // If the next character is not `%`, decoded bytes so far
149 // won't be valid UTF-8 byte sequence.
150 // Write the read percent-encoded triplets without decoding.
151 // Note that all characters in `&c_buf[1..]` (if available)
152 // will be decoded to "continue byte" of UTF-8, so they
153 // cannot be the start of a valid UTF-8 byte sequence if
154 // decoded.
155 Some((c, after_percent)) => {
156 c_buf[..=i]
157 .iter()
158 .try_for_each(|b| write!(f, "%{:02X}", b))?;
159 f.write_char(c)?;
160 rest = after_percent;
161 continue 'outer_loop;
162 }
163 None => {
164 c_buf[..=i]
165 .iter()
166 .try_for_each(|b| write!(f, "%{:02X}", b))?;
167 // Reached the end of the string.
168 break 'outer_loop;
169 }
170 }
171 }
172
173 // Decode the bytes into a character.
174 match into_char_trusted(&c_buf[..expected_char_len]) {
175 Ok(decoded_c) => {
176 if is_unreserved::<S>(decoded_c) {
177 // Unreserved. Print the decoded.
178 f.write_char(decoded_c)?;
179 } else {
180 c_buf[0..expected_char_len]
181 .iter()
182 .try_for_each(|b| write!(f, "%{:02X}", b))?;
183 }
184 }
185 Err(_) => {
186 // Skip decoding of the entire sequence of pct-encoded triplets loaded
187 // in `c_buf`. This is valid from the reasons below.
188 //
189 // * The first byte in `c_buf` is valid as the first byte, and it tells the
190 // expected number of bytes for a code unit. The cases the bytes being too
191 // short and the sequence being incomplete have already been handled, and
192 // the execution does not reach here then.
193 // * All of the non-first bytes are checked if they are valid as UTF8 continue
194 // bytes by `is_utf8_byte_continue()`. If they're not, the decoding of
195 // that codepoint is aborted and the bytes in the buffer are immediately
196 // emitted as pct-encoded, and the execution does not reach here. This
197 // means that the bytes in the current `c_buf` have passed these tests.
198 // * Since all of the the non-first bytes are UTF8 continue bytes, any of
199 // them cannot start the new valid UTF-8 byte sequence. This means that
200 // if the bytes in the buffer does not consitute a valid UTF-8 bytes
201 // sequence, the whole buffer can immediately be emmitted as pct-encoded.
202
203 debug_assert!(
204 c_buf[1..expected_char_len]
205 .iter()
206 .copied()
207 .all(is_utf8_byte_continue),
208 "[consistency] all non-first bytes have been \
209 confirmed that they are UTF-8 continue bytes"
210 );
211 // Note that the first pct-encoded triplet is stripped from
212 // `after_first_triplet`.
213 rest = &after_first_triplet[((expected_char_len - 1) * 3)..];
214 c_buf[0..expected_char_len]
215 .iter()
216 .try_for_each(|b| write!(f, "%{:02X}", b))?;
217 }
218 }
219 }
220
221 Ok(())
222 }
223}
224
225/// Writable as a normalized ASCII-only `host` (and optionally `port` followed).
226#[derive(Debug, Clone, Copy)]
227pub(crate) struct NormalizedAsciiOnlyHost<'a> {
228 /// Valid host (and additionaly port) to normalize.
229 host_port: &'a str,
230}
231
232impl<'a> NormalizedAsciiOnlyHost<'a> {
233 /// Creates a new `NormalizedAsciiOnlyHost` value.
234 ///
235 /// # Preconditions
236 ///
237 /// The given string should be the valid ASCII-only `host` or
238 /// `host ":" port` after percent-encoding normalization.
239 /// In other words, [`parser::trusted::is_ascii_only_host`] should return
240 /// true for the given value.
241 ///
242 /// [`parser::trusted::is_ascii_only_host`]: `crate::parser::trusted::is_ascii_only_host`
243 #[inline]
244 #[must_use]
245 pub(crate) fn new(host_port: &'a str) -> Self {
246 Self { host_port }
247 }
248}
249
250impl fmt::Display for NormalizedAsciiOnlyHost<'_> {
251 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
252 let mut rest = self.host_port;
253
254 while !rest.is_empty() {
255 // Scan the next percent-encoded triplet.
256 let (prefix, after_percent) = match find_split_hole(rest, b'%') {
257 Some(v) => v,
258 None => {
259 return rest
260 .chars()
261 .try_for_each(|c| f.write_char(c.to_ascii_lowercase()));
262 }
263 };
264 // Write the string before the percent-encoded triplet.
265 prefix
266 .chars()
267 .try_for_each(|c| f.write_char(c.to_ascii_lowercase()))?;
268 // Decode the percent-encoded triplet.
269 let (first_decoded, after_triplet) = take_xdigits2(after_percent);
270 rest = after_triplet;
271
272 assert!(
273 first_decoded.is_ascii(),
274 "[consistency] this function requires ASCII-only host as an argument"
275 );
276
277 if is_ascii_unreserved(first_decoded) {
278 // Unreserved. Convert to lowercase and print.
279 f.write_char(char::from(first_decoded.to_ascii_lowercase()))?;
280 } else {
281 write!(f, "%{:02X}", first_decoded)?;
282 }
283 }
284
285 Ok(())
286 }
287}
288
289#[cfg(test)]
290#[cfg(feature = "alloc")]
291mod tests {
292 use super::*;
293
294 #[cfg(all(feature = "alloc", not(feature = "std")))]
295 use alloc::string::ToString;
296
297 use crate::spec::{IriSpec, UriSpec};
298
299 #[test]
300 fn invalid_utf8() {
301 assert_eq!(
302 PctCaseNormalized::<UriSpec>::new("%80%cc%cc%cc").to_string(),
303 "%80%CC%CC%CC"
304 );
305 assert_eq!(
306 PctCaseNormalized::<IriSpec>::new("%80%cc%cc%cc").to_string(),
307 "%80%CC%CC%CC"
308 );
309 }
310
311 #[test]
312 fn iri_unreserved() {
313 assert_eq!(
314 PctCaseNormalized::<UriSpec>::new("%ce%b1").to_string(),
315 "%CE%B1"
316 );
317 assert_eq!(
318 PctCaseNormalized::<IriSpec>::new("%ce%b1").to_string(),
319 "\u{03B1}"
320 );
321 }
322
323 #[test]
324 fn iri_middle_decode() {
325 assert_eq!(
326 PctCaseNormalized::<UriSpec>::new("%ce%ce%b1%b1").to_string(),
327 "%CE%CE%B1%B1"
328 );
329 assert_eq!(
330 PctCaseNormalized::<IriSpec>::new("%ce%ce%b1%b1").to_string(),
331 "%CE\u{03B1}%B1"
332 );
333 }
334
335 #[test]
336 fn ascii_reserved() {
337 assert_eq!(PctCaseNormalized::<UriSpec>::new("%3f").to_string(), "%3F");
338 assert_eq!(PctCaseNormalized::<IriSpec>::new("%3f").to_string(), "%3F");
339 }
340
341 #[test]
342 fn ascii_forbidden() {
343 assert_eq!(
344 PctCaseNormalized::<UriSpec>::new("%3c%3e").to_string(),
345 "%3C%3E"
346 );
347 assert_eq!(
348 PctCaseNormalized::<IriSpec>::new("%3c%3e").to_string(),
349 "%3C%3E"
350 );
351 }
352
353 #[test]
354 fn ascii_unreserved() {
355 assert_eq!(PctCaseNormalized::<UriSpec>::new("%7ea").to_string(), "~a");
356 assert_eq!(PctCaseNormalized::<IriSpec>::new("%7ea").to_string(), "~a");
357 }
358}