convert_case/boundary.rs
1use unicode_segmentation::UnicodeSegmentation;
2
/// Returns `true` when every character of the grapheme is an ASCII digit (`0`-`9`).
///
/// Non-ASCII digits are rejected. An empty string vacuously yields `true`.
fn grapheme_is_digit(c: &&str) -> bool {
    // Every byte of a multi-byte UTF-8 character is >= 0x80, so checking bytes
    // gives the same answer as checking chars.
    c.bytes().all(|byte| byte.is_ascii_digit())
}
6
/// Returns `true` when the grapheme is cased and is already in its uppercase form.
///
/// Graphemes without distinct upper/lower forms (digits, punctuation, ...) are
/// never considered uppercase.
fn grapheme_is_uppercase(c: &&str) -> bool {
    let upper = c.to_uppercase();
    if upper == c.to_lowercase() {
        // No case distinction at all, so this is not a letter with case.
        return false;
    }
    upper == *c
}
10
/// Returns `true` when the grapheme is cased and is already in its lowercase form.
///
/// Graphemes without distinct upper/lower forms (digits, punctuation, ...) are
/// never considered lowercase.
fn grapheme_is_lowercase(c: &&str) -> bool {
    let lower = c.to_lowercase();
    if c.to_uppercase() == lower {
        // No case distinction at all, so this is not a letter with case.
        return false;
    }
    lower == *c
}
14
/// How an identifier is split into words.
///
/// Some boundaries, `HYPHEN`, `UNDERSCORE`, and `SPACE`, consume the character they
/// split on, whereas the other boundaries do not.
///
/// `Boundary` includes methods that return useful groups of boundaries.  It also
/// contains the [`defaults_from`](Boundary::defaults_from) method which will generate a subset
/// of default boundaries based on the boundaries present in a string.
///
/// You can also create custom delimiter boundaries using the [`from_delim`](Boundary::from_delim)
/// method or directly instantiate `Boundary` for complex boundary conditions.
///
/// Equality and hashing both consider only the [`name`](Boundary::name) field, so
/// two boundaries with the same name are interchangeable as set members or map keys.
///
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::defaults_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[derive(Debug, Eq, Clone, Copy)]
pub struct Boundary {
    /// A unique name used for comparison and hashing.
    pub name: &'static str,
    /// A function that determines if this boundary is present at the start
    /// of the given grapheme slice.  Second argument is the `arg` field.
    pub condition: fn(&[&str], Option<&'static str>) -> bool,
    /// An optional string passed to `condition` at runtime.  Used
    /// internally for [`Boundary::from_delim`] method.
    pub arg: Option<&'static str>,
    /// Where the beginning of the boundary is, as a grapheme offset from the
    /// position at which `condition` matched.
    pub start: usize,
    /// The length of the boundary.  This is the number of graphemes that
    /// are removed when splitting.
    pub len: usize,
}

impl PartialEq for Boundary {
    /// Boundaries are identified solely by their `name`.
    fn eq(&self, other: &Self) -> bool {
        self.name == other.name
    }
}

impl std::hash::Hash for Boundary {
    /// Hashes only `name` so that `a == b` implies `hash(a) == hash(b)`.
    ///
    /// Deriving `Hash` would also feed `condition`, `arg`, `start` and `len` into the
    /// hasher, which disagrees with the name-only `PartialEq` above and breaks the
    /// contract required by `HashMap` and `HashSet`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(self.name, state);
    }
}
64
65impl Boundary {
66 /// Splits on space, consuming the character on segmentation.
67 /// ```
68 /// # use convert_case::Boundary;
69 /// assert_eq!(
70 /// vec![Boundary::SPACE],
71 /// Boundary::defaults_from(" ")
72 /// );
73 /// ```
74 pub const SPACE: Boundary = Boundary {
75 name: "Space",
76 condition: |s, _| s.get(0) == Some(&" "),
77 arg: None,
78 start: 0,
79 len: 1,
80 };
81
82 /// Splits on `-`, consuming the character on segmentation.
83 /// ```
84 /// # use convert_case::Boundary;
85 /// assert_eq!(
86 /// vec![Boundary::HYPHEN],
87 /// Boundary::defaults_from("-")
88 /// );
89 /// ```
90 pub const HYPHEN: Boundary = Boundary {
91 name: "Hyphen",
92 condition: |s, _| s.get(0) == Some(&"-"),
93 arg: None,
94 start: 0,
95 len: 1,
96 };
97
98 /// Splits on `_`, consuming the character on segmentation.
99 /// ```
100 /// # use convert_case::Boundary;
101 /// assert_eq!(
102 /// vec![Boundary::UNDERSCORE],
103 /// Boundary::defaults_from("_")
104 /// );
105 /// ```
106 pub const UNDERSCORE: Boundary = Boundary {
107 name: "Underscore",
108 condition: |s, _| s.get(0) == Some(&"_"),
109 arg: None,
110 start: 0,
111 len: 1,
112 };
113
114 /// Splits where a lowercase letter is followed by an uppercase letter.
115 /// ```
116 /// # use convert_case::Boundary;
117 /// assert_eq!(
118 /// vec![Boundary::LOWER_UPPER],
119 /// Boundary::defaults_from("aA")
120 /// );
121 /// ```
122 pub const LOWER_UPPER: Boundary = Boundary {
123 name: "LowerUpper",
124 condition: |s, _| {
125 s.get(0).map(grapheme_is_lowercase) == Some(true)
126 && s.get(1).map(grapheme_is_uppercase) == Some(true)
127 },
128 arg: None,
129 start: 1,
130 len: 0,
131 };
132 /// Splits where an uppercase letter is followed by a lowercase letter. This is seldom used,
133 /// and is **not** included in the [defaults](Boundary::defaults).
134 /// ```
135 /// # use convert_case::Boundary;
136 /// assert!(
137 /// Boundary::defaults_from("Aa").len() == 0
138 /// );
139 /// ```
140 pub const UPPER_LOWER: Boundary = Boundary {
141 name: "UpperLower",
142 condition: |s, _| {
143 s.get(0).map(grapheme_is_uppercase) == Some(true)
144 && s.get(1).map(grapheme_is_lowercase) == Some(true)
145 },
146 arg: None,
147 start: 1,
148 len: 0,
149 };
150
151 /// Acronyms are identified by two uppercase letters followed by a lowercase letter.
152 /// The word boundary is between the two uppercase letters. For example, "HTTPRequest"
153 /// would have an acronym boundary identified at "PRe" and split into "HTTP" and "Request".
154 /// ```
155 /// # use convert_case::Boundary;
156 /// assert_eq!(
157 /// vec![Boundary::ACRONYM],
158 /// Boundary::defaults_from("AAa")
159 /// );
160 /// ```
161 pub const ACRONYM: Boundary = Boundary {
162 name: "Acronym",
163 condition: |s, _| {
164 s.get(0).map(grapheme_is_uppercase) == Some(true)
165 && s.get(1).map(grapheme_is_uppercase) == Some(true)
166 && s.get(2).map(grapheme_is_lowercase) == Some(true)
167 },
168 arg: None,
169 start: 1,
170 len: 0,
171 };
172
173 /// Splits where a lowercase letter is followed by a digit.
174 /// ```
175 /// # use convert_case::Boundary;
176 /// assert_eq!(
177 /// vec![Boundary::LOWER_DIGIT],
178 /// Boundary::defaults_from("a1")
179 /// );
180 /// ```
181 pub const LOWER_DIGIT: Boundary = Boundary {
182 name: "LowerDigit",
183 condition: |s, _| {
184 s.get(0).map(grapheme_is_lowercase) == Some(true)
185 && s.get(1).map(grapheme_is_digit) == Some(true)
186 },
187 arg: None,
188 start: 1,
189 len: 0,
190 };
191
192 /// Splits where an uppercase letter is followed by a digit.
193 /// ```
194 /// # use convert_case::Boundary;
195 /// assert_eq!(
196 /// vec![Boundary::UPPER_DIGIT],
197 /// Boundary::defaults_from("A1")
198 /// );
199 /// ```
200 pub const UPPER_DIGIT: Boundary = Boundary {
201 name: "UpperDigit",
202 condition: |s, _| {
203 s.get(0).map(grapheme_is_uppercase) == Some(true)
204 && s.get(1).map(grapheme_is_digit) == Some(true)
205 },
206 arg: None,
207 start: 1,
208 len: 0,
209 };
210
211 /// Splits where digit is followed by a lowercase letter.
212 /// ```
213 /// # use convert_case::Boundary;
214 /// assert_eq!(
215 /// vec![Boundary::DIGIT_LOWER],
216 /// Boundary::defaults_from("1a")
217 /// );
218 /// ```
219 pub const DIGIT_LOWER: Boundary = Boundary {
220 name: "DigitLower",
221 condition: |s, _| {
222 s.get(0).map(grapheme_is_digit) == Some(true)
223 && s.get(1).map(grapheme_is_lowercase) == Some(true)
224 },
225 arg: None,
226 start: 1,
227 len: 0,
228 };
229
230 /// Splits where digit is followed by an uppercase letter.
231 /// ```
232 /// # use convert_case::Boundary;
233 /// assert_eq!(
234 /// vec![Boundary::DIGIT_UPPER],
235 /// Boundary::defaults_from("1A")
236 /// );
237 /// ```
238 pub const DIGIT_UPPER: Boundary = Boundary {
239 name: "DigitUpper",
240 condition: |s, _| {
241 s.get(0).map(grapheme_is_digit) == Some(true)
242 && s.get(1).map(grapheme_is_uppercase) == Some(true)
243 },
244 arg: None,
245 start: 1,
246 len: 0,
247 };
248
249 /// Create a new boundary based on a delimiter.
250 /// ```
251 /// # use convert_case::{Case, Converter, Boundary};
252 /// let conv = Converter::new()
253 /// .set_boundaries(&[Boundary::from_delim("::")])
254 /// .to_case(Case::Camel);
255 /// assert_eq!(
256 /// "myVarName",
257 /// conv.convert("my::var::name")
258 /// )
259 /// ```
260 pub const fn from_delim(delim: &'static str) -> Boundary {
261 Boundary {
262 name: delim,
263 arg: Some(delim),
264 condition: |s, arg| s.join("").starts_with(arg.unwrap()),
265 start: 0,
266 len: delim.len(),
267 }
268 }
269
270 /// The default list of boundaries used when `Casing::to_case` is called directly
271 /// and in a `Converter` generated from `Converter::new()`.
272 /// ```
273 /// # use convert_case::Boundary;
274 /// assert_eq!(
275 /// [
276 /// Boundary::SPACE,
277 /// Boundary::HYPHEN,
278 /// Boundary::UNDERSCORE,
279 /// Boundary::LOWER_UPPER,
280 /// Boundary::ACRONYM,
281 /// Boundary::LOWER_DIGIT,
282 /// Boundary::UPPER_DIGIT,
283 /// Boundary::DIGIT_LOWER,
284 /// Boundary::DIGIT_UPPER,
285 /// ],
286 /// Boundary::defaults()
287 /// );
288 /// ```
289 pub const fn defaults() -> [Boundary; 9] {
290 [
291 Boundary::SPACE,
292 Boundary::HYPHEN,
293 Boundary::UNDERSCORE,
294 Boundary::LOWER_UPPER,
295 Boundary::ACRONYM,
296 Boundary::LOWER_DIGIT,
297 Boundary::UPPER_DIGIT,
298 Boundary::DIGIT_LOWER,
299 Boundary::DIGIT_UPPER,
300 ]
301 }
302
303 /// Returns the boundaries that involve digits.
304 /// `LowerDigit`.
305 /// ```
306 /// # use convert_case::Boundary;
307 /// assert_eq!(
308 /// [
309 /// Boundary::LOWER_DIGIT,
310 /// Boundary::UPPER_DIGIT,
311 /// Boundary::DIGIT_LOWER,
312 /// Boundary::DIGIT_UPPER,
313 /// ],
314 /// Boundary::digits()
315 /// );
316 /// ```
317 pub const fn digits() -> [Boundary; 4] {
318 [
319 Boundary::LOWER_DIGIT,
320 Boundary::UPPER_DIGIT,
321 Boundary::DIGIT_LOWER,
322 Boundary::DIGIT_UPPER,
323 ]
324 }
325
326 /// Returns the boundaries that are letters followed by digits.
327 /// ```
328 /// # use convert_case::Boundary;
329 /// assert_eq!(
330 /// [
331 /// Boundary::LOWER_DIGIT,
332 /// Boundary::UPPER_DIGIT,
333 /// ],
334 /// Boundary::letter_digit()
335 /// );
336 /// ```
337 pub const fn letter_digit() -> [Boundary; 2] {
338 [Boundary::LOWER_DIGIT, Boundary::UPPER_DIGIT]
339 }
340
341 /// Returns the boundaries that are digits followed by letters.
342 /// ```
343 /// # use convert_case::Boundary;
344 /// assert_eq!(
345 /// [
346 /// Boundary::DIGIT_LOWER,
347 /// Boundary::DIGIT_UPPER
348 /// ],
349 /// Boundary::digit_letter()
350 /// );
351 /// ```
352 pub fn digit_letter() -> [Boundary; 2] {
353 [Boundary::DIGIT_LOWER, Boundary::DIGIT_UPPER]
354 }
355
356 /// Returns a list of all boundaries that are identified within the given string.
357 /// Could be a short of writing out all the boundaries in a list directly. This will not
358 /// identify boundary `UpperLower` if it also used as part of `Acronym`.
359 ///
360 /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon
361 /// character.
362 /// ```
363 /// # use convert_case::Boundary;
364 /// assert_eq!(
365 /// vec![
366 /// Boundary::SPACE,
367 /// Boundary::HYPHEN,
368 /// Boundary::LOWER_UPPER,
369 /// Boundary::UPPER_DIGIT,
370 /// Boundary::DIGIT_LOWER,
371 /// ],
372 /// Boundary::defaults_from("aA8a -")
373 /// );
374 /// assert_eq!(
375 /// vec![
376 /// Boundary::UNDERSCORE,
377 /// Boundary::LOWER_UPPER,
378 /// Boundary::ACRONYM,
379 /// Boundary::DIGIT_UPPER,
380 /// ],
381 /// Boundary::defaults_from("bD:0B:_:AAa")
382 /// );
383 /// ```
384 pub fn defaults_from(pattern: &str) -> Vec<Boundary> {
385 let mut boundaries = Vec::new();
386 for boundary in Boundary::defaults() {
387 let parts = split(&pattern, &[boundary]);
388 if parts.len() > 1 || parts.len() == 0 || parts[0] != pattern {
389 boundaries.push(boundary);
390 }
391 }
392 boundaries
393 }
394}
395
396/// Split an identifier into a list of words using the list of boundaries.
397///
398/// This is used internally for splitting an identifier before mutating by
399/// a pattern and joining again with a delimiter.
400/// ```
401/// use convert_case::{Boundary, split};
402/// assert_eq!(
403/// vec!["one", "two", "three.four"],
404/// split(&"one_two-three.four", &[Boundary::UNDERSCORE, Boundary::HYPHEN]),
405/// )
406/// ```
407pub fn split<'s, T>(s: &'s T, boundaries: &[Boundary]) -> Vec<&'s str>
408where
409 T: AsRef<str>,
410{
411 let s = s.as_ref();
412
413 if s.len() == 0 {
414 return vec![];
415 }
416
417 let mut words = Vec::new();
418 let mut last_boundary_end = 0;
419
420 let (indices, graphemes): (Vec<_>, Vec<_>) = s.grapheme_indices(true).unzip();
421 let grapheme_length = indices[graphemes.len() - 1] + graphemes[graphemes.len() - 1].len();
422
423 for i in 0..graphemes.len() {
424 for boundary in boundaries {
425 //let byte_index = indices[i];
426
427 if (boundary.condition)(&graphemes[i..], boundary.arg) {
428 // What if we find a condition at the end of the array?
429 // Maybe we can stop early based on length
430 // To do this, need to switch the loops
431 // TODO
432 let boundary_byte_start: usize =
433 *indices.get(i + boundary.start).unwrap_or(&grapheme_length);
434 let boundary_byte_end: usize = *indices
435 .get(i + boundary.start + boundary.len)
436 .unwrap_or(&grapheme_length);
437
438 // todo clean this up a bit
439 words.push(&s[last_boundary_end..boundary_byte_start]);
440 last_boundary_end = boundary_byte_end;
441 break;
442 }
443 }
444 }
445 words.push(&s[last_boundary_end..]);
446 words.into_iter().filter(|s| !s.is_empty()).collect()
447}
448
449// ascii version
450//pub fn split<'s, T>(s: &'s T, boundaries: &[Boundary]) -> Vec<&'s str>
451//where
452// T: AsRef<str>,
453//{
454// let s = s.as_ref();
455//
456// let mut words = Vec::new();
457// let mut last_end = 0;
458// for i in 0..s.len() {
459// for boundary in boundaries {
460// if (boundary.condition)(&s[i..]) {
461// words.push(&s[last_end..i + boundary.start]);
462// last_end = i + boundary.start + boundary.len;
463// break;
464// }
465// }
466// }
467// words.push(&s[last_end..]);
468// words
469//}
470
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn hyphen() {
        let s = "a-b-c";
        let v = split(&s, &[Boundary::HYPHEN]);
        assert_eq!(v, vec!["a", "b", "c"]);
    }

    #[test]
    fn underscore() {
        let s = "a_b_c";
        let v = split(&s, &[Boundary::UNDERSCORE]);
        assert_eq!(v, vec!["a", "b", "c"]);
    }

    #[test]
    fn space() {
        let s = "a b c";
        let v = split(&s, &[Boundary::SPACE]);
        assert_eq!(v, vec!["a", "b", "c"]);
    }

    #[test]
    fn delimiters() {
        let s = "aaa-bbb_ccc ddd ddd-eee";
        let v = split(
            &s,
            &[Boundary::SPACE, Boundary::UNDERSCORE, Boundary::HYPHEN],
        );
        assert_eq!(v, vec!["aaa", "bbb", "ccc", "ddd", "ddd", "eee"]);
    }

    #[test]
    fn lower_upper() {
        let s = "lowerUpperUpper";
        let v = split(&s, &[Boundary::LOWER_UPPER]);
        assert_eq!(v, vec!["lower", "Upper", "Upper"]);
    }

    #[test]
    fn upper_lower() {
        // The split happens between the uppercase and the lowercase letter.
        let s = "AaBb";
        let v = split(&s, &[Boundary::UPPER_LOWER]);
        assert_eq!(v, vec!["A", "aB", "b"]);
    }

    #[test]
    fn acronym() {
        let s = "XMLRequest";
        let v = split(&s, &[Boundary::ACRONYM]);
        assert_eq!(v, vec!["XML", "Request"]);
    }

    #[test]
    fn lower_digit() {
        let s = "abc123def";
        let v = split(&s, &[Boundary::LOWER_DIGIT]);
        assert_eq!(v, vec!["abc", "123def"]);
    }

    #[test]
    fn upper_digit() {
        let s = "ABC123DEF";
        let v = split(&s, &[Boundary::UPPER_DIGIT]);
        assert_eq!(v, vec!["ABC", "123DEF"]);
    }

    #[test]
    fn digit_lower() {
        let s = "123abc456";
        let v = split(&s, &[Boundary::DIGIT_LOWER]);
        assert_eq!(v, vec!["123", "abc456"]);
    }

    #[test]
    fn digit_upper() {
        let s = "123ABC456";
        let v = split(&s, &[Boundary::DIGIT_UPPER]);
        assert_eq!(v, vec!["123", "ABC456"]);
    }

    #[test]
    fn empty_string_has_no_words() {
        let s = "";
        let v = split(&s, &Boundary::defaults());
        assert!(v.is_empty());
    }

    #[test]
    fn only_delimiters_has_no_words() {
        let s = "---";
        let v = split(&s, &[Boundary::HYPHEN]);
        assert!(v.is_empty());
    }

    #[test]
    fn no_boundary_present() {
        let s = "abc";
        let v = split(&s, &[Boundary::HYPHEN]);
        assert_eq!(v, vec!["abc"]);
    }

    #[test]
    fn boundaries_found_in_string() {
        // upper lower is no longer a default
        assert_eq!(Vec::<Boundary>::new(), Boundary::defaults_from(".Aaaa"));
        assert_eq!(
            vec![Boundary::LOWER_UPPER, Boundary::LOWER_DIGIT,],
            Boundary::defaults_from("a8.Aa.aA")
        );
        assert_eq!(
            Boundary::digits().to_vec(),
            Boundary::defaults_from("b1B1b")
        );
        assert_eq!(
            vec![
                Boundary::SPACE,
                Boundary::HYPHEN,
                Boundary::UNDERSCORE,
                Boundary::ACRONYM,
            ],
            Boundary::defaults_from("AAa -_")
        );
    }

    #[test]
    fn boundary_consts_same() {
        assert_eq!(Boundary::SPACE, Boundary::SPACE);
    }

    #[test]
    fn from_delim_dot() {
        let boundary = Boundary::from_delim(".");
        let s = "lower.Upper.Upper";
        let v = split(&s, &[boundary]);
        assert_eq!(vec!["lower", "Upper", "Upper"], v)
    }

    #[test]
    fn from_delim_double_colon() {
        let boundary = Boundary::from_delim("::");
        let s = "lower::lowerUpper::Upper";
        let v = split(&s, &[boundary]);
        assert_eq!(vec!["lower", "lowerUpper", "Upper"], v)
    }
}
565}