convert_case/
boundary.rs

1use unicode_segmentation::UnicodeSegmentation;
2
/// Reports whether the grapheme consists solely of ASCII digits (`0`-`9`).
///
/// Any non-ASCII character is encoded with bytes outside the ASCII digit range,
/// so checking the raw bytes is equivalent to checking each character.
fn grapheme_is_digit(c: &&str) -> bool {
    c.bytes().all(|byte| byte.is_ascii_digit())
}
6
/// Reports whether the grapheme is a cased letter written in its uppercase form.
///
/// A grapheme whose uppercase and lowercase mappings are identical (digits,
/// punctuation, caseless scripts) is never considered uppercase.
fn grapheme_is_uppercase(c: &&str) -> bool {
    let upper = c.to_uppercase();
    let lower = c.to_lowercase();
    let has_case = upper != lower;
    has_case && upper.as_str() == *c
}
10
/// Reports whether the grapheme is a cased letter written in its lowercase form.
///
/// A grapheme whose uppercase and lowercase mappings are identical (digits,
/// punctuation, caseless scripts) is never considered lowercase.
fn grapheme_is_lowercase(c: &&str) -> bool {
    let upper = c.to_uppercase();
    let lower = c.to_lowercase();
    let has_case = upper != lower;
    has_case && lower.as_str() == *c
}
14
/// How an identifier is split into words.
///
/// Some boundaries, `HYPHEN`, `UNDERSCORE`, and `SPACE`, consume the character they
/// split on, whereas the other boundaries do not.
///
/// `Boundary` includes methods that return useful groups of boundaries.  It also
/// contains the [`defaults_from`](Boundary::defaults_from) method which will generate a subset
/// of default boundaries based on the boundaries present in a string.
///
/// You can also create custom delimiter boundaries using the [`from_delim`](Boundary::from_delim)
/// method or directly instantiate `Boundary` for complex boundary conditions.
///
/// Two boundaries compare equal, and hash identically, exactly when their
/// `name` fields are equal; every other field is ignored.
///
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::defaults_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[derive(Debug, Eq, Clone, Copy)]
pub struct Boundary {
    /// A unique name used for comparison.
    pub name: &'static str,
    /// A function that determines if this boundary is present at the start
    /// of the string.  The first argument is the remaining graphemes of the
    /// string; the second argument is the `arg` field.
    pub condition: fn(&[&str], Option<&'static str>) -> bool,
    /// An optional string passed to `condition` at runtime.  Used
    /// internally for [`Boundary::from_delim`] method.
    pub arg: Option<&'static str>,
    /// Where the beginning of the boundary is, as a grapheme offset from the
    /// position at which `condition` matched.
    pub start: usize,
    /// The length of the boundary.  This is the number of graphemes that
    /// are removed when splitting.
    pub len: usize,
}

/// Boundaries are identified by `name` alone, so two boundaries with the same
/// name are equal even if their other fields differ.
impl PartialEq for Boundary {
    fn eq(&self, other: &Self) -> bool {
        self.name == other.name
    }
}

/// Hashing must agree with `PartialEq`: values that compare equal have to
/// produce the same hash.  Since equality only looks at `name`, only `name`
/// is fed to the hasher (a derived `Hash` would include every field and break
/// that contract for hash-based collections).
impl std::hash::Hash for Boundary {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(self.name, state);
    }
}
64
65impl Boundary {
66    /// Splits on space, consuming the character on segmentation.
67    /// ```
68    /// # use convert_case::Boundary;
69    /// assert_eq!(
70    ///     vec![Boundary::SPACE],
71    ///     Boundary::defaults_from(" ")
72    /// );
73    /// ```
74    pub const SPACE: Boundary = Boundary {
75        name: "Space",
76        condition: |s, _| s.get(0) == Some(&" "),
77        arg: None,
78        start: 0,
79        len: 1,
80    };
81
82    /// Splits on `-`, consuming the character on segmentation.
83    /// ```
84    /// # use convert_case::Boundary;
85    /// assert_eq!(
86    ///     vec![Boundary::HYPHEN],
87    ///     Boundary::defaults_from("-")
88    /// );
89    /// ```
90    pub const HYPHEN: Boundary = Boundary {
91        name: "Hyphen",
92        condition: |s, _| s.get(0) == Some(&"-"),
93        arg: None,
94        start: 0,
95        len: 1,
96    };
97
98    /// Splits on `_`, consuming the character on segmentation.
99    /// ```
100    /// # use convert_case::Boundary;
101    /// assert_eq!(
102    ///     vec![Boundary::UNDERSCORE],
103    ///     Boundary::defaults_from("_")
104    /// );
105    /// ```
106    pub const UNDERSCORE: Boundary = Boundary {
107        name: "Underscore",
108        condition: |s, _| s.get(0) == Some(&"_"),
109        arg: None,
110        start: 0,
111        len: 1,
112    };
113
114    /// Splits where a lowercase letter is followed by an uppercase letter.
115    /// ```
116    /// # use convert_case::Boundary;
117    /// assert_eq!(
118    ///     vec![Boundary::LOWER_UPPER],
119    ///     Boundary::defaults_from("aA")
120    /// );
121    /// ```
122    pub const LOWER_UPPER: Boundary = Boundary {
123        name: "LowerUpper",
124        condition: |s, _| {
125            s.get(0).map(grapheme_is_lowercase) == Some(true)
126                && s.get(1).map(grapheme_is_uppercase) == Some(true)
127        },
128        arg: None,
129        start: 1,
130        len: 0,
131    };
132    /// Splits where an uppercase letter is followed by a lowercase letter.  This is seldom used,
133    /// and is **not** included in the [defaults](Boundary::defaults).
134    /// ```
135    /// # use convert_case::Boundary;
136    /// assert!(
137    ///     Boundary::defaults_from("Aa").len() == 0
138    /// );
139    /// ```
140    pub const UPPER_LOWER: Boundary = Boundary {
141        name: "UpperLower",
142        condition: |s, _| {
143            s.get(0).map(grapheme_is_uppercase) == Some(true)
144                && s.get(1).map(grapheme_is_lowercase) == Some(true)
145        },
146        arg: None,
147        start: 1,
148        len: 0,
149    };
150
151    /// Acronyms are identified by two uppercase letters followed by a lowercase letter.
152    /// The word boundary is between the two uppercase letters.  For example, "HTTPRequest"
153    /// would have an acronym boundary identified at "PRe" and split into "HTTP" and "Request".
154    /// ```
155    /// # use convert_case::Boundary;
156    /// assert_eq!(
157    ///     vec![Boundary::ACRONYM],
158    ///     Boundary::defaults_from("AAa")
159    /// );
160    /// ```
161    pub const ACRONYM: Boundary = Boundary {
162        name: "Acronym",
163        condition: |s, _| {
164            s.get(0).map(grapheme_is_uppercase) == Some(true)
165                && s.get(1).map(grapheme_is_uppercase) == Some(true)
166                && s.get(2).map(grapheme_is_lowercase) == Some(true)
167        },
168        arg: None,
169        start: 1,
170        len: 0,
171    };
172
173    /// Splits where a lowercase letter is followed by a digit.
174    /// ```
175    /// # use convert_case::Boundary;
176    /// assert_eq!(
177    ///     vec![Boundary::LOWER_DIGIT],
178    ///     Boundary::defaults_from("a1")
179    /// );
180    /// ```
181    pub const LOWER_DIGIT: Boundary = Boundary {
182        name: "LowerDigit",
183        condition: |s, _| {
184            s.get(0).map(grapheme_is_lowercase) == Some(true)
185                && s.get(1).map(grapheme_is_digit) == Some(true)
186        },
187        arg: None,
188        start: 1,
189        len: 0,
190    };
191
192    /// Splits where an uppercase letter is followed by a digit.
193    /// ```
194    /// # use convert_case::Boundary;
195    /// assert_eq!(
196    ///     vec![Boundary::UPPER_DIGIT],
197    ///     Boundary::defaults_from("A1")
198    /// );
199    /// ```
200    pub const UPPER_DIGIT: Boundary = Boundary {
201        name: "UpperDigit",
202        condition: |s, _| {
203            s.get(0).map(grapheme_is_uppercase) == Some(true)
204                && s.get(1).map(grapheme_is_digit) == Some(true)
205        },
206        arg: None,
207        start: 1,
208        len: 0,
209    };
210
211    /// Splits where digit is followed by a lowercase letter.
212    /// ```
213    /// # use convert_case::Boundary;
214    /// assert_eq!(
215    ///     vec![Boundary::DIGIT_LOWER],
216    ///     Boundary::defaults_from("1a")
217    /// );
218    /// ```
219    pub const DIGIT_LOWER: Boundary = Boundary {
220        name: "DigitLower",
221        condition: |s, _| {
222            s.get(0).map(grapheme_is_digit) == Some(true)
223                && s.get(1).map(grapheme_is_lowercase) == Some(true)
224        },
225        arg: None,
226        start: 1,
227        len: 0,
228    };
229
230    /// Splits where digit is followed by an uppercase letter.
231    /// ```
232    /// # use convert_case::Boundary;
233    /// assert_eq!(
234    ///     vec![Boundary::DIGIT_UPPER],
235    ///     Boundary::defaults_from("1A")
236    /// );
237    /// ```
238    pub const DIGIT_UPPER: Boundary = Boundary {
239        name: "DigitUpper",
240        condition: |s, _| {
241            s.get(0).map(grapheme_is_digit) == Some(true)
242                && s.get(1).map(grapheme_is_uppercase) == Some(true)
243        },
244        arg: None,
245        start: 1,
246        len: 0,
247    };
248
249    /// Create a new boundary based on a delimiter.
250    /// ```
251    /// # use convert_case::{Case, Converter, Boundary};
252    /// let conv = Converter::new()
253    ///     .set_boundaries(&[Boundary::from_delim("::")])
254    ///     .to_case(Case::Camel);
255    /// assert_eq!(
256    ///     "myVarName",
257    ///     conv.convert("my::var::name")
258    /// )
259    /// ```
260    pub const fn from_delim(delim: &'static str) -> Boundary {
261        Boundary {
262            name: delim,
263            arg: Some(delim),
264            condition: |s, arg| s.join("").starts_with(arg.unwrap()),
265            start: 0,
266            len: delim.len(),
267        }
268    }
269
270    /// The default list of boundaries used when `Casing::to_case` is called directly
271    /// and in a `Converter` generated from `Converter::new()`.
272    /// ```
273    /// # use convert_case::Boundary;
274    /// assert_eq!(
275    ///     [
276    ///         Boundary::SPACE,
277    ///         Boundary::HYPHEN,
278    ///         Boundary::UNDERSCORE,
279    ///         Boundary::LOWER_UPPER,
280    ///         Boundary::ACRONYM,
281    ///         Boundary::LOWER_DIGIT,
282    ///         Boundary::UPPER_DIGIT,
283    ///         Boundary::DIGIT_LOWER,
284    ///         Boundary::DIGIT_UPPER,
285    ///     ],
286    ///     Boundary::defaults()
287    /// );
288    /// ```
289    pub const fn defaults() -> [Boundary; 9] {
290        [
291            Boundary::SPACE,
292            Boundary::HYPHEN,
293            Boundary::UNDERSCORE,
294            Boundary::LOWER_UPPER,
295            Boundary::ACRONYM,
296            Boundary::LOWER_DIGIT,
297            Boundary::UPPER_DIGIT,
298            Boundary::DIGIT_LOWER,
299            Boundary::DIGIT_UPPER,
300        ]
301    }
302
303    /// Returns the boundaries that involve digits.
304    /// `LowerDigit`.
305    /// ```
306    /// # use convert_case::Boundary;
307    /// assert_eq!(
308    ///     [
309    ///         Boundary::LOWER_DIGIT,
310    ///         Boundary::UPPER_DIGIT,
311    ///         Boundary::DIGIT_LOWER,
312    ///         Boundary::DIGIT_UPPER,
313    ///     ],
314    ///     Boundary::digits()
315    /// );
316    /// ```
317    pub const fn digits() -> [Boundary; 4] {
318        [
319            Boundary::LOWER_DIGIT,
320            Boundary::UPPER_DIGIT,
321            Boundary::DIGIT_LOWER,
322            Boundary::DIGIT_UPPER,
323        ]
324    }
325
326    /// Returns the boundaries that are letters followed by digits.
327    /// ```
328    /// # use convert_case::Boundary;
329    /// assert_eq!(
330    ///     [
331    ///         Boundary::LOWER_DIGIT,
332    ///         Boundary::UPPER_DIGIT,
333    ///     ],
334    ///     Boundary::letter_digit()
335    /// );
336    /// ```
337    pub const fn letter_digit() -> [Boundary; 2] {
338        [Boundary::LOWER_DIGIT, Boundary::UPPER_DIGIT]
339    }
340
341    /// Returns the boundaries that are digits followed by letters.
342    /// ```
343    /// # use convert_case::Boundary;
344    /// assert_eq!(
345    ///     [
346    ///         Boundary::DIGIT_LOWER,
347    ///         Boundary::DIGIT_UPPER
348    ///     ],
349    ///     Boundary::digit_letter()
350    /// );
351    /// ```
352    pub fn digit_letter() -> [Boundary; 2] {
353        [Boundary::DIGIT_LOWER, Boundary::DIGIT_UPPER]
354    }
355
356    /// Returns a list of all boundaries that are identified within the given string.
357    /// Could be a short of writing out all the boundaries in a list directly.  This will not
358    /// identify boundary `UpperLower` if it also used as part of `Acronym`.
359    ///
360    /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon
361    /// character.
362    /// ```
363    /// # use convert_case::Boundary;
364    /// assert_eq!(
365    ///     vec![
366    ///         Boundary::SPACE,
367    ///         Boundary::HYPHEN,
368    ///         Boundary::LOWER_UPPER,
369    ///         Boundary::UPPER_DIGIT,
370    ///         Boundary::DIGIT_LOWER,
371    ///     ],
372    ///     Boundary::defaults_from("aA8a -")
373    /// );
374    /// assert_eq!(
375    ///     vec![
376    ///         Boundary::UNDERSCORE,
377    ///         Boundary::LOWER_UPPER,
378    ///         Boundary::ACRONYM,
379    ///         Boundary::DIGIT_UPPER,
380    ///     ],
381    ///     Boundary::defaults_from("bD:0B:_:AAa")
382    /// );
383    /// ```
384    pub fn defaults_from(pattern: &str) -> Vec<Boundary> {
385        let mut boundaries = Vec::new();
386        for boundary in Boundary::defaults() {
387            let parts = split(&pattern, &[boundary]);
388            if parts.len() > 1 || parts.len() == 0 || parts[0] != pattern {
389                boundaries.push(boundary);
390            }
391        }
392        boundaries
393    }
394}
395
396/// Split an identifier into a list of words using the list of boundaries.
397///
398/// This is used internally for splitting an identifier before mutating by
399/// a pattern and joining again with a delimiter.
400/// ```
401/// use convert_case::{Boundary, split};
402/// assert_eq!(
403///     vec!["one", "two", "three.four"],
404///     split(&"one_two-three.four", &[Boundary::UNDERSCORE, Boundary::HYPHEN]),
405/// )
406/// ```
407pub fn split<'s, T>(s: &'s T, boundaries: &[Boundary]) -> Vec<&'s str>
408where
409    T: AsRef<str>,
410{
411    let s = s.as_ref();
412
413    if s.len() == 0 {
414        return vec![];
415    }
416
417    let mut words = Vec::new();
418    let mut last_boundary_end = 0;
419
420    let (indices, graphemes): (Vec<_>, Vec<_>) = s.grapheme_indices(true).unzip();
421    let grapheme_length = indices[graphemes.len() - 1] + graphemes[graphemes.len() - 1].len();
422
423    for i in 0..graphemes.len() {
424        for boundary in boundaries {
425            //let byte_index = indices[i];
426
427            if (boundary.condition)(&graphemes[i..], boundary.arg) {
428                // What if we find a condition at the end of the array?
429                // Maybe we can stop early based on length
430                // To do this, need to switch the loops
431                // TODO
432                let boundary_byte_start: usize =
433                    *indices.get(i + boundary.start).unwrap_or(&grapheme_length);
434                let boundary_byte_end: usize = *indices
435                    .get(i + boundary.start + boundary.len)
436                    .unwrap_or(&grapheme_length);
437
438                // todo clean this up a bit
439                words.push(&s[last_boundary_end..boundary_byte_start]);
440                last_boundary_end = boundary_byte_end;
441                break;
442            }
443        }
444    }
445    words.push(&s[last_boundary_end..]);
446    words.into_iter().filter(|s| !s.is_empty()).collect()
447}
448
449// ascii version
450//pub fn split<'s, T>(s: &'s T, boundaries: &[Boundary]) -> Vec<&'s str>
451//where
452//    T: AsRef<str>,
453//{
454//    let s = s.as_ref();
455//
456//    let mut words = Vec::new();
457//    let mut last_end = 0;
458//    for i in 0..s.len() {
459//        for boundary in boundaries {
460//            if (boundary.condition)(&s[i..]) {
461//                words.push(&s[last_end..i + boundary.start]);
462//                last_end = i + boundary.start + boundary.len;
463//                break;
464//            }
465//        }
466//    }
467//    words.push(&s[last_end..]);
468//    words
469//}
470
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn hyphen() {
        let s = "a-b-c";
        let v = split(&s, &[Boundary::HYPHEN]);
        assert_eq!(v, vec!["a", "b", "c"]);
    }

    #[test]
    fn underscore() {
        let s = "a_b_c";
        let v = split(&s, &[Boundary::UNDERSCORE]);
        assert_eq!(v, vec!["a", "b", "c"]);
    }

    #[test]
    fn space() {
        let s = "a b c";
        let v = split(&s, &[Boundary::SPACE]);
        assert_eq!(v, vec!["a", "b", "c"]);
    }

    #[test]
    fn delimiters() {
        let s = "aaa-bbb_ccc ddd ddd-eee";
        let v = split(
            &s,
            &[Boundary::SPACE, Boundary::UNDERSCORE, Boundary::HYPHEN],
        );
        assert_eq!(v, vec!["aaa", "bbb", "ccc", "ddd", "ddd", "eee"]);
    }

    // Adjacent delimiters produce empty words, which `split` filters out.
    #[test]
    fn consecutive_delimiters() {
        let s = "a--b";
        let v = split(&s, &[Boundary::HYPHEN]);
        assert_eq!(v, vec!["a", "b"]);
    }

    #[test]
    fn empty_input() {
        let s = "";
        let v = split(&s, &Boundary::defaults());
        assert!(v.is_empty());
    }

    #[test]
    fn lower_upper() {
        let s = "lowerUpperUpper";
        let v = split(&s, &[Boundary::LOWER_UPPER]);
        assert_eq!(v, vec!["lower", "Upper", "Upper"]);
    }

    #[test]
    fn upper_lower() {
        let s = "UPPERlower";
        let v = split(&s, &[Boundary::UPPER_LOWER]);
        assert_eq!(v, vec!["UPPER", "lower"]);
    }

    #[test]
    fn acronym() {
        let s = "XMLRequest";
        let v = split(&s, &[Boundary::ACRONYM]);
        assert_eq!(v, vec!["XML", "Request"]);
    }

    #[test]
    fn lower_digit() {
        let s = "abc123def";
        let v = split(&s, &[Boundary::LOWER_DIGIT]);
        assert_eq!(v, vec!["abc", "123def"]);
    }

    #[test]
    fn upper_digit() {
        let s = "ABC123DEF";
        let v = split(&s, &[Boundary::UPPER_DIGIT]);
        assert_eq!(v, vec!["ABC", "123DEF"]);
    }

    #[test]
    fn digit_lower() {
        let s = "123abc456";
        let v = split(&s, &[Boundary::DIGIT_LOWER]);
        assert_eq!(v, vec!["123", "abc456"]);
    }

    #[test]
    fn digit_upper() {
        let s = "123ABC456";
        let v = split(&s, &[Boundary::DIGIT_UPPER]);
        assert_eq!(v, vec!["123", "ABC456"]);
    }

    #[test]
    fn boundaries_found_in_string() {
        // upper lower is no longer a default
        assert_eq!(Vec::<Boundary>::new(), Boundary::defaults_from(".Aaaa"));
        assert_eq!(
            vec![Boundary::LOWER_UPPER, Boundary::LOWER_DIGIT,],
            Boundary::defaults_from("a8.Aa.aA")
        );
        assert_eq!(
            Boundary::digits().to_vec(),
            Boundary::defaults_from("b1B1b")
        );
        assert_eq!(
            vec![
                Boundary::SPACE,
                Boundary::HYPHEN,
                Boundary::UNDERSCORE,
                Boundary::ACRONYM,
            ],
            Boundary::defaults_from("AAa -_")
        );
    }

    #[test]
    fn boundary_consts_same() {
        assert_eq!(Boundary::SPACE, Boundary::SPACE);
    }

    #[test]
    fn from_delim_dot() {
        let boundary = Boundary::from_delim(".");
        let s = "lower.Upper.Upper";
        let v = split(&s, &[boundary]);
        assert_eq!(vec!["lower", "Upper", "Upper"], v)
    }

    #[test]
    fn from_delim_double_colon() {
        let boundary = Boundary::from_delim("::");
        let s = "lower::lowerUpper::Upper";
        let v = split(&s, &[boundary]);
        assert_eq!(vec!["lower", "lowerUpper", "Upper"], v)
    }
}
563        assert_eq!(vec!["lower", "lowerUpper", "Upper"], v)
564    }
565}