unic_ucd_segment/
grapheme_cluster_break.rs

// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Unicode `Grapheme_Cluster_Break` Character Property.
//!
//! ## References
//!
//! * <https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break>
//! * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
//! * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values>

19use unic_char_property::TotalCharProperty;
20
21char_property! {
22    /// Represents the Unicode character
23    /// [`Grapheme_Cluster_Break`](https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break)
24    /// property.
25    ///
26    /// ## References
27    ///
28    /// * <https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break>
29    /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
30    /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values>
31    pub enum GraphemeClusterBreak {
32        abbr => "GCB";
33        long => "Grapheme_Cluster_Break";
34        human => "Grapheme Cluster Break";
35
36        /// ```text
37        /// U+000D CARRIAGE RETURN (CR)
38        /// ```
39        CR {
40            abbr => CR,
41            long => CR,
42            human => "Carriage Return",
43        }
44
45        /// ```text
46        /// U+000A LINE FEED (LF)
47        /// ```
48        LF {
49            abbr => LF,
50            long => LF,
51            human => "Line Feed",
52        }
53
54        /// ```text
55        /// General_Category = Line_Separator, or
56        /// General_Category = Paragraph_Separator, or
57        /// General_Category = Control, or
58        /// General_Category = Unassigned and Default_Ignorable_Code_Point, or
59        /// General_Category = Surrogate, or
60        /// General_Category = Format
61        /// and not U+000D CARRIAGE RETURN
62        /// and not U+000A LINE FEED
63        /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
64        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
65        /// ```
66        Control {
67            abbr => CN,
68            long => Control,
69            human => "Control",
70        }
71
72        /// ```text
73        /// Grapheme_Extend = Yes
74        ///
75        /// This includes:
76        /// General_Category = Nonspacing_Mark
77        /// General_Category = Enclosing_Mark
78        /// U+200C ZERO WIDTH NON-JOINER
79        /// plus a few General_Category = Spacing_Mark needed for canonical equivalence.
80        /// ```
81        Extend {
82            abbr => EX,
83            long => Extend,
84            human => "Extend",
85        }
86
87        /// ```text
88        /// U+200D ZERO WIDTH JOINER
89        /// ```
90        ZWJ {
91            abbr => ZWJ,
92            long => ZWJ,
93            human => "Zero Width Joiner (ZWJ)",
94        }
95
96        /// ```text
97        /// Regional_Indicator = Yes
98        /// ```
99        ///
100        /// This consists of the range:
101        ///
102        /// ```text
103        /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A
104        /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z
105        /// ```
106        RegionalIndicator {
107            abbr => RI,
108            long => Regional_Indicator,
109            human => "Regional Indicator",
110        }
111
112        /// ```text
113        /// Indic_Syllabic_Category = Consonant_Preceding_Repha, or
114        /// Indic_Syllabic_Category = Consonant_Prefixed, or
115        /// Prepended_Concatenation_Mark = Yes
116        /// ```
117        Prepend {
118            abbr => PP,
119            long => Prepend,
120            human => "Prepend",
121        }
122
123        /// ```text
124        /// Grapheme_Cluster_Break ≠ Extend, and
125        /// General_Category = Spacing_Mark, or
126        /// any of the following (which have General_Category = Other_Letter):
127        /// U+0E33 ( ำ ) THAI CHARACTER SARA AM
128        /// U+0EB3 ( ຳ ) LAO VOWEL SIGN AM
129        /// ```
130        ///
131        /// Exceptions: The following (which have General_Category = Spacing_Mark and would
132        /// otherwise be included) are specifically excluded:
133        ///
134        /// ```text
135        /// U+102B ( ါ ) MYANMAR VOWEL SIGN TALL AA
136        /// U+102C ( ာ ) MYANMAR VOWEL SIGN AA
137        /// U+1038 ( း ) MYANMAR SIGN VISARGA
138        /// U+1062 ( ၢ ) MYANMAR VOWEL SIGN SGAW KAREN EU
139        /// ..U+1064 ( ၤ ) MYANMAR TONE MARK SGAW KAREN KE PHO
140        /// U+1067 ( ၧ ) MYANMAR VOWEL SIGN WESTERN PWO KAREN EU
141        /// ..U+106D ( ၭ ) MYANMAR SIGN WESTERN PWO KAREN TONE-5
142        /// U+1083 ( ႃ ) MYANMAR VOWEL SIGN SHAN AA
143        /// U+1087 ( ႇ ) MYANMAR SIGN SHAN TONE-2
144        /// ..U+108C ( ႌ ) MYANMAR SIGN SHAN COUNCIL TONE-3
145        /// U+108F ( ႏ ) MYANMAR SIGN RUMAI PALAUNG TONE-5
146        /// U+109A ( ႚ ) MYANMAR SIGN KHAMTI TONE-1
147        /// ..U+109C ( ႜ ) MYANMAR VOWEL SIGN AITON A
148        /// U+1A61 ( ᩡ ) TAI THAM VOWEL SIGN A
149        /// U+1A63 ( ᩣ ) TAI THAM VOWEL SIGN AA
150        /// U+1A64 ( ᩤ ) TAI THAM VOWEL SIGN TALL AA
151        /// U+AA7B ( ꩻ ) MYANMAR SIGN PAO KAREN TONE
152        /// U+AA7D ( ꩽ ) MYANMAR SIGN TAI LAING TONE-5
153        /// U+11720 ( 𑜠 ) AHOM VOWEL SIGN A
154        /// U+11721 ( 𑜡 ) AHOM VOWEL SIGN AA
155        /// ```
156        SpacingMark {
157            abbr => SM,
158            long => SpacingMark,
159            human => "Spacing Mark",
160        }
161
162        // Hangul
163
164        /// ```text
165        /// Hangul_Syllable_Type=L
166        /// ```
167        ///
168        /// Such as:
169        ///
170        /// ```text
171        /// U+1100 ( ᄀ ) HANGUL CHOSEONG KIYEOK
172        /// U+115F ( ᅟ ) HANGUL CHOSEONG FILLER
173        /// U+A960 ( ꥠ ) HANGUL CHOSEONG TIKEUT-MIEUM
174        /// U+A97C ( ꥼ ) HANGUL CHOSEONG SSANGYEORINHIEUH
175        /// ```
176        L {
177            abbr => L,
178            long => L,
179            human => "Hangul Syllable Type L",
180        }
181
182        /// ```text
183        /// Hangul_Syllable_Type=V
184        /// ```
185        ///
186        /// Such as:
187        ///
188        /// ```text
189        /// U+1160 ( ᅠ ) HANGUL JUNGSEONG FILLER
190        /// U+11A2 ( ᆢ ) HANGUL JUNGSEONG SSANGARAEA
191        /// U+D7B0 ( ힰ ) HANGUL JUNGSEONG O-YEO
192        /// U+D7C6 ( ퟆ ) HANGUL JUNGSEONG ARAEA-E
193        /// ```
194        V {
195            abbr => V,
196            long => V,
197            human => "Hangul Syllable Type V",
198        }
199
200        /// ```text
201        /// Hangul_Syllable_Type=T
202        /// ```
203        ///
204        /// Such as:
205        ///
206        /// ```text
207        /// U+11A8 ( ᆨ ) HANGUL JONGSEONG KIYEOK
208        /// U+11F9 ( ᇹ ) HANGUL JONGSEONG YEORINHIEUH
209        /// U+D7CB ( ퟋ ) HANGUL JONGSEONG NIEUN-RIEUL
210        /// U+D7FB ( ퟻ ) HANGUL JONGSEONG PHIEUPH-THIEUTH
211        /// ```
212        T {
213            abbr => T,
214            long => T,
215            human => "Hangul Syllable Type T",
216        }
217
218        /// ```text
219        /// Hangul_Syllable_Type=LV:
220        /// ```
221        ///
222        /// That is:
223        ///
224        /// ```text
225        /// U+AC00 ( 가 ) HANGUL SYLLABLE GA
226        /// U+AC1C ( 개 ) HANGUL SYLLABLE GAE
227        /// U+AC38 ( 갸 ) HANGUL SYLLABLE GYA
228        /// ...
229        /// ```
230        LV {
231            abbr => LV,
232            long => LV,
233            human => "Hangul Syllable Type LV",
234        }
235
236        /// ```text
237        /// Hangul_Syllable_Type=LVT
238        /// ```
239        ///
240        /// That is:
241        ///
242        /// ```text
243        /// U+AC01 ( 각 ) HANGUL SYLLABLE GAG
244        /// U+AC02 ( 갂 ) HANGUL SYLLABLE GAGG
245        /// U+AC03 ( 갃 ) HANGUL SYLLABLE GAGS
246        /// U+AC04 ( 간 ) HANGUL SYLLABLE GAN
247        /// ...
248        /// ```
249        LVT {
250            abbr => LVT,
251            long => LVT,
252            human => "Hangul Syllable Type LVT",
253        }
254
255        // Emoji
256
257        /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not
258        /// occur after ZWJ in `emoji-zwj-sequences.txt`.
259        ///
260        /// See <https://www.unicode.org/reports/tr51/>.
261        EBase {
262            abbr => EB,
263            long => E_Base,
264            human => "Emoji Base",
265        }
266
267        /// Emoji characters listed as `Emoji_Modifer=Yes` in `emoji-data.txt`.
268        ///
269        /// See <https://www.unicode.org/reports/tr51/>.
270        EModifier {
271            abbr => EM,
272            long => E_Modifier,
273            human => "Emoji Modifier",
274        }
275
276        /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence,
277        /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`.
278        ///
279        /// See <https://www.unicode.org/reports/tr51/>.
280        GlueAfterZwj {
281            abbr => GAZ,
282            long => Glue_After_Zwj,
283            human => "Glue After ZWJ",
284        }
285
286        /// Emoji characters listed as `Emoji_Modifer_Base=Yes` in `emoji_data.txt`, and also occur
287        /// after ZWJ in `emoji-zwj-sequences.txt`.
288        ///
289        /// See <https://www.unicode.org/reports/tr51/>.
290        EBaseGAZ {
291            abbr => EBG,
292            long => E_Base_GAZ,
293            human => "Emoji Base and Glue After ZWJ",
294        }
295
296        /// All other characters
297        Other {
298            abbr => XX,
299            long => Other,
300            human => "Other",
301        }
302    }
303
304    /// Abbreviated name aliases for the
305    /// [`Grapheme_Cluster_Break`](https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break)
306    /// property.
307    ///
308    /// ## See Also
309    ///
310    /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
311    pub mod abbr_names for abbr;
312
313    /// Long name aliases for the
314    /// [`Grapheme_Cluster_Break`](https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break)
315    /// property.
316    ///
317    /// ## See Also
318    ///
319    /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
320    pub mod long_names for long;
321}
322
323impl TotalCharProperty for GraphemeClusterBreak {
324    fn of(ch: char) -> Self {
325        Self::of(ch)
326    }
327}
328
329impl Default for GraphemeClusterBreak {
330    fn default() -> Self {
331        GraphemeClusterBreak::Other
332    }
333}
334
335mod data {
336    use super::long_names as GCB;
337    use unic_char_property::tables::CharDataTable;
338    pub const GRAPHEME_CLUSTER_BREAK_TABLE: CharDataTable<super::GraphemeClusterBreak> =
339        include!("../tables/grapheme_cluster_break.rsv");
340}
341
342impl GraphemeClusterBreak {
343    /// Find the character `Grapheme_Cluster_Break` property value.
344    pub fn of(ch: char) -> GraphemeClusterBreak {
345        data::GRAPHEME_CLUSTER_BREAK_TABLE.find_or_default(ch)
346    }
347}
348
/// Spot checks of `GraphemeClusterBreak::of` across the code space, plus the
/// name accessors provided through `EnumeratedCharProperty`.
#[cfg(test)]
mod tests {
    use super::GraphemeClusterBreak as GCB;
    use unic_char_property::EnumeratedCharProperty;

    #[test]
    fn test_ascii() {
        assert_eq!(GCB::of('\u{0000}'), GCB::Control);
        // CR and LF have dedicated values and are excluded from `Control`.
        assert_eq!(GCB::of('\u{000A}'), GCB::LF);
        assert_eq!(GCB::of('\u{000D}'), GCB::CR);
        assert_eq!(GCB::of('\u{0040}'), GCB::Other);
        assert_eq!(GCB::of('\u{0041}'), GCB::Other);
        assert_eq!(GCB::of('\u{0062}'), GCB::Other);
        assert_eq!(GCB::of('\u{007F}'), GCB::Control);
    }

    #[test]
    fn test_bmp() {
        // Hebrew
        assert_eq!(GCB::of('\u{0590}'), GCB::Other);
        assert_eq!(GCB::of('\u{05D0}'), GCB::Other);
        assert_eq!(GCB::of('\u{05D1}'), GCB::Other);
        assert_eq!(GCB::of('\u{05FF}'), GCB::Other);

        // Arabic
        assert_eq!(GCB::of('\u{0600}'), GCB::Prepend);
        assert_eq!(GCB::of('\u{0627}'), GCB::Other);
        assert_eq!(GCB::of('\u{07BF}'), GCB::Other);

        // U+07C0..U+08FF (code points following the Arabic samples above)
        assert_eq!(GCB::of('\u{07C0}'), GCB::Other);
        assert_eq!(GCB::of('\u{085F}'), GCB::Other);
        assert_eq!(GCB::of('\u{0860}'), GCB::Other);
        assert_eq!(GCB::of('\u{0870}'), GCB::Other);
        assert_eq!(GCB::of('\u{089F}'), GCB::Other);
        assert_eq!(GCB::of('\u{08A0}'), GCB::Other);
        assert_eq!(GCB::of('\u{08FF}'), GCB::Extend);

        // U+20A0..U+20CF
        assert_eq!(GCB::of('\u{20A0}'), GCB::Other);
        assert_eq!(GCB::of('\u{20CF}'), GCB::Other);

        // Arabic Presentation Forms
        assert_eq!(GCB::of('\u{FB1D}'), GCB::Other);
        assert_eq!(GCB::of('\u{FB4F}'), GCB::Other);
        assert_eq!(GCB::of('\u{FB50}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDCF}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDF0}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{FE70}'), GCB::Other);
        assert_eq!(GCB::of('\u{FEFE}'), GCB::Other);
        assert_eq!(GCB::of('\u{FEFF}'), GCB::Control);

        // noncharacters
        assert_eq!(GCB::of('\u{FDD0}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDD1}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDEE}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDEF}'), GCB::Other);
        assert_eq!(GCB::of('\u{FFFE}'), GCB::Other);
        assert_eq!(GCB::of('\u{FFFF}'), GCB::Other);
    }

    #[test]
    fn test_smp() {
        // Sample range boundaries in the Supplementary Multilingual Plane
        assert_eq!(GCB::of('\u{10800}'), GCB::Other);
        assert_eq!(GCB::of('\u{10FFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{1E800}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EDFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EE00}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EEFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EF00}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EFFF}'), GCB::Other);
    }

    #[test]
    fn test_unassigned_planes() {
        // First code point of each of planes 3 through 10
        assert_eq!(GCB::of('\u{30000}'), GCB::Other);
        assert_eq!(GCB::of('\u{40000}'), GCB::Other);
        assert_eq!(GCB::of('\u{50000}'), GCB::Other);
        assert_eq!(GCB::of('\u{60000}'), GCB::Other);
        assert_eq!(GCB::of('\u{70000}'), GCB::Other);
        assert_eq!(GCB::of('\u{80000}'), GCB::Other);
        assert_eq!(GCB::of('\u{90000}'), GCB::Other);
        assert_eq!(GCB::of('\u{a0000}'), GCB::Other);
    }

    #[test]
    fn test_abbr_name() {
        assert_eq!(GCB::CR.abbr_name(), "CR");
    }

    #[test]
    fn test_long_name() {
        assert_eq!(GCB::CR.long_name(), "CR");
    }

    #[test]
    fn test_human_name() {
        assert_eq!(GCB::CR.human_name(), "Carriage Return");
    }
}