// unic_ucd_segment/grapheme_cluster_break.rs

// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Unicode `Grapheme_Cluster_Break` Character Property.
//!
//! ## References
//!
//! * <https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break>
//! * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
//! * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values>

use unic_char_property::TotalCharProperty;

21char_property! {
22 /// Represents the Unicode character
23 /// [`Grapheme_Cluster_Break`](https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break)
24 /// property.
25 ///
26 /// ## References
27 ///
28 /// * <https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break>
29 /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
30 /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values>
31 pub enum GraphemeClusterBreak {
32 abbr => "GCB";
33 long => "Grapheme_Cluster_Break";
34 human => "Grapheme Cluster Break";
35
36 /// ```text
37 /// U+000D CARRIAGE RETURN (CR)
38 /// ```
39 CR {
40 abbr => CR,
41 long => CR,
42 human => "Carriage Return",
43 }
44
45 /// ```text
46 /// U+000A LINE FEED (LF)
47 /// ```
48 LF {
49 abbr => LF,
50 long => LF,
51 human => "Line Feed",
52 }
53
54 /// ```text
55 /// General_Category = Line_Separator, or
56 /// General_Category = Paragraph_Separator, or
57 /// General_Category = Control, or
58 /// General_Category = Unassigned and Default_Ignorable_Code_Point, or
59 /// General_Category = Surrogate, or
60 /// General_Category = Format
61 /// and not U+000D CARRIAGE RETURN
62 /// and not U+000A LINE FEED
63 /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
64 /// and not U+200D ZERO WIDTH JOINER (ZWJ)
65 /// ```
66 Control {
67 abbr => CN,
68 long => Control,
69 human => "Control",
70 }
71
72 /// ```text
73 /// Grapheme_Extend = Yes
74 ///
75 /// This includes:
76 /// General_Category = Nonspacing_Mark
77 /// General_Category = Enclosing_Mark
78 /// U+200C ZERO WIDTH NON-JOINER
79 /// plus a few General_Category = Spacing_Mark needed for canonical equivalence.
80 /// ```
81 Extend {
82 abbr => EX,
83 long => Extend,
84 human => "Extend",
85 }
86
87 /// ```text
88 /// U+200D ZERO WIDTH JOINER
89 /// ```
90 ZWJ {
91 abbr => ZWJ,
92 long => ZWJ,
93 human => "Zero Width Joiner (ZWJ)",
94 }
95
96 /// ```text
97 /// Regional_Indicator = Yes
98 /// ```
99 ///
100 /// This consists of the range:
101 ///
102 /// ```text
103 /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A
104 /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z
105 /// ```
106 RegionalIndicator {
107 abbr => RI,
108 long => Regional_Indicator,
109 human => "Regional Indicator",
110 }
111
112 /// ```text
113 /// Indic_Syllabic_Category = Consonant_Preceding_Repha, or
114 /// Indic_Syllabic_Category = Consonant_Prefixed, or
115 /// Prepended_Concatenation_Mark = Yes
116 /// ```
117 Prepend {
118 abbr => PP,
119 long => Prepend,
120 human => "Prepend",
121 }
122
123 /// ```text
124 /// Grapheme_Cluster_Break ≠ Extend, and
125 /// General_Category = Spacing_Mark, or
126 /// any of the following (which have General_Category = Other_Letter):
127 /// U+0E33 ( ำ ) THAI CHARACTER SARA AM
128 /// U+0EB3 ( ຳ ) LAO VOWEL SIGN AM
129 /// ```
130 ///
131 /// Exceptions: The following (which have General_Category = Spacing_Mark and would
132 /// otherwise be included) are specifically excluded:
133 ///
134 /// ```text
135 /// U+102B ( ါ ) MYANMAR VOWEL SIGN TALL AA
136 /// U+102C ( ာ ) MYANMAR VOWEL SIGN AA
137 /// U+1038 ( း ) MYANMAR SIGN VISARGA
138 /// U+1062 ( ၢ ) MYANMAR VOWEL SIGN SGAW KAREN EU
139 /// ..U+1064 ( ၤ ) MYANMAR TONE MARK SGAW KAREN KE PHO
140 /// U+1067 ( ၧ ) MYANMAR VOWEL SIGN WESTERN PWO KAREN EU
141 /// ..U+106D ( ၭ ) MYANMAR SIGN WESTERN PWO KAREN TONE-5
142 /// U+1083 ( ႃ ) MYANMAR VOWEL SIGN SHAN AA
143 /// U+1087 ( ႇ ) MYANMAR SIGN SHAN TONE-2
144 /// ..U+108C ( ႌ ) MYANMAR SIGN SHAN COUNCIL TONE-3
145 /// U+108F ( ႏ ) MYANMAR SIGN RUMAI PALAUNG TONE-5
146 /// U+109A ( ႚ ) MYANMAR SIGN KHAMTI TONE-1
147 /// ..U+109C ( ႜ ) MYANMAR VOWEL SIGN AITON A
148 /// U+1A61 ( ᩡ ) TAI THAM VOWEL SIGN A
149 /// U+1A63 ( ᩣ ) TAI THAM VOWEL SIGN AA
150 /// U+1A64 ( ᩤ ) TAI THAM VOWEL SIGN TALL AA
151 /// U+AA7B ( ꩻ ) MYANMAR SIGN PAO KAREN TONE
152 /// U+AA7D ( ꩽ ) MYANMAR SIGN TAI LAING TONE-5
153 /// U+11720 ( 𑜠 ) AHOM VOWEL SIGN A
154 /// U+11721 ( 𑜡 ) AHOM VOWEL SIGN AA
155 /// ```
156 SpacingMark {
157 abbr => SM,
158 long => SpacingMark,
159 human => "Spacing Mark",
160 }
161
162 // Hangul
163
164 /// ```text
165 /// Hangul_Syllable_Type=L
166 /// ```
167 ///
168 /// Such as:
169 ///
170 /// ```text
171 /// U+1100 ( ᄀ ) HANGUL CHOSEONG KIYEOK
172 /// U+115F ( ᅟ ) HANGUL CHOSEONG FILLER
173 /// U+A960 ( ꥠ ) HANGUL CHOSEONG TIKEUT-MIEUM
174 /// U+A97C ( ꥼ ) HANGUL CHOSEONG SSANGYEORINHIEUH
175 /// ```
176 L {
177 abbr => L,
178 long => L,
179 human => "Hangul Syllable Type L",
180 }
181
182 /// ```text
183 /// Hangul_Syllable_Type=V
184 /// ```
185 ///
186 /// Such as:
187 ///
188 /// ```text
189 /// U+1160 ( ᅠ ) HANGUL JUNGSEONG FILLER
190 /// U+11A2 ( ᆢ ) HANGUL JUNGSEONG SSANGARAEA
191 /// U+D7B0 ( ힰ ) HANGUL JUNGSEONG O-YEO
192 /// U+D7C6 ( ퟆ ) HANGUL JUNGSEONG ARAEA-E
193 /// ```
194 V {
195 abbr => V,
196 long => V,
197 human => "Hangul Syllable Type V",
198 }
199
200 /// ```text
201 /// Hangul_Syllable_Type=T
202 /// ```
203 ///
204 /// Such as:
205 ///
206 /// ```text
207 /// U+11A8 ( ᆨ ) HANGUL JONGSEONG KIYEOK
208 /// U+11F9 ( ᇹ ) HANGUL JONGSEONG YEORINHIEUH
209 /// U+D7CB ( ퟋ ) HANGUL JONGSEONG NIEUN-RIEUL
210 /// U+D7FB ( ퟻ ) HANGUL JONGSEONG PHIEUPH-THIEUTH
211 /// ```
212 T {
213 abbr => T,
214 long => T,
215 human => "Hangul Syllable Type T",
216 }
217
218 /// ```text
219 /// Hangul_Syllable_Type=LV:
220 /// ```
221 ///
222 /// That is:
223 ///
224 /// ```text
225 /// U+AC00 ( 가 ) HANGUL SYLLABLE GA
226 /// U+AC1C ( 개 ) HANGUL SYLLABLE GAE
227 /// U+AC38 ( 갸 ) HANGUL SYLLABLE GYA
228 /// ...
229 /// ```
230 LV {
231 abbr => LV,
232 long => LV,
233 human => "Hangul Syllable Type LV",
234 }
235
236 /// ```text
237 /// Hangul_Syllable_Type=LVT
238 /// ```
239 ///
240 /// That is:
241 ///
242 /// ```text
243 /// U+AC01 ( 각 ) HANGUL SYLLABLE GAG
244 /// U+AC02 ( 갂 ) HANGUL SYLLABLE GAGG
245 /// U+AC03 ( 갃 ) HANGUL SYLLABLE GAGS
246 /// U+AC04 ( 간 ) HANGUL SYLLABLE GAN
247 /// ...
248 /// ```
249 LVT {
250 abbr => LVT,
251 long => LVT,
252 human => "Hangul Syllable Type LVT",
253 }
254
255 // Emoji
256
257 /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not
258 /// occur after ZWJ in `emoji-zwj-sequences.txt`.
259 ///
260 /// See <https://www.unicode.org/reports/tr51/>.
261 EBase {
262 abbr => EB,
263 long => E_Base,
264 human => "Emoji Base",
265 }
266
267 /// Emoji characters listed as `Emoji_Modifer=Yes` in `emoji-data.txt`.
268 ///
269 /// See <https://www.unicode.org/reports/tr51/>.
270 EModifier {
271 abbr => EM,
272 long => E_Modifier,
273 human => "Emoji Modifier",
274 }
275
276 /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence,
277 /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`.
278 ///
279 /// See <https://www.unicode.org/reports/tr51/>.
280 GlueAfterZwj {
281 abbr => GAZ,
282 long => Glue_After_Zwj,
283 human => "Glue After ZWJ",
284 }
285
286 /// Emoji characters listed as `Emoji_Modifer_Base=Yes` in `emoji_data.txt`, and also occur
287 /// after ZWJ in `emoji-zwj-sequences.txt`.
288 ///
289 /// See <https://www.unicode.org/reports/tr51/>.
290 EBaseGAZ {
291 abbr => EBG,
292 long => E_Base_GAZ,
293 human => "Emoji Base and Glue After ZWJ",
294 }
295
296 /// All other characters
297 Other {
298 abbr => XX,
299 long => Other,
300 human => "Other",
301 }
302 }
303
304 /// Abbreviated name aliases for the
305 /// [`Grapheme_Cluster_Break`](https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break)
306 /// property.
307 ///
308 /// ## See Also
309 ///
310 /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
311 pub mod abbr_names for abbr;
312
313 /// Long name aliases for the
314 /// [`Grapheme_Cluster_Break`](https://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break)
315 /// property.
316 ///
317 /// ## See Also
318 ///
319 /// * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
320 pub mod long_names for long;
321}
322
323impl TotalCharProperty for GraphemeClusterBreak {
324 fn of(ch: char) -> Self {
325 Self::of(ch)
326 }
327}
328
329impl Default for GraphemeClusterBreak {
330 fn default() -> Self {
331 GraphemeClusterBreak::Other
332 }
333}
334
335mod data {
336 use super::long_names as GCB;
337 use unic_char_property::tables::CharDataTable;
338 pub const GRAPHEME_CLUSTER_BREAK_TABLE: CharDataTable<super::GraphemeClusterBreak> =
339 include!("../tables/grapheme_cluster_break.rsv");
340}
341
342impl GraphemeClusterBreak {
343 /// Find the character `Grapheme_Cluster_Break` property value.
344 pub fn of(ch: char) -> GraphemeClusterBreak {
345 data::GRAPHEME_CLUSTER_BREAK_TABLE.find_or_default(ch)
346 }
347}
348
#[cfg(test)]
mod tests {
    // NOTE(review): several section labels below ("Default R", "Default ET", "Default AL + R")
    // read like Bidi_Class terminology and appear to be carried over from another property's
    // tests; here they only name the code point ranges being sampled.
    use super::GraphemeClusterBreak as GCB;
    use unic_char_property::EnumeratedCharProperty;

    #[test]
    fn test_ascii() {
        assert_eq!(GCB::of('\u{0000}'), GCB::Control);
        assert_eq!(GCB::of('\u{0040}'), GCB::Other);
        assert_eq!(GCB::of('\u{0041}'), GCB::Other);
        assert_eq!(GCB::of('\u{0062}'), GCB::Other);
        assert_eq!(GCB::of('\u{007F}'), GCB::Control);
    }

    #[test]
    fn test_bmp() {
        // Hebrew
        assert_eq!(GCB::of('\u{0590}'), GCB::Other);
        assert_eq!(GCB::of('\u{05D0}'), GCB::Other);
        assert_eq!(GCB::of('\u{05D1}'), GCB::Other);
        assert_eq!(GCB::of('\u{05FF}'), GCB::Other);

        // Arabic
        assert_eq!(GCB::of('\u{0600}'), GCB::Prepend);
        assert_eq!(GCB::of('\u{0627}'), GCB::Other);
        assert_eq!(GCB::of('\u{07BF}'), GCB::Other);

        // Default R + Arabic Extras
        assert_eq!(GCB::of('\u{07C0}'), GCB::Other);
        assert_eq!(GCB::of('\u{085F}'), GCB::Other);
        assert_eq!(GCB::of('\u{0860}'), GCB::Other);
        assert_eq!(GCB::of('\u{0870}'), GCB::Other);
        assert_eq!(GCB::of('\u{089F}'), GCB::Other);
        assert_eq!(GCB::of('\u{08A0}'), GCB::Other);
        assert_eq!(GCB::of('\u{08FF}'), GCB::Extend);

        // Default ET
        assert_eq!(GCB::of('\u{20A0}'), GCB::Other);
        assert_eq!(GCB::of('\u{20CF}'), GCB::Other);

        // Arabic Presentation Forms
        assert_eq!(GCB::of('\u{FB1D}'), GCB::Other);
        assert_eq!(GCB::of('\u{FB4F}'), GCB::Other);
        assert_eq!(GCB::of('\u{FB50}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDCF}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDF0}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{FE70}'), GCB::Other);
        assert_eq!(GCB::of('\u{FEFE}'), GCB::Other);
        assert_eq!(GCB::of('\u{FEFF}'), GCB::Control);

        // noncharacters
        assert_eq!(GCB::of('\u{FDD0}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDD1}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDEE}'), GCB::Other);
        assert_eq!(GCB::of('\u{FDEF}'), GCB::Other);
        assert_eq!(GCB::of('\u{FFFE}'), GCB::Other);
        assert_eq!(GCB::of('\u{FFFF}'), GCB::Other);
    }

    #[test]
    fn test_smp() {
        // Default AL + R
        assert_eq!(GCB::of('\u{10800}'), GCB::Other);
        assert_eq!(GCB::of('\u{10FFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{1E800}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EDFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EE00}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EEFF}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EF00}'), GCB::Other);
        assert_eq!(GCB::of('\u{1EFFF}'), GCB::Other);
    }

    #[test]
    fn test_unassigned_planes() {
        assert_eq!(GCB::of('\u{30000}'), GCB::Other);
        assert_eq!(GCB::of('\u{40000}'), GCB::Other);
        assert_eq!(GCB::of('\u{50000}'), GCB::Other);
        assert_eq!(GCB::of('\u{60000}'), GCB::Other);
        assert_eq!(GCB::of('\u{70000}'), GCB::Other);
        assert_eq!(GCB::of('\u{80000}'), GCB::Other);
        assert_eq!(GCB::of('\u{90000}'), GCB::Other);
        assert_eq!(GCB::of('\u{a0000}'), GCB::Other);
    }

    #[test]
    fn test_abbr_name() {
        assert_eq!(GCB::CR.abbr_name(), "CR");
    }

    #[test]
    fn test_long_name() {
        assert_eq!(GCB::CR.long_name(), "CR");
    }

    #[test]
    fn test_human_name() {
        assert_eq!(GCB::CR.human_name(), "Carriage Return");
    }
}