unic_ucd_segment/
word_break.rs

1// Copyright 2017 The UNIC Project Developers.
2//
3// See the COPYRIGHT file at the top-level directory of this distribution.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode `Word_Break` Character Property.
12//!
13//! ## References
14//!
15//! * <https://www.unicode.org/reports/tr44/#Word_Break>
16//! * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
17//! * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
18
19use unic_char_property::TotalCharProperty;
20
char_property! {
    /// Represents the Unicode character
    /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
    /// property.
    ///
    /// ## References
    ///
    /// * <https://www.unicode.org/reports/tr44/#Word_Break>
    /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
    /// * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
    pub enum WordBreak {
        // Names of the property itself: abbreviated, long and human-readable.
        abbr => "WB";
        long => "Word_Break";
        human => "Word Break";

        // Each variant below carries its own abbreviated, long and human-readable
        // names; the doc comment on a variant describes which characters it covers.

        /// ```text
        /// U+000D CARRIAGE RETURN (CR)
        /// ```
        CR {
            abbr => CR,
            long => CR,
            human => "Carriage Return",
        }

        /// ```text
        /// U+000A LINE FEED (LF)
        /// ```
        LF {
            abbr => LF,
            long => LF,
            human => "Line Feed",
        }

        /// ```text
        /// U+000B LINE TABULATION
        /// U+000C FORM FEED (FF)
        /// U+0085 NEXT LINE (NEL)
        /// U+2028 LINE SEPARATOR
        /// U+2029 PARAGRAPH SEPARATOR
        /// ```
        Newline {
            abbr => NL,
            long => Newline,
            human => "Newline",
        }

        /// ```text
        /// Grapheme_Extend = Yes, or
        /// General_Category = Spacing_Mark
        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
        /// ```
        Extend {
            abbr => Extend,
            long => Extend,
            human => "Extend",
        }

        /// ```text
        /// U+200D ZERO WIDTH JOINER
        /// ```
        ZWJ {
            abbr => ZWJ,
            long => ZWJ,
            human => "Zero Width Joiner (ZWJ)",
        }

        /// ```text
        /// Regional_Indicator = Yes
        /// ```
        ///
        /// This consists of the range:
        ///
        /// ```text
        /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A
        /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z
        /// ```
        RegionalIndicator {
            abbr => RI,
            long => Regional_Indicator,
            human => "Regional Indicator",
        }

        /// ```text
        /// General_Category = Format
        /// and not U+200B ZERO WIDTH SPACE (ZWSP)
        /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
        /// ```
        Format {
            abbr => FO,
            long => Format,
            human => "Format",
        }

        /// ```text
        /// Script = KATAKANA, or
        /// any of the following:
        /// U+3031 ( 〱 ) VERTICAL KANA REPEAT MARK
        /// U+3032 ( 〲 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK
        /// U+3033 ( 〳 ) VERTICAL KANA REPEAT MARK UPPER HALF
        /// U+3034 ( 〴 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
        /// U+3035 ( 〵 ) VERTICAL KANA REPEAT MARK LOWER HALF
        /// U+309B ( ゛ ) KATAKANA-HIRAGANA VOICED SOUND MARK
        /// U+309C ( ゜ ) KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
        /// U+30A0 ( ゠ ) KATAKANA-HIRAGANA DOUBLE HYPHEN
        /// U+30FC ( ー ) KATAKANA-HIRAGANA PROLONGED SOUND MARK
        /// U+FF70 ( ｰ ) HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
        /// ```
        Katakana {
            abbr => KA,
            long => Katakana,
            human => "Katakana",
        }

        /// ```text
        /// Script = Hebrew
        /// and General_Category = Other_Letter
        /// ```
        HebrewLetter {
            abbr => HL,
            long => Hebrew_Letter,
            human => "Hebrew Letter",
        }

        /// ```text
        /// Alphabetic = Yes, or
        /// any of the following 36 characters:
        /// U+02C2 ( ˂ ) MODIFIER LETTER LEFT ARROWHEAD
        /// ..U+02C5 ( ˅ ) MODIFIER LETTER DOWN ARROWHEAD
        /// U+02D2 ( ˒ ) MODIFIER LETTER CENTRED RIGHT HALF RING
        /// ..U+02D7 ( ˗ ) MODIFIER LETTER MINUS SIGN
        /// U+02DE ( ˞ ) MODIFIER LETTER RHOTIC HOOK
        /// U+02DF ( ˟ ) MODIFIER LETTER CROSS ACCENT
        /// U+02ED ( ˭ ) MODIFIER LETTER UNASPIRATED
        /// U+02EF ( ˯ ) MODIFIER LETTER LOW DOWN ARROWHEAD
        /// ..U+02FF ( ˿ ) MODIFIER LETTER LOW LEFT ARROW
        /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
        /// U+A720 ( ꜠ ) MODIFIER LETTER STRESS AND HIGH TONE
        /// U+A721 ( ꜡ ) MODIFIER LETTER STRESS AND LOW TONE
        /// U+A789 ( ꞉ ) MODIFIER LETTER COLON
        /// U+A78A ( ꞊ ) MODIFIER LETTER SHORT EQUALS SIGN
        /// U+AB5B ( ꭛ ) MODIFIER BREVE WITH INVERTED BREVE
        /// and Ideographic = No
        /// and Word_Break ≠ Katakana
        /// and Line_Break ≠ Complex_Context (SA)
        /// and Script ≠ Hiragana
        /// and Word_Break ≠ Extend
        /// and Word_Break ≠ Hebrew_Letter
        /// ```
        ALetter {
            abbr => LE,
            long => ALetter,
            human => "Alphabetic Letter",
        }

        /// ```text
        /// U+0027 ( ' ) APOSTROPHE
        /// ```
        SingleQuote {
            abbr => SQ,
            long => Single_Quote,
            human => "Single Quote",
        }

        /// ```text
        /// U+0022 ( " ) QUOTATION MARK
        /// ```
        DoubleQuote {
            abbr => DQ,
            long => Double_Quote,
            human => "Double Quote",
        }

        /// ```text
        /// U+002E ( . ) FULL STOP
        /// U+2018 ( ‘ ) LEFT SINGLE QUOTATION MARK
        /// U+2019 ( ’ ) RIGHT SINGLE QUOTATION MARK
        /// U+2024 ( ․ ) ONE DOT LEADER
        /// U+FE52 ( ﹒ ) SMALL FULL STOP
        /// U+FF07 ( ＇ ) FULLWIDTH APOSTROPHE
        /// U+FF0E ( ． ) FULLWIDTH FULL STOP
        /// ```
        MidNumLet {
            abbr => MB,
            long => MidNumLet,
            human => "Middle of Numeric/Letter",
        }

        /// ```text
        /// U+00B7 ( · ) MIDDLE DOT
        /// U+0387 ( · ) GREEK ANO TELEIA
        /// U+05F4 ( ״ ) HEBREW PUNCTUATION GERSHAYIM
        /// U+2027 ( ‧ ) HYPHENATION POINT
        /// U+003A ( : ) COLON (used in Swedish)
        /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
        /// U+FE55 ( ﹕ ) SMALL COLON
        /// U+FF1A ( ： ) FULLWIDTH COLON
        /// ```
        MidLetter {
            abbr => ML,
            long => MidLetter,
            human => "Middle of Letter",
        }

        /// ```text
        /// Line_Break = Infix_Numeric, or
        /// any of the following:
        /// U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR
        /// U+FE50 ( ﹐ ) SMALL COMMA
        /// U+FE54 ( ﹔ ) SMALL SEMICOLON
        /// U+FF0C ( ， ) FULLWIDTH COMMA
        /// U+FF1B ( ； ) FULLWIDTH SEMICOLON
        /// and not U+003A ( : ) COLON
        /// and not U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
        /// and not U+002E ( . ) FULL STOP
        /// ```
        MidNum {
            abbr => MN,
            long => MidNum,
            human => "Middle of Numeric",
        }

        /// ```text
        /// Line_Break = Numeric
        /// and not U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR
        /// ```
        Numeric {
            abbr => NU,
            long => Numeric,
            human => "Numeric",
        }

        /// ```text
        /// General_Category = Connector_Punctuation, or
        /// U+202F NARROW NO-BREAK SPACE (NNBSP)
        /// ```
        ExtendNumLet {
            abbr => EX,
            long => ExtendNumLet,
            human => "Extend Numeric/Letter",
        }

        // Emoji-related values; their definitions refer to the emoji data files
        // named in each doc comment.

        /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not
        /// occur after ZWJ in `emoji-zwj-sequences.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        EBase {
            abbr => EB,
            long => E_Base,
            human => "Emoji Base",
        }

        /// Emoji characters listed as `Emoji_Modifier=Yes` in `emoji-data.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        EModifier {
            abbr => EM,
            long => E_Modifier,
            human => "Emoji Modifier",
        }

        /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence,
        /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        GlueAfterZwj {
            abbr => GAZ,
            long => Glue_After_Zwj,
            human => "Glue After ZWJ",
        }

        /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, and also
        /// occur after ZWJ in `emoji-zwj-sequences.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        EBaseGAZ {
            abbr => EBG,
            long => E_Base_GAZ,
            human => "Emoji Base and Glue After ZWJ",
        }

        /// All other characters
        Other {
            abbr => XX,
            long => Other,
            human => "Other",
        }
    }

    /// Abbreviated name aliases for the
    /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
    pub mod abbr_names for abbr;

    /// Long name aliases for the
    /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
    pub mod long_names for long;
}
330
331impl TotalCharProperty for WordBreak {
332    fn of(ch: char) -> Self {
333        Self::of(ch)
334    }
335}
336
337impl Default for WordBreak {
338    fn default() -> Self {
339        WordBreak::Other
340    }
341}
342
/// Private home of the lookup table consulted by `WordBreak::of`.
mod data {
    // `WB` is not referenced by any code visible in this module; presumably the
    // included table file spells its values through this alias — verify before
    // removing or renaming it.
    use super::long_names as WB;
    use unic_char_property::tables::CharDataTable;
    /// `Word_Break` lookup table, textually included from `../tables/word_break.rsv`.
    pub const WORD_BREAK_TABLE: CharDataTable<super::WordBreak> =
        include!("../tables/word_break.rsv");
}
349
350impl WordBreak {
351    /// Find the character `Word_Break` property value.
352    pub fn of(ch: char) -> WordBreak {
353        data::WORD_BREAK_TABLE.find_or_default(ch)
354    }
355}
356
#[cfg(test)]
mod tests {
    // Section comments below describe code point ranges only; they are labels for
    // the sampled values, not statements about how the table is built.
    use super::WordBreak as WB;
    use unic_char_property::EnumeratedCharProperty;

    /// Spot-checks at both ends of the ASCII range and around the first letters.
    #[test]
    fn test_ascii() {
        assert_eq!(WB::of('\u{0000}'), WB::Other);
        assert_eq!(WB::of('\u{0040}'), WB::Other);
        assert_eq!(WB::of('\u{0041}'), WB::ALetter);
        assert_eq!(WB::of('\u{0062}'), WB::ALetter);
        assert_eq!(WB::of('\u{007F}'), WB::Other);
    }

    /// Spot-checks in the Basic Multilingual Plane.
    #[test]
    fn test_bmp() {
        // Hebrew (U+0590..=U+05FF)
        assert_eq!(WB::of('\u{0590}'), WB::Other);
        assert_eq!(WB::of('\u{05D0}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{05D1}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{05FF}'), WB::Other);

        // Arabic and the code points that follow, up to U+07BF
        assert_eq!(WB::of('\u{0600}'), WB::Format);
        assert_eq!(WB::of('\u{0627}'), WB::ALetter);
        assert_eq!(WB::of('\u{07BF}'), WB::Other);

        // U+07C0..=U+08FF
        assert_eq!(WB::of('\u{07C0}'), WB::Numeric);
        assert_eq!(WB::of('\u{085F}'), WB::Other);
        assert_eq!(WB::of('\u{0860}'), WB::ALetter);
        assert_eq!(WB::of('\u{0870}'), WB::Other);
        assert_eq!(WB::of('\u{089F}'), WB::Other);
        assert_eq!(WB::of('\u{08A0}'), WB::ALetter);
        assert_eq!(WB::of('\u{08FF}'), WB::Extend);

        // U+20A0..=U+20CF
        assert_eq!(WB::of('\u{20A0}'), WB::Other);
        assert_eq!(WB::of('\u{20CF}'), WB::Other);

        // Hebrew and Arabic presentation forms (U+FB1D..=U+FEFF)
        assert_eq!(WB::of('\u{FB1D}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{FB4F}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{FB50}'), WB::ALetter);
        assert_eq!(WB::of('\u{FDCF}'), WB::Other);
        assert_eq!(WB::of('\u{FDF0}'), WB::ALetter);
        assert_eq!(WB::of('\u{FDFF}'), WB::Other);
        assert_eq!(WB::of('\u{FE70}'), WB::ALetter);
        assert_eq!(WB::of('\u{FEFE}'), WB::Other);
        assert_eq!(WB::of('\u{FEFF}'), WB::Format);

        // Noncharacters
        assert_eq!(WB::of('\u{FDD0}'), WB::Other);
        assert_eq!(WB::of('\u{FDD1}'), WB::Other);
        assert_eq!(WB::of('\u{FDEE}'), WB::Other);
        assert_eq!(WB::of('\u{FDEF}'), WB::Other);
        assert_eq!(WB::of('\u{FFFE}'), WB::Other);
        assert_eq!(WB::of('\u{FFFF}'), WB::Other);
    }

    /// Spot-checks in the Supplementary Multilingual Plane.
    #[test]
    fn test_smp() {
        // Sampled boundaries in U+10800..=U+10FFF and U+1E800..=U+1EFFF
        assert_eq!(WB::of('\u{10800}'), WB::ALetter);
        assert_eq!(WB::of('\u{10FFF}'), WB::Other);
        assert_eq!(WB::of('\u{1E800}'), WB::ALetter);
        assert_eq!(WB::of('\u{1EDFF}'), WB::Other);
        assert_eq!(WB::of('\u{1EE00}'), WB::ALetter);
        assert_eq!(WB::of('\u{1EEFF}'), WB::Other);
        assert_eq!(WB::of('\u{1EF00}'), WB::Other);
        assert_eq!(WB::of('\u{1EFFF}'), WB::Other);
    }

    /// The first code point of each of planes 3 through 10 is expected to be `Other`.
    #[test]
    fn test_unassigned_planes() {
        assert_eq!(WB::of('\u{30000}'), WB::Other);
        assert_eq!(WB::of('\u{40000}'), WB::Other);
        assert_eq!(WB::of('\u{50000}'), WB::Other);
        assert_eq!(WB::of('\u{60000}'), WB::Other);
        assert_eq!(WB::of('\u{70000}'), WB::Other);
        assert_eq!(WB::of('\u{80000}'), WB::Other);
        assert_eq!(WB::of('\u{90000}'), WB::Other);
        assert_eq!(WB::of('\u{a0000}'), WB::Other);
    }

    /// Abbreviated value name as exposed through `EnumeratedCharProperty`.
    #[test]
    fn test_abbr_name() {
        assert_eq!(WB::CR.abbr_name(), "CR");
    }

    /// Long value name as exposed through `EnumeratedCharProperty`.
    #[test]
    fn test_long_name() {
        assert_eq!(WB::CR.long_name(), "CR");
    }

    /// Human-readable value name as exposed through `EnumeratedCharProperty`.
    #[test]
    fn test_human_name() {
        assert_eq!(WB::CR.human_name(), "Carriage Return");
    }
}