unic_ucd_segment/
sentence_break.rs

1// Copyright 2017 The UNIC Project Developers.
2//
3// See the COPYRIGHT file at the top-level directory of this distribution.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode `Sentence_Break` Character Property.
12//!
13//! ## References
14//!
15//! * <https://www.unicode.org/reports/tr44/#Sentence_Break>
16//! * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
17//! * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values>
18
19use unic_char_property::TotalCharProperty;
20
21char_property! {
22    /// Represents the Unicode character
23    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
24    /// property.
25    ///
26    /// ## References
27    ///
28    /// * <https://www.unicode.org/reports/tr44/#Sentence_Break>
29    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
30    /// * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values>
31    pub enum SentenceBreak {
32        abbr => "SB";
33        long => "Sentence_Break";
34        human => "Sentence Break";
35
36        /// ```text
37        /// U+000D CARRIAGE RETURN (CR)
38        /// ```
39        CR {
40            abbr => CR,
41            long => CR,
42            human => "Carriage Return",
43        }
44
45        /// ```text
46        /// U+000A LINE FEED (LF)
47        /// ```
48        LF {
49            abbr => LF,
50            long => LF,
51            human => "Line Feed",
52        }
53
54        /// ```text
55        /// Grapheme_Extend = Yes, or
56        /// U+200D ZERO WIDTH JOINER (ZWJ), or
57        /// General_Category = Spacing_Mark
58        /// ```
59        Extend {
60            abbr => Extend,
61            long => Extend,
62            human => "Extend",
63        }
64
65        /// ```text
66        /// U+0085 NEXT LINE (NEL)
67        /// U+2028 LINE SEPARATOR
68        /// U+2029 PARAGRAPH SEPARATOR
69        /// ```
70        Sep {
71            abbr => SE,
72            long => Sep,
73            human => "Separator",
74        }
75
76        /// ```text
77        /// General_Category = Format
78        /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
79        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
80        /// ```
81        Format {
82            abbr => FO,
83            long => Format,
84            human => "Format",
85        }
86
87        /// ```text
88        /// White_Space = Yes
89        /// and Sentence_Break ≠ Sep
90        /// and Sentence_Break ≠ CR
91        /// and Sentence_Break ≠ LF
92        /// ```
93        Sp {
94            abbr => SP,
95            long => Sp,
96            human => "Space",
97        }
98
99        /// ```text
100        /// Lowercase = Yes
101        /// and Grapheme_Extend = No
102        /// ```
103        Lower {
104            abbr => LO,
105            long => Lower,
106            human => "Lowercase",
107        }
108
109        /// ```text
110        /// General_Category = Titlecase_Letter, or
111        /// Uppercase = Yes
112        /// ```
113        Upper {
114            abbr => UP,
115            long => Upper,
116            human => "Uppercase",
117        }
118
119        /// ```text
120        /// Alphabetic = Yes, or
121        /// U+00A0 NO-BREAK SPACE (NBSP), or
122        /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
123        /// and Lower = No
124        /// and Upper = No
125        /// and Sentence_Break ≠ Extend
126        /// ```
127        OLetter {
128            abbr => LE,
129            long => OLetter,
130            human => "Other Letter",
131        }
132
133        /// ```text
134        /// Line_Break = Numeric
135        /// ```
136        Numeric {
137            abbr => NU,
138            long => Numeric,
139            human => "Numeric",
140        }
141
142        /// ```text
143        /// U+002E ( . ) FULL STOP
144        /// U+2024 ( ․ ) ONE DOT LEADER
145        /// U+FE52 ( ﹒ ) SMALL FULL STOP
146        /// U+FF0E ( . ) FULLWIDTH FULL STOP
147        /// ```
148        ATerm {
149            abbr => AT,
150            long => ATerm,
151            human => "ATerm",
152        }
153
154        /// ```text
155        /// U+002C ( , ) COMMA
156        /// U+002D ( - ) HYPHEN-MINUS
157        /// U+003A ( : ) COLON
158        /// U+055D ( ՝ ) ARMENIAN COMMA
159        /// U+060C ( ، ) ARABIC COMMA
160        /// U+060D ( ‎؍‎ ) ARABIC DATE SEPARATOR
161        /// U+07F8 ( ߸ ) NKO COMMA
162        /// U+1802 ( ᠂ ) MONGOLIAN COMMA
163        /// U+1808 ( ᠈ ) MONGOLIAN MANCHU COMMA
164        /// U+2013 ( – ) EN DASH
165        /// U+2014 ( — ) EM DASH
166        /// U+3001 ( 、 ) IDEOGRAPHIC COMMA
167        /// U+FE10 ( ︐ ) PRESENTATION FORM FOR VERTICAL COMMA
168        /// U+FE11 ( ︑ ) PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA
169        /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
170        /// U+FE31 ( ︱ ) PRESENTATION FORM FOR VERTICAL EM DASH
171        /// U+FE32 ( ︲ ) PRESENTATION FORM FOR VERTICAL EN DASH
172        /// U+FE50 ( ﹐ ) SMALL COMMA
173        /// U+FE51 ( ﹑ ) SMALL IDEOGRAPHIC COMMA
174        /// U+FE55 ( ﹕ ) SMALL COLON
175        /// U+FE58 ( ﹘ ) SMALL EM DASH
176        /// U+FE63 ( ﹣ ) SMALL HYPHEN-MINUS
177        /// U+FF0C ( , ) FULLWIDTH COMMA
178        /// U+FF0D ( - ) FULLWIDTH HYPHEN-MINUS
179        /// U+FF1A ( : ) FULLWIDTH COLON
180        /// U+FF64 ( 、 ) HALFWIDTH IDEOGRAPHIC COMMA
181        /// ```
182        SContinue {
183            abbr => SC,
184            long => SContinue,
185            human => "Sentence Continue",
186        }
187
188        /// ```text
189        /// Sentence_Terminal = Yes
190        /// ```
191        STerm {
192            abbr => ST,
193            long => STerm,
194            human => "Sentence Terminal",
195        }
196
197        /// ```text
198        /// General_Category = Open_Punctuation, or
199        /// General_Category = Close_Punctuation, or
200        /// Line_Break = Quotation
201        /// and not U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
202        /// and ATerm = No
203        /// and STerm = No
204        /// ```
205        Close {
206            abbr => CL,
207            long => Close,
208            human => "Close",
209        }
210
211        /// All other characters
212        Other {
213            abbr => XX,
214            long => Other,
215            human => "Other",
216        }
217    }
218
219    /// Abbreviated name aliases for the
220    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
221    /// property.
222    ///
223    /// ## See Also
224    ///
225    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
226    pub mod abbr_names for abbr;
227
228    /// Long name aliases for the
229    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
230    /// property.
231    ///
232    /// ## See Also
233    ///
234    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
235    pub mod long_names for long;
236}
237
238impl TotalCharProperty for SentenceBreak {
239    fn of(ch: char) -> Self {
240        Self::of(ch)
241    }
242}
243
244impl Default for SentenceBreak {
245    fn default() -> Self {
246        SentenceBreak::Other
247    }
248}
249
250mod data {
251    use super::long_names as SB;
252    use unic_char_property::tables::CharDataTable;
253    pub const SENTENCE_BREAK_TABLE: CharDataTable<super::SentenceBreak> =
254        include!("../tables/sentence_break.rsv");
255}
256
257impl SentenceBreak {
258    /// Find the character `Sentence_Break` property value.
259    pub fn of(ch: char) -> SentenceBreak {
260        data::SENTENCE_BREAK_TABLE.find_or_default(ch)
261    }
262}
263
#[cfg(test)]
mod tests {
    use super::SentenceBreak as SB;
    use unic_char_property::EnumeratedCharProperty;

    /// Asserts that each `(character, expected value)` pair matches the
    /// result of `SentenceBreak::of`, naming the code point on failure.
    fn assert_all(cases: &[(char, SB)]) {
        for case in cases {
            assert_eq!(
                SB::of(case.0),
                case.1,
                "unexpected Sentence_Break value for U+{:04X}",
                case.0 as u32
            );
        }
    }

    #[test]
    fn test_ascii() {
        assert_all(&[
            ('\u{0000}', SB::Other),
            ('\u{0040}', SB::Other),
            ('\u{0041}', SB::Upper),
            ('\u{0062}', SB::Lower),
            ('\u{007F}', SB::Other),
        ]);
    }

    #[test]
    fn test_bmp() {
        // U+0590..U+05FF
        assert_all(&[
            ('\u{0590}', SB::Other),
            ('\u{05D0}', SB::OLetter),
            ('\u{05D1}', SB::OLetter),
            ('\u{05FF}', SB::Other),
        ]);

        // U+0600..U+07BF
        assert_all(&[
            ('\u{0600}', SB::Format),
            ('\u{0627}', SB::OLetter),
            ('\u{07BF}', SB::Other),
        ]);

        // U+07C0..U+08FF
        assert_all(&[
            ('\u{07C0}', SB::Numeric),
            ('\u{085F}', SB::Other),
            ('\u{0860}', SB::OLetter),
            ('\u{0870}', SB::Other),
            ('\u{089F}', SB::Other),
            ('\u{08A0}', SB::OLetter),
            ('\u{08FF}', SB::Extend),
        ]);

        // U+20A0..U+20CF
        assert_all(&[
            ('\u{20A0}', SB::Other),
            ('\u{20CF}', SB::Other),
        ]);

        // U+FB1D..U+FEFF
        assert_all(&[
            ('\u{FB1D}', SB::OLetter),
            ('\u{FB4F}', SB::OLetter),
            ('\u{FB50}', SB::OLetter),
            ('\u{FDCF}', SB::Other),
            ('\u{FDF0}', SB::OLetter),
            ('\u{FDFF}', SB::Other),
            ('\u{FE70}', SB::OLetter),
            ('\u{FEFE}', SB::Other),
            ('\u{FEFF}', SB::Format),
        ]);

        // noncharacters
        assert_all(&[
            ('\u{FDD0}', SB::Other),
            ('\u{FDD1}', SB::Other),
            ('\u{FDEE}', SB::Other),
            ('\u{FDEF}', SB::Other),
            ('\u{FFFE}', SB::Other),
            ('\u{FFFF}', SB::Other),
        ]);
    }

    #[test]
    fn test_smp() {
        // U+10800..U+10FFF and U+1E800..U+1EFFF
        assert_all(&[
            ('\u{10800}', SB::OLetter),
            ('\u{10FFF}', SB::Other),
            ('\u{1E800}', SB::OLetter),
            ('\u{1EDFF}', SB::Other),
            ('\u{1EE00}', SB::OLetter),
            ('\u{1EEFF}', SB::Other),
            ('\u{1EF00}', SB::Other),
            ('\u{1EFFF}', SB::Other),
        ]);
    }

    #[test]
    fn test_unassigned_planes() {
        // First code point of each of planes 3 through 10.
        assert_all(&[
            ('\u{30000}', SB::Other),
            ('\u{40000}', SB::Other),
            ('\u{50000}', SB::Other),
            ('\u{60000}', SB::Other),
            ('\u{70000}', SB::Other),
            ('\u{80000}', SB::Other),
            ('\u{90000}', SB::Other),
            ('\u{a0000}', SB::Other),
        ]);
    }

    #[test]
    fn test_abbr_name() {
        assert_eq!(SB::CR.abbr_name(), "CR");
    }

    #[test]
    fn test_long_name() {
        assert_eq!(SB::CR.long_name(), "CR");
    }

    #[test]
    fn test_human_name() {
        assert_eq!(SB::CR.human_name(), "Carriage Return");
    }
}