unic_ucd_segment/sentence_break.rs
1// Copyright 2017 The UNIC Project Developers.
2//
3// See the COPYRIGHT file at the top-level directory of this distribution.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode `Sentence_Break` Character Property.
12//!
13//! ## References
14//!
15//! * <https://www.unicode.org/reports/tr44/#Sentence_Break>
16//! * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
17//! * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values>
18
19use unic_char_property::TotalCharProperty;
20
char_property! {
    /// Represents the Unicode character
    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
    /// property.
    ///
    /// Every variant documents, in a `text` block, the set of characters it
    /// is assigned to.
    ///
    /// ## References
    ///
    /// * <https://www.unicode.org/reports/tr44/#Sentence_Break>
    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
    /// * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values>
    pub enum SentenceBreak {
        // Names of the property itself (as opposed to the names of its values,
        // which are given per variant below).
        abbr => "SB";
        long => "Sentence_Break";
        human => "Sentence Break";

        /// Carriage return.
        ///
        /// ```text
        /// U+000D CARRIAGE RETURN (CR)
        /// ```
        CR {
            abbr => CR,
            long => CR,
            human => "Carriage Return",
        }

        /// Line feed.
        ///
        /// ```text
        /// U+000A LINE FEED (LF)
        /// ```
        LF {
            abbr => LF,
            long => LF,
            human => "Line Feed",
        }

        /// Extending characters.
        ///
        /// ```text
        /// Grapheme_Extend = Yes, or
        /// U+200D ZERO WIDTH JOINER (ZWJ), or
        /// General_Category = Spacing_Mark
        /// ```
        Extend {
            // NOTE(review): the short alias here repeats the long name. Unicode's
            // PropertyValueAliases.txt appears to list `EX` as the short alias of
            // `Sentence_Break=Extend` -- confirm. Changing it would rename the
            // public `abbr_names::Extend` item, so it is left untouched here.
            abbr => Extend,
            long => Extend,
            human => "Extend",
        }

        /// Line and paragraph separators (other than CR and LF).
        ///
        /// ```text
        /// U+0085 NEXT LINE (NEL)
        /// U+2028 LINE SEPARATOR
        /// U+2029 PARAGRAPH SEPARATOR
        /// ```
        Sep {
            abbr => SE,
            long => Sep,
            human => "Separator",
        }

        /// Format characters.
        ///
        /// ```text
        /// General_Category = Format
        /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
        /// ```
        Format {
            abbr => FO,
            long => Format,
            human => "Format",
        }

        /// White space that is not a line or paragraph separator.
        ///
        /// ```text
        /// White_Space = Yes
        /// and Sentence_Break ≠ Sep
        /// and Sentence_Break ≠ CR
        /// and Sentence_Break ≠ LF
        /// ```
        Sp {
            abbr => SP,
            long => Sp,
            human => "Space",
        }

        /// Lowercase characters.
        ///
        /// ```text
        /// Lowercase = Yes
        /// and Grapheme_Extend = No
        /// ```
        Lower {
            abbr => LO,
            long => Lower,
            human => "Lowercase",
        }

        /// Uppercase and titlecase characters.
        ///
        /// ```text
        /// General_Category = Titlecase_Letter, or
        /// Uppercase = Yes
        /// ```
        Upper {
            abbr => UP,
            long => Upper,
            human => "Uppercase",
        }

        /// Letters that are neither `Lower` nor `Upper`.
        ///
        /// ```text
        /// Alphabetic = Yes, or
        /// U+00A0 NO-BREAK SPACE (NBSP), or
        /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
        /// and Lower = No
        /// and Upper = No
        /// and Sentence_Break ≠ Extend
        /// ```
        OLetter {
            abbr => LE,
            long => OLetter,
            human => "Other Letter",
        }

        /// Numeric characters.
        ///
        /// ```text
        /// Line_Break = Numeric
        /// ```
        Numeric {
            abbr => NU,
            long => Numeric,
            human => "Numeric",
        }

        /// Full-stop-like characters.
        ///
        /// ```text
        /// U+002E ( . ) FULL STOP
        /// U+2024 ( ․ ) ONE DOT LEADER
        /// U+FE52 ( ﹒ ) SMALL FULL STOP
        /// U+FF0E ( ． ) FULLWIDTH FULL STOP
        /// ```
        ATerm {
            abbr => AT,
            long => ATerm,
            human => "ATerm",
        }

        /// Commas, colons and dashes.
        ///
        /// ```text
        /// U+002C ( , ) COMMA
        /// U+002D ( - ) HYPHEN-MINUS
        /// U+003A ( : ) COLON
        /// U+055D ( ՝ ) ARMENIAN COMMA
        /// U+060C ( ، ) ARABIC COMMA
        /// U+060D ( ؍ ) ARABIC DATE SEPARATOR
        /// U+07F8 ( ߸ ) NKO COMMA
        /// U+1802 ( ᠂ ) MONGOLIAN COMMA
        /// U+1808 ( ᠈ ) MONGOLIAN MANCHU COMMA
        /// U+2013 ( – ) EN DASH
        /// U+2014 ( — ) EM DASH
        /// U+3001 ( 、 ) IDEOGRAPHIC COMMA
        /// U+FE10 ( ︐ ) PRESENTATION FORM FOR VERTICAL COMMA
        /// U+FE11 ( ︑ ) PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA
        /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
        /// U+FE31 ( ︱ ) PRESENTATION FORM FOR VERTICAL EM DASH
        /// U+FE32 ( ︲ ) PRESENTATION FORM FOR VERTICAL EN DASH
        /// U+FE50 ( ﹐ ) SMALL COMMA
        /// U+FE51 ( ﹑ ) SMALL IDEOGRAPHIC COMMA
        /// U+FE55 ( ﹕ ) SMALL COLON
        /// U+FE58 ( ﹘ ) SMALL EM DASH
        /// U+FE63 ( ﹣ ) SMALL HYPHEN-MINUS
        /// U+FF0C ( ， ) FULLWIDTH COMMA
        /// U+FF0D ( － ) FULLWIDTH HYPHEN-MINUS
        /// U+FF1A ( ： ) FULLWIDTH COLON
        /// U+FF64 ( ､ ) HALFWIDTH IDEOGRAPHIC COMMA
        /// ```
        SContinue {
            abbr => SC,
            long => SContinue,
            human => "Sentence Continue",
        }

        /// Sentence-terminating characters.
        ///
        /// ```text
        /// Sentence_Terminal = Yes
        /// ```
        STerm {
            abbr => ST,
            long => STerm,
            human => "Sentence Terminal",
        }

        /// Opening and closing punctuation and quotation marks.
        ///
        /// ```text
        /// General_Category = Open_Punctuation, or
        /// General_Category = Close_Punctuation, or
        /// Line_Break = Quotation
        /// and not U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
        /// and ATerm = No
        /// and STerm = No
        /// ```
        Close {
            abbr => CL,
            long => Close,
            human => "Close",
        }

        /// All other characters, i.e. those matched by none of the variants
        /// above.
        Other {
            abbr => XX,
            long => Other,
            human => "Other",
        }
    }

    /// Abbreviated name aliases for the
    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
    pub mod abbr_names for abbr;

    /// Long name aliases for the
    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
    pub mod long_names for long;
}
237
238impl TotalCharProperty for SentenceBreak {
239 fn of(ch: char) -> Self {
240 Self::of(ch)
241 }
242}
243
244impl Default for SentenceBreak {
245 fn default() -> Self {
246 SentenceBreak::Other
247 }
248}
249
250mod data {
251 use super::long_names as SB;
252 use unic_char_property::tables::CharDataTable;
253 pub const SENTENCE_BREAK_TABLE: CharDataTable<super::SentenceBreak> =
254 include!("../tables/sentence_break.rsv");
255}
256
257impl SentenceBreak {
258 /// Find the character `Sentence_Break` property value.
259 pub fn of(ch: char) -> SentenceBreak {
260 data::SENTENCE_BREAK_TABLE.find_or_default(ch)
261 }
262}
263
#[cfg(test)]
mod tests {
    use super::SentenceBreak as SB;
    use unic_char_property::EnumeratedCharProperty;

    /// Spot checks over the ASCII range.
    #[test]
    fn test_ascii() {
        assert_eq!(SB::of('\u{0000}'), SB::Other);
        assert_eq!(SB::of('\u{0040}'), SB::Other);
        assert_eq!(SB::of('\u{0041}'), SB::Upper);
        assert_eq!(SB::of('\u{0062}'), SB::Lower);
        assert_eq!(SB::of('\u{007F}'), SB::Other);

        // Code points listed explicitly in the documentation of the
        // corresponding `SentenceBreak` variants.
        assert_eq!(SB::of('\u{000A}'), SB::LF);
        assert_eq!(SB::of('\u{000D}'), SB::CR);
        assert_eq!(SB::of('\u{002C}'), SB::SContinue);
        assert_eq!(SB::of('\u{002E}'), SB::ATerm);
    }

    /// Spot checks in the Basic Multilingual Plane, mostly at the edges of
    /// assigned ranges.
    #[test]
    fn test_bmp() {
        // Hebrew
        assert_eq!(SB::of('\u{0590}'), SB::Other);
        assert_eq!(SB::of('\u{05D0}'), SB::OLetter);
        assert_eq!(SB::of('\u{05D1}'), SB::OLetter);
        assert_eq!(SB::of('\u{05FF}'), SB::Other);

        // Arabic
        assert_eq!(SB::of('\u{0600}'), SB::Format);
        assert_eq!(SB::of('\u{0627}'), SB::OLetter);
        assert_eq!(SB::of('\u{07BF}'), SB::Other);

        // U+07C0..U+08FF
        assert_eq!(SB::of('\u{07C0}'), SB::Numeric);
        assert_eq!(SB::of('\u{085F}'), SB::Other);
        assert_eq!(SB::of('\u{0860}'), SB::OLetter);
        assert_eq!(SB::of('\u{0870}'), SB::Other);
        assert_eq!(SB::of('\u{089F}'), SB::Other);
        assert_eq!(SB::of('\u{08A0}'), SB::OLetter);
        assert_eq!(SB::of('\u{08FF}'), SB::Extend);

        // U+20A0..U+20CF
        assert_eq!(SB::of('\u{20A0}'), SB::Other);
        assert_eq!(SB::of('\u{20CF}'), SB::Other);

        // Presentation forms (U+FB1D..U+FEFF)
        assert_eq!(SB::of('\u{FB1D}'), SB::OLetter);
        assert_eq!(SB::of('\u{FB4F}'), SB::OLetter);
        assert_eq!(SB::of('\u{FB50}'), SB::OLetter);
        assert_eq!(SB::of('\u{FDCF}'), SB::Other);
        assert_eq!(SB::of('\u{FDF0}'), SB::OLetter);
        assert_eq!(SB::of('\u{FDFF}'), SB::Other);
        assert_eq!(SB::of('\u{FE70}'), SB::OLetter);
        assert_eq!(SB::of('\u{FEFE}'), SB::Other);
        assert_eq!(SB::of('\u{FEFF}'), SB::Format);

        // noncharacters
        assert_eq!(SB::of('\u{FDD0}'), SB::Other);
        assert_eq!(SB::of('\u{FDD1}'), SB::Other);
        assert_eq!(SB::of('\u{FDEE}'), SB::Other);
        assert_eq!(SB::of('\u{FDEF}'), SB::Other);
        assert_eq!(SB::of('\u{FFFE}'), SB::Other);
        assert_eq!(SB::of('\u{FFFF}'), SB::Other);
    }

    /// Spot checks in the Supplementary Multilingual Plane.
    #[test]
    fn test_smp() {
        // U+10800..U+10FFF and U+1E800..U+1EFFF
        assert_eq!(SB::of('\u{10800}'), SB::OLetter);
        assert_eq!(SB::of('\u{10FFF}'), SB::Other);
        assert_eq!(SB::of('\u{1E800}'), SB::OLetter);
        assert_eq!(SB::of('\u{1EDFF}'), SB::Other);
        assert_eq!(SB::of('\u{1EE00}'), SB::OLetter);
        assert_eq!(SB::of('\u{1EEFF}'), SB::Other);
        assert_eq!(SB::of('\u{1EF00}'), SB::Other);
        assert_eq!(SB::of('\u{1EFFF}'), SB::Other);
    }

    /// The first code point of each of planes 3 through 10 must fall back to
    /// `Other`.
    #[test]
    fn test_unassigned_planes() {
        assert_eq!(SB::of('\u{30000}'), SB::Other);
        assert_eq!(SB::of('\u{40000}'), SB::Other);
        assert_eq!(SB::of('\u{50000}'), SB::Other);
        assert_eq!(SB::of('\u{60000}'), SB::Other);
        assert_eq!(SB::of('\u{70000}'), SB::Other);
        assert_eq!(SB::of('\u{80000}'), SB::Other);
        assert_eq!(SB::of('\u{90000}'), SB::Other);
        assert_eq!(SB::of('\u{a0000}'), SB::Other);
    }

    #[test]
    fn test_abbr_name() {
        assert_eq!(SB::CR.abbr_name(), "CR");
        // `CR` has identical abbreviated and long names; `OLetter` does not,
        // so it can catch the two mappings being mixed up.
        assert_eq!(SB::OLetter.abbr_name(), "LE");
    }

    #[test]
    fn test_long_name() {
        assert_eq!(SB::CR.long_name(), "CR");
        assert_eq!(SB::OLetter.long_name(), "OLetter");
    }

    #[test]
    fn test_human_name() {
        assert_eq!(SB::CR.human_name(), "Carriage Return");
        assert_eq!(SB::OLetter.human_name(), "Other Letter");
    }
}