unic_ucd_segment/word_break.rs
1// Copyright 2017 The UNIC Project Developers.
2//
3// See the COPYRIGHT file at the top-level directory of this distribution.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode `Word_Break` Character Property.
12//!
13//! ## References
14//!
15//! * <https://www.unicode.org/reports/tr44/#Word_Break>
16//! * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
17//! * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
18
19use unic_char_property::TotalCharProperty;
20
21char_property! {
22 /// Represents the Unicode character
23 /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
24 /// property.
25 ///
26 /// ## References
27 ///
28 /// * <https://www.unicode.org/reports/tr44/#Word_Break>
29 /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
30 /// * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
31 pub enum WordBreak {
32 abbr => "WB";
33 long => "Word_Break";
34 human => "Word Break";
35
36 /// ```text
37 /// U+000D CARRIAGE RETURN (CR)
38 /// ```
39 CR {
40 abbr => CR,
41 long => CR,
42 human => "Carriage Return",
43 }
44
45 /// ```text
46 /// U+000A LINE FEED (LF)
47 /// ```
48 LF {
49 abbr => LF,
50 long => LF,
51 human => "Line Feed",
52 }
53
54 /// ```text
55 /// U+000B LINE TABULATION
56 /// U+000C FORM FEED (FF)
57 /// U+0085 NEXT LINE (NEL)
58 /// U+2028 LINE SEPARATOR
59 /// U+2029 PARAGRAPH SEPARATOR
60 /// ```
61 Newline {
62 abbr => NL,
63 long => Newline,
64 human => "Newline",
65 }
66
67 /// ```text
68 /// Grapheme_Extend = Yes, or
69 /// General_Category = Spacing_Mark
70 /// and not U+200D ZERO WIDTH JOINER (ZWJ)
71 /// ```
72 Extend {
73 abbr => Extend,
74 long => Extend,
75 human => "Extend",
76 }
77
78 /// ```text
79 /// U+200D ZERO WIDTH JOINER
80 /// ```
81 ZWJ {
82 abbr => ZWJ,
83 long => ZWJ,
84 human => "Zero Width Joiner (ZWJ)",
85 }
86
87 /// ```text
88 /// Regional_Indicator = Yes
89 /// ```
90 ///
91 /// This consists of the range:
92 ///
93 /// ```text
94 /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A
95 /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z
96 /// ```
97 RegionalIndicator {
98 abbr => RI,
99 long => Regional_Indicator,
100 human => "Regional Indicator",
101 }
102
103 /// ```text
104 /// General_Category = Format
105 /// and not U+200B ZERO WIDTH SPACE (ZWSP)
106 /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
107 /// and not U+200D ZERO WIDTH JOINER (ZWJ)
108 /// ```
109 Format {
110 abbr => FO,
111 long => Format,
112 human => "Format",
113 }
114
115 /// ```text
116 /// Script = KATAKANA, or
117 /// any of the following:
118 /// U+3031 ( 〱 ) VERTICAL KANA REPEAT MARK
119 /// U+3032 ( 〲 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK
120 /// U+3033 ( 〳 ) VERTICAL KANA REPEAT MARK UPPER HALF
121 /// U+3034 ( 〴 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
122 /// U+3035 ( 〵 ) VERTICAL KANA REPEAT MARK LOWER HALF
123 /// U+309B ( ゛ ) KATAKANA-HIRAGANA VOICED SOUND MARK
124 /// U+309C ( ゜ ) KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
125 /// U+30A0 ( ゠ ) KATAKANA-HIRAGANA DOUBLE HYPHEN
126 /// U+30FC ( ー ) KATAKANA-HIRAGANA PROLONGED SOUND MARK
127 /// U+FF70 ( ー ) HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
128 /// ```
129 Katakana {
130 abbr => KA,
131 long => Katakana,
132 human => "Katakana",
133 }
134
135 /// ```text
136 /// Script = Hebrew
137 /// and General_Category = Other_Letter
138 /// ```
139 HebrewLetter {
140 abbr => HL,
141 long => Hebrew_Letter,
142 human => "Hebrew Letter",
143 }
144
145 /// ```text
146 /// Alphabetic = Yes, or
147 /// any of the following 36 characters:
148 /// U+02C2 ( ˂ ) MODIFIER LETTER LEFT ARROWHEAD
149 /// ..U+02C5 ( ˅ ) MODIFIER LETTER DOWN ARROWHEAD
150 /// U+02D2 ( ˒ ) MODIFIER LETTER CENTRED RIGHT HALF RING
151 /// ..U+02D7 ( ˗ ) MODIFIER LETTER MINUS SIGN
152 /// U+02DE ( ˞ ) MODIFIER LETTER RHOTIC HOOK
153 /// U+02DF ( ˟ ) MODIFIER LETTER CROSS ACCENT
154 /// U+02ED ( ˭ ) MODIFIER LETTER UNASPIRATED
155 /// U+02EF ( ˯ ) MODIFIER LETTER LOW DOWN ARROWHEAD
156 /// ..U+02FF ( ˿ ) MODIFIER LETTER LOW LEFT ARROW
157 /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
158 /// U+A720 ( ꜠ ) MODIFIER LETTER STRESS AND HIGH TONE
159 /// U+A721 ( ꜡ ) MODIFIER LETTER STRESS AND LOW TONE
160 /// U+A789 ( ꞉ ) MODIFIER LETTER COLON
161 /// U+A78A ( ꞊ ) MODIFIER LETTER SHORT EQUALS SIGN
162 /// U+AB5B ( ꭛ ) MODIFIER BREVE WITH INVERTED BREVE
163 /// and Ideographic = No
164 /// and Word_Break ≠ Katakana
165 /// and Line_Break ≠ Complex_Context (SA)
166 /// and Script ≠ Hiragana
167 /// and Word_Break ≠ Extend
168 /// and Word_Break ≠ Hebrew_Letter
169 /// ```
170 ALetter {
171 abbr => LE,
172 long => ALetter,
173 human => "Alphabetic Letter",
174 }
175
176 /// ```text
177 /// U+0027 ( ' ) APOSTROPHE
178 /// ```
179 SingleQuote {
180 abbr => SQ,
181 long => Single_Quote,
182 human => "Single Quote",
183 }
184
185 /// ```text
186 /// U+0022 ( " ) QUOTATION MARK
187 /// ```
188 DoubleQuote {
189 abbr => DQ,
190 long => Double_Quote,
191 human => "Double Quote",
192 }
193
194 /// ```text
195 /// U+002E ( . ) FULL STOP
196 /// U+2018 ( ‘ ) LEFT SINGLE QUOTATION MARK
197 /// U+2019 ( ’ ) RIGHT SINGLE QUOTATION MARK
198 /// U+2024 ( ․ ) ONE DOT LEADER
199 /// U+FE52 ( ﹒ ) SMALL FULL STOP
200 /// U+FF07 ( ' ) FULLWIDTH APOSTROPHE
201 /// U+FF0E ( . ) FULLWIDTH FULL STOP
202 /// ```
203 MidNumLet {
204 abbr => MB,
205 long => MidNumLet,
206 human => "Middle of Numeric/Letter",
207 }
208
209 /// ```text
210 /// U+00B7 ( · ) MIDDLE DOT
211 /// U+0387 ( · ) GREEK ANO TELEIA
212 /// U+05F4 ( ״ ) HEBREW PUNCTUATION GERSHAYIM
213 /// U+2027 ( ‧ ) HYPHENATION POINT
214 /// U+003A ( : ) COLON (used in Swedish)
215 /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
216 /// U+FE55 ( ﹕ ) SMALL COLON
217 /// U+FF1A ( : ) FULLWIDTH COLON
218 /// ```
219 MidLetter {
220 abbr => ML,
221 long => MidLetter,
222 human => "Middle of Letter",
223 }
224
225 /// ```text
226 /// Line_Break = Infix_Numeric, or
227 /// any of the following:
228 /// U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR
229 /// U+FE50 ( ﹐ ) SMALL COMMA
230 /// U+FE54 ( ﹔ ) SMALL SEMICOLON
231 /// U+FF0C ( , ) FULLWIDTH COMMA
232 /// U+FF1B ( ; ) FULLWIDTH SEMICOLON
233 /// and not U+003A ( : ) COLON
234 /// and not U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
235 /// and not U+002E ( . ) FULL STOP
236 /// ```
237 MidNum {
238 abbr => MN,
239 long => MidNum,
240 human => "Middle of Numeric",
241 }
242
243 /// ```text
244 /// Line_Break = Numeric
245 /// and not U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR
246 /// ```
247 Numeric {
248 abbr => NU,
249 long => Numeric,
250 human => "Numeric",
251 }
252
253 /// ```text
254 /// General_Category = Connector_Punctuation, or
255 /// U+202F NARROW NO-BREAK SPACE (NNBSP)
256 /// ```
257 ExtendNumLet {
258 abbr => EX,
259 long => ExtendNumLet,
260 human => "Extend Numeric/Letter",
261 }
262
263 // Emoji
264
265 /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not
266 /// occur after ZWJ in `emoji-zwj-sequences.txt`.
267 ///
268 /// See <https://www.unicode.org/reports/tr51/>.
269 EBase {
270 abbr => EB,
271 long => E_Base,
272 human => "Emoji Base",
273 }
274
275 /// Emoji characters listed as `Emoji_Modifer=Yes` in `emoji-data.txt`.
276 ///
277 /// See <https://www.unicode.org/reports/tr51/>.
278 EModifier {
279 abbr => EM,
280 long => E_Modifier,
281 human => "Emoji Modifier",
282 }
283
284 /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence,
285 /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`.
286 ///
287 /// See <https://www.unicode.org/reports/tr51/>.
288 GlueAfterZwj {
289 abbr => GAZ,
290 long => Glue_After_Zwj,
291 human => "Glue After ZWJ",
292 }
293
294 /// Emoji characters listed as `Emoji_Modifer_Base=Yes` in `emoji_data.txt`, and also occur
295 /// after ZWJ in `emoji-zwj-sequences.txt`.
296 ///
297 /// See <https://www.unicode.org/reports/tr51/>.
298 EBaseGAZ {
299 abbr => EBG,
300 long => E_Base_GAZ,
301 human => "Emoji Base and Glue After ZWJ",
302 }
303
304 /// All other characters
305 Other {
306 abbr => XX,
307 long => Other,
308 human => "Other",
309 }
310 }
311
312 /// Abbreviated name aliases for the
313 /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
314 /// property.
315 ///
316 /// ## See Also
317 ///
318 /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
319 pub mod abbr_names for abbr;
320
321 /// Long name aliases for the
322 /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
323 /// property.
324 ///
325 /// ## See Also
326 ///
327 /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
328 pub mod long_names for long;
329}
330
331impl TotalCharProperty for WordBreak {
332 fn of(ch: char) -> Self {
333 Self::of(ch)
334 }
335}
336
337impl Default for WordBreak {
338 fn default() -> Self {
339 WordBreak::Other
340 }
341}
342
343mod data {
344 use super::long_names as WB;
345 use unic_char_property::tables::CharDataTable;
346 pub const WORD_BREAK_TABLE: CharDataTable<super::WordBreak> =
347 include!("../tables/word_break.rsv");
348}
349
350impl WordBreak {
351 /// Find the character `Word_Break` property value.
352 pub fn of(ch: char) -> WordBreak {
353 data::WORD_BREAK_TABLE.find_or_default(ch)
354 }
355}
356
#[cfg(test)]
mod tests {
    use super::WordBreak as WB;
    use unic_char_property::EnumeratedCharProperty;

    /// Spot checks in the ASCII range, including its first and last code points.
    #[test]
    fn test_ascii() {
        assert_eq!(WB::of('\u{0000}'), WB::Other);
        assert_eq!(WB::of('\u{0040}'), WB::Other);
        assert_eq!(WB::of('\u{0041}'), WB::ALetter);
        assert_eq!(WB::of('\u{0062}'), WB::ALetter);
        assert_eq!(WB::of('\u{007F}'), WB::Other);
    }

    /// Spot checks across the Basic Multilingual Plane.
    #[test]
    fn test_bmp() {
        // Hebrew block (U+0590..U+05FF), including both edges
        assert_eq!(WB::of('\u{0590}'), WB::Other);
        assert_eq!(WB::of('\u{05D0}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{05D1}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{05FF}'), WB::Other);

        // U+0600..U+07BF (Arabic through Thaana)
        assert_eq!(WB::of('\u{0600}'), WB::Format);
        assert_eq!(WB::of('\u{0627}'), WB::ALetter);
        assert_eq!(WB::of('\u{07BF}'), WB::Other);

        // U+07C0..U+08FF (NKo through Arabic Extended-A)
        assert_eq!(WB::of('\u{07C0}'), WB::Numeric);
        assert_eq!(WB::of('\u{085F}'), WB::Other);
        assert_eq!(WB::of('\u{0860}'), WB::ALetter);
        assert_eq!(WB::of('\u{0870}'), WB::Other);
        assert_eq!(WB::of('\u{089F}'), WB::Other);
        assert_eq!(WB::of('\u{08A0}'), WB::ALetter);
        assert_eq!(WB::of('\u{08FF}'), WB::Extend);

        // Currency Symbols block edges (U+20A0, U+20CF)
        assert_eq!(WB::of('\u{20A0}'), WB::Other);
        assert_eq!(WB::of('\u{20CF}'), WB::Other);

        // Hebrew and Arabic presentation forms (U+FB1D..U+FEFF)
        assert_eq!(WB::of('\u{FB1D}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{FB4F}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{FB50}'), WB::ALetter);
        assert_eq!(WB::of('\u{FDCF}'), WB::Other);
        assert_eq!(WB::of('\u{FDF0}'), WB::ALetter);
        assert_eq!(WB::of('\u{FDFF}'), WB::Other);
        assert_eq!(WB::of('\u{FE70}'), WB::ALetter);
        assert_eq!(WB::of('\u{FEFE}'), WB::Other);
        assert_eq!(WB::of('\u{FEFF}'), WB::Format);

        // Noncharacters (U+FDD0..U+FDEF, U+FFFE, U+FFFF)
        assert_eq!(WB::of('\u{FDD0}'), WB::Other);
        assert_eq!(WB::of('\u{FDD1}'), WB::Other);
        assert_eq!(WB::of('\u{FDEE}'), WB::Other);
        assert_eq!(WB::of('\u{FDEF}'), WB::Other);
        assert_eq!(WB::of('\u{FFFE}'), WB::Other);
        assert_eq!(WB::of('\u{FFFF}'), WB::Other);
    }

    /// Spot checks in the Supplementary Multilingual Plane.
    #[test]
    fn test_smp() {
        // Edges of U+10800..U+10FFF and of sub-ranges of U+1E800..U+1EFFF
        assert_eq!(WB::of('\u{10800}'), WB::ALetter);
        assert_eq!(WB::of('\u{10FFF}'), WB::Other);
        assert_eq!(WB::of('\u{1E800}'), WB::ALetter);
        assert_eq!(WB::of('\u{1EDFF}'), WB::Other);
        assert_eq!(WB::of('\u{1EE00}'), WB::ALetter);
        assert_eq!(WB::of('\u{1EEFF}'), WB::Other);
        assert_eq!(WB::of('\u{1EF00}'), WB::Other);
        assert_eq!(WB::of('\u{1EFFF}'), WB::Other);
    }

    /// The first code point of each of planes 3 through 10 is expected to be `Other`.
    #[test]
    fn test_unassigned_planes() {
        assert_eq!(WB::of('\u{30000}'), WB::Other);
        assert_eq!(WB::of('\u{40000}'), WB::Other);
        assert_eq!(WB::of('\u{50000}'), WB::Other);
        assert_eq!(WB::of('\u{60000}'), WB::Other);
        assert_eq!(WB::of('\u{70000}'), WB::Other);
        assert_eq!(WB::of('\u{80000}'), WB::Other);
        assert_eq!(WB::of('\u{90000}'), WB::Other);
        assert_eq!(WB::of('\u{a0000}'), WB::Other);
    }

    /// The abbreviated alias of `CR` is `"CR"`.
    #[test]
    fn test_abbr_name() {
        assert_eq!(WB::CR.abbr_name(), "CR");
    }

    /// The long alias of `CR` is `"CR"`.
    #[test]
    fn test_long_name() {
        assert_eq!(WB::CR.long_name(), "CR");
    }

    /// The human-readable name of `CR` is `"Carriage Return"`.
    #[test]
    fn test_human_name() {
        assert_eq!(WB::CR.human_name(), "Carriage Return");
    }
}
457}