unic_segment/word.rs
1// Copyright 2012-2015 The Rust Project Developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12//! Unicode Words of a string.
13//!
14//! ## References
15//!
16//! * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
17
18use std::cmp;
19use std::iter::Filter;
20
21use unic_ucd_segment::WordBreak as WB;
22
23/// An iterator over the substrings of a string which, after splitting the string on [word
24/// boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries), contain any characters with
25/// the [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) property, or with
26/// [`General_Category=Number`](http://unicode.org/reports/tr44/#General_Category_Values).
27#[derive(Debug)]
28pub struct Words<'a> {
29 inner: Filter<WordBounds<'a>, fn(&&str) -> bool>,
30}
31
32impl<'a> Iterator for Words<'a> {
33 type Item = &'a str;
34
35 #[inline]
36 fn next(&mut self) -> Option<&'a str> {
37 self.inner.next()
38 }
39}
40
41impl<'a> DoubleEndedIterator for Words<'a> {
42 #[inline]
43 fn next_back(&mut self) -> Option<&'a str> {
44 self.inner.next_back()
45 }
46}
47
48impl<'a> Words<'a> {
49 /// Create new iterator for *words*.
50 #[inline]
51 pub fn new(s: &str, filter: fn(&&str) -> bool) -> Words<'_> {
52 Words {
53 inner: WordBounds::new(s).filter(filter),
54 }
55 }
56}
57
58/// External iterator for a string's
59/// [word boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries).
60#[derive(Clone, Debug)]
61pub struct WordBounds<'a> {
62 string: &'a str,
63 cat: Option<WB>,
64 catb: Option<WB>,
65}
66
67/// External iterator for word boundaries and byte offsets.
68#[derive(Clone, Debug)]
69pub struct WordBoundIndices<'a> {
70 start_offset: usize,
71 iter: WordBounds<'a>,
72}
73
74impl<'a> WordBoundIndices<'a> {
75 /// Create new iterator for *word boundries and their indices*.
76 #[inline]
77 pub fn new(s: &str) -> WordBoundIndices<'_> {
78 WordBoundIndices {
79 start_offset: s.as_ptr() as usize,
80 iter: WordBounds::new(s),
81 }
82 }
83
84 #[inline]
85 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
86 ///
87 /// ```rust
88 /// # use unic_segment::WordBoundIndices;
89 /// let mut iter = WordBoundIndices::new("Hello world");
90 /// assert_eq!(iter.as_str(), "Hello world");
91 ///
92 /// iter.next();
93 /// assert_eq!(iter.as_str(), " world");
94 ///
95 /// iter.next();
96 /// assert_eq!(iter.as_str(), "world");
97 /// ```
98 pub fn as_str(&self) -> &'a str {
99 self.iter.as_str()
100 }
101}
102
103impl<'a> Iterator for WordBoundIndices<'a> {
104 type Item = (usize, &'a str);
105
106 #[inline]
107 fn next(&mut self) -> Option<(usize, &'a str)> {
108 self.iter
109 .next()
110 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
111 }
112
113 #[inline]
114 fn size_hint(&self) -> (usize, Option<usize>) {
115 self.iter.size_hint()
116 }
117}
118
119impl<'a> DoubleEndedIterator for WordBoundIndices<'a> {
120 #[inline]
121 fn next_back(&mut self) -> Option<(usize, &'a str)> {
122 self.iter
123 .next_back()
124 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
125 }
126}
127
128// state machine for word boundary rules
129#[derive(Clone, Copy, Debug, Eq, PartialEq)]
130enum WordBoundsState {
131 Start,
132 Letter,
133 HLetter,
134 Numeric,
135 Katakana,
136 ExtendNumLet,
137 Regional(RegionalState),
138 FormatExtend(FormatExtendType),
139 Zwj,
140 Emoji,
141}
142
143// subtypes for FormatExtend state in WordBoundsState
144#[derive(Clone, Copy, Debug, Eq, PartialEq)]
145enum FormatExtendType {
146 AcceptAny,
147 AcceptNone,
148 RequireLetter,
149 RequireHLetter,
150 AcceptQLetter,
151 RequireNumeric,
152}
153
154#[derive(Clone, Copy, Debug, Eq, PartialEq)]
155enum RegionalState {
156 Half,
157 Full,
158 Unknown,
159}
160
161impl<'a> Iterator for WordBounds<'a> {
162 type Item = &'a str;
163
164 #[inline]
165 fn size_hint(&self) -> (usize, Option<usize>) {
166 let slen = self.string.len();
167 (cmp::min(slen, 1), Some(slen))
168 }
169
170 #[inline]
171 #[cfg_attr(feature = "cargo-clippy", allow(match_same_arms))]
172 fn next(&mut self) -> Option<&'a str> {
173 use self::FormatExtendType::*;
174 use self::WordBoundsState::*;
175
176 if self.string.is_empty() {
177 return None;
178 }
179
180 let mut take_curr = true;
181 let mut take_cat = true;
182 let mut idx = 0;
183 let mut saveidx = 0;
184 let mut state = Start;
185 let mut cat = WB::Other;
186 let mut savecat = WB::Other;
187
188 // Whether or not the previous category was ZWJ
189 // ZWJs get collapsed, so this handles precedence of WB3c over WB4
190 let mut prev_zwj;
191 for (curr, ch) in self.string.char_indices() {
192 idx = curr;
193 prev_zwj = cat == WB::ZWJ;
194 // if there's a category cached, grab it
195 cat = match self.cat {
196 None => WB::of(ch),
197 _ => self.cat.take().unwrap(),
198 };
199 take_cat = true;
200
201 // handle rule WB4
202 // just skip all format, extend, and zwj chars
203 // note that Start is a special case: if there's a bunch of Format | Extend
204 // characters at the beginning of a block of text, dump them out as one unit.
205 //
206 // (This is not obvious from the wording of UAX#29, but if you look at the
207 // test cases https://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
208 // then the "correct" interpretation of WB4 becomes apparent.)
209 if state != Start {
210 match cat {
211 WB::Extend | WB::Format | WB::ZWJ => continue,
212 _ => {}
213 }
214 }
215
216 // rule WB3c
217 // WB4 makes all ZWJs collapse into the previous state
218 // but you can still be in a Zwj state if you started with Zwj
219 //
220 // This means that Zwj + Extend will collapse into Zwj, which is wrong,
221 // since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
222 // and that rule (WB3c) has higher priority
223 //
224 // Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
225 // which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
226 //
227 // Thus, we separately keep track of whether or not the last character
228 // was a ZWJ. This is an additional bit of state tracked outside of the
229 // state enum; the state enum represents the last non-zwj state encountered.
230 // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
231 // however we are in the previous state for the purposes of all other rules.
232 if prev_zwj {
233 match cat {
234 WB::GlueAfterZwj => continue,
235 WB::EBaseGAZ => {
236 state = Emoji;
237 continue;
238 }
239 _ => (),
240 }
241 }
242 // Don't use `continue` in this match without updating `cat`
243 state = match state {
244 Start if cat == WB::CR => {
245 idx += match self.get_next_cat(idx) {
246 Some(ncat) if ncat == WB::LF => 1, // rule WB3
247 _ => 0,
248 };
249 break; // rule WB3a
250 }
251 Start => match cat {
252 WB::ALetter => Letter, // rule WB5, WB6, WB9, WB13a
253 WB::HebrewLetter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
254 WB::Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
255 WB::Katakana => Katakana, // rule WB13, WB13a
256 WB::ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
257 WB::RegionalIndicator => Regional(RegionalState::Half), // rule WB13c
258 WB::LF | WB::Newline => break, // rule WB3a
259 WB::ZWJ => Zwj, // rule WB3c
260 WB::EBase | WB::EBaseGAZ => Emoji, // rule WB14
261 _ => {
262 if let Some(ncat) = self.get_next_cat(idx) {
263 // rule WB4
264 if ncat == WB::Format || ncat == WB::Extend || ncat == WB::ZWJ {
265 state = FormatExtend(AcceptNone);
266 self.cat = Some(ncat);
267 continue;
268 }
269 }
270 break; // rule WB999
271 }
272 },
273 Zwj => {
274 // We already handle WB3c above. At this point,
275 // the current category is not GAZ or EBG,
276 // or the previous character was not actually a ZWJ
277 take_curr = false;
278 break;
279 }
280 Letter | HLetter => match cat {
281 WB::ALetter => Letter, // rule WB5
282 WB::HebrewLetter => HLetter, // rule WB5
283 WB::Numeric => Numeric, // rule WB9
284 WB::ExtendNumLet => ExtendNumLet, // rule WB13a
285 WB::DoubleQuote if state == HLetter => {
286 savecat = cat;
287 saveidx = idx;
288 FormatExtend(RequireHLetter) // rule WB7b
289 }
290 WB::SingleQuote if state == HLetter => {
291 FormatExtend(AcceptQLetter) // rule WB7a
292 }
293 WB::MidLetter | WB::MidNumLet | WB::SingleQuote => {
294 savecat = cat;
295 saveidx = idx;
296 FormatExtend(RequireLetter) // rule WB6
297 }
298 _ => {
299 take_curr = false;
300 break;
301 }
302 },
303 Numeric => match cat {
304 WB::Numeric => Numeric, // rule WB8
305 WB::ALetter => Letter, // rule WB10
306 WB::HebrewLetter => HLetter, // rule WB10
307 WB::ExtendNumLet => ExtendNumLet, // rule WB13a
308 WB::MidNum | WB::MidNumLet | WB::SingleQuote => {
309 savecat = cat;
310 saveidx = idx;
311 FormatExtend(RequireNumeric) // rule WB12
312 }
313 _ => {
314 take_curr = false;
315 break;
316 }
317 },
318 Katakana => match cat {
319 WB::Katakana => Katakana, // rule WB13
320 WB::ExtendNumLet => ExtendNumLet, // rule WB13a
321 _ => {
322 take_curr = false;
323 break;
324 }
325 },
326 ExtendNumLet => match cat {
327 WB::ExtendNumLet => ExtendNumLet, // rule WB13a
328 WB::ALetter => Letter, // rule WB13b
329 WB::HebrewLetter => HLetter, // rule WB13b
330 WB::Numeric => Numeric, // rule WB13b
331 WB::Katakana => Katakana, // rule WB13b
332 _ => {
333 take_curr = false;
334 break;
335 }
336 },
337 Regional(RegionalState::Full) => {
338 // if it reaches here we've gone too far,
339 // a full flag can only compose with ZWJ/Extend/Format
340 // proceeding it.
341 take_curr = false;
342 break;
343 }
344 Regional(RegionalState::Half) => match cat {
345 WB::RegionalIndicator => Regional(RegionalState::Full), // rule WB13c
346 _ => {
347 take_curr = false;
348 break;
349 }
350 },
351 Regional(_) => {
352 unreachable!("RegionalState::Unknown should not occur on forward iteration")
353 }
354 Emoji => match cat {
355 // rule WB14
356 WB::EModifier => state,
357 _ => {
358 take_curr = false;
359 break;
360 }
361 },
362 FormatExtend(t) => match t {
363 // handle FormatExtends depending on what type
364 RequireNumeric if cat == WB::Numeric => Numeric, // rule WB11
365 RequireLetter | AcceptQLetter if cat == WB::ALetter => Letter, // rule WB7
366 RequireLetter | AcceptQLetter if cat == WB::HebrewLetter => HLetter, // WB7a
367 RequireHLetter if cat == WB::HebrewLetter => HLetter, // rule WB7b
368 AcceptNone | AcceptQLetter => {
369 take_curr = false; // emit all the Format|Extend characters
370 take_cat = false;
371 break;
372 }
373 _ => break, // rewind (in if statement below)
374 },
375 }
376 }
377
378 if let FormatExtend(t) = state {
379 // we were looking for something and didn't find it; we have to back up
380 if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
381 idx = saveidx;
382 cat = savecat;
383 take_curr = false;
384 }
385 }
386
387 self.cat = if take_curr {
388 idx += self.string[idx..].chars().next().unwrap().len_utf8();
389 None
390 } else if take_cat {
391 Some(cat)
392 } else {
393 None
394 };
395
396 let retstr = &self.string[..idx];
397 self.string = &self.string[idx..];
398 Some(retstr)
399 }
400}
401
402impl<'a> DoubleEndedIterator for WordBounds<'a> {
403 #[inline]
404 #[cfg_attr(feature = "cargo-clippy", allow(cyclomatic_complexity))]
405 fn next_back(&mut self) -> Option<&'a str> {
406 use self::FormatExtendType::*;
407 use self::WordBoundsState::*;
408 if self.string.is_empty() {
409 return None;
410 }
411
412 let mut take_curr = true;
413 let mut take_cat = true;
414 let mut idx = self.string.len();
415 idx -= self.string.chars().next_back().unwrap().len_utf8();
416 let mut previdx = idx;
417 let mut saveidx = idx;
418 let mut state = Start;
419 let mut savestate = Start;
420 let mut cat = WB::Other;
421
422 for (curr, ch) in self.string.char_indices().rev() {
423 previdx = idx;
424 idx = curr;
425
426 // if there's a category cached, grab it
427 cat = match self.catb {
428 None => WB::of(ch),
429 _ => self.catb.take().unwrap(),
430 };
431 take_cat = true;
432
433 // backward iterator over word boundaries. Mostly the same as the forward
434 // iterator, with two weirdnesses:
435 // (1) If we encounter a single quote in the Start state, we have to check for a
436 // Hebrew Letter immediately before it.
437 // (2) Format and Extend char handling takes some gymnastics.
438
439 if cat == WB::Extend || cat == WB::Format || (cat == WB::ZWJ && state != Zwj) {
440 // WB3c has more priority so we should not
441 // fold in that case
442 if match state {
443 FormatExtend(_) | Start => false,
444 _ => true,
445 } {
446 saveidx = previdx;
447 savestate = state;
448 state = FormatExtend(AcceptNone);
449 }
450
451 if state != Start {
452 continue;
453 }
454 } else if state == FormatExtend(AcceptNone) {
455 // finished a scan of some Format|Extend chars, restore previous state
456 state = savestate;
457 previdx = saveidx;
458 take_cat = false;
459 }
460
461 // Don't use `continue` in this match without updating `catb`
462 state = match state {
463 Start | FormatExtend(AcceptAny) => match cat {
464 WB::ALetter => Letter, // rule WB5, WB7, WB10, WB13b
465 WB::HebrewLetter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
466 WB::Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
467 WB::Katakana => Katakana, // rule WB13, WB13b
468 WB::ExtendNumLet => ExtendNumLet, // rule WB13a
469 WB::RegionalIndicator => Regional(RegionalState::Unknown), // rule WB13c
470 WB::GlueAfterZwj | WB::EBaseGAZ => Zwj, // rule WB3c
471 // rule WB4:
472 WB::Extend | WB::Format | WB::ZWJ => FormatExtend(AcceptAny),
473 WB::SingleQuote => {
474 saveidx = idx;
475 FormatExtend(AcceptQLetter) // rule WB7a
476 }
477 WB::EModifier => Emoji, // rule WB14
478 WB::CR | WB::LF | WB::Newline => {
479 if state == Start {
480 if cat == WB::LF {
481 idx -= match self.get_prev_cat(idx) {
482 Some(pcat) if pcat == WB::CR => 1, // rule WB3
483 _ => 0,
484 };
485 }
486 } else {
487 take_curr = false;
488 }
489 break; // rule WB3a
490 }
491 _ => break, // rule WB999
492 },
493 Zwj => match cat {
494 // rule WB3c
495 WB::ZWJ => FormatExtend(AcceptAny),
496 _ => {
497 take_curr = false;
498 break;
499 }
500 },
501 Letter | HLetter => match cat {
502 WB::ALetter => Letter, // rule WB5
503 WB::HebrewLetter => HLetter, // rule WB5
504 WB::Numeric => Numeric, // rule WB10
505 WB::ExtendNumLet => ExtendNumLet, // rule WB13b
506 WB::DoubleQuote if state == HLetter => {
507 saveidx = previdx;
508 FormatExtend(RequireHLetter) // rule WB7c
509 }
510 WB::MidLetter | WB::MidNumLet | WB::SingleQuote => {
511 saveidx = previdx;
512 FormatExtend(RequireLetter) // rule WB7
513 }
514 _ => {
515 take_curr = false;
516 break;
517 }
518 },
519 Numeric => match cat {
520 WB::Numeric => Numeric, // rule WB8
521 WB::ALetter => Letter, // rule WB9
522 WB::HebrewLetter => HLetter, // rule WB9
523 WB::ExtendNumLet => ExtendNumLet, // rule WB13b
524 WB::MidNum | WB::MidNumLet | WB::SingleQuote => {
525 saveidx = previdx;
526 FormatExtend(RequireNumeric) // rule WB11
527 }
528 _ => {
529 take_curr = false;
530 break;
531 }
532 },
533 Katakana => match cat {
534 WB::Katakana => Katakana, // rule WB13
535 WB::ExtendNumLet => ExtendNumLet, // rule WB13b
536 _ => {
537 take_curr = false;
538 break;
539 }
540 },
541 ExtendNumLet => match cat {
542 WB::ExtendNumLet => ExtendNumLet, // rule WB13a
543 WB::ALetter => Letter, // rule WB13a
544 WB::HebrewLetter => HLetter, // rule WB13a
545 WB::Numeric => Numeric, // rule WB13a
546 WB::Katakana => Katakana, // rule WB13a
547 _ => {
548 take_curr = false;
549 break;
550 }
551 },
552 Regional(mut regional_state) => match cat {
553 // rule WB13c
554 WB::RegionalIndicator => {
555 if regional_state == RegionalState::Unknown {
556 let count = self.string[..previdx]
557 .chars()
558 .rev()
559 .map(WB::of)
560 .filter(|&c| !(c == WB::ZWJ || c == WB::Extend || c == WB::Format))
561 .take_while(|&c| c == WB::RegionalIndicator)
562 .count();
563 regional_state = if count % 2 == 0 {
564 RegionalState::Full
565 } else {
566 RegionalState::Half
567 };
568 }
569 if regional_state == RegionalState::Full {
570 take_curr = false;
571 break;
572 } else {
573 Regional(RegionalState::Full)
574 }
575 }
576 _ => {
577 take_curr = false;
578 break;
579 }
580 },
581 Emoji => match cat {
582 // rule WB14
583 WB::EBase | WB::EBaseGAZ => Zwj,
584 _ => {
585 take_curr = false;
586 break;
587 }
588 },
589 FormatExtend(t) => match t {
590 RequireNumeric if cat == WB::Numeric => Numeric, // rule WB12
591 RequireLetter if cat == WB::ALetter => Letter, // rule WB6
592 RequireLetter if cat == WB::HebrewLetter => HLetter, // rule WB6
593 AcceptQLetter if cat == WB::HebrewLetter => HLetter, // rule WB7a
594 RequireHLetter if cat == WB::HebrewLetter => HLetter, // rule WB7b
595 _ => break, // backtrack will happens
596 },
597 }
598 }
599
600 if let FormatExtend(t) = state {
601 // if we required something but didn't find it, backtrack
602 if t == RequireLetter
603 || t == RequireHLetter
604 || t == RequireNumeric
605 || t == AcceptNone
606 || t == AcceptQLetter
607 {
608 previdx = saveidx;
609 take_cat = false;
610 take_curr = false;
611 }
612 }
613
614 self.catb = if take_curr {
615 None
616 } else {
617 idx = previdx;
618 if take_cat {
619 Some(cat)
620 } else {
621 None
622 }
623 };
624
625 let retstr = &self.string[idx..];
626 self.string = &self.string[..idx];
627 Some(retstr)
628 }
629}
630
631impl<'a> WordBounds<'a> {
632 /// Create new iterator for *word boundries*.
633 #[inline]
634 pub fn new(s: &str) -> WordBounds<'_> {
635 WordBounds {
636 string: s,
637 cat: None,
638 catb: None,
639 }
640 }
641
642 #[inline]
643 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
644 ///
645 /// ```rust
646 /// # use unic_segment::WordBounds;
647 /// let mut iter = WordBounds::new("Hello world");
648 /// assert_eq!(iter.as_str(), "Hello world");
649 ///
650 /// iter.next();
651 /// assert_eq!(iter.as_str(), " world");
652 ///
653 /// iter.next();
654 /// assert_eq!(iter.as_str(), "world");
655 /// ```
656 pub fn as_str(&self) -> &'a str {
657 self.string
658 }
659
660 #[inline]
661 fn get_next_cat(&self, idx: usize) -> Option<WB> {
662 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
663 if nidx < self.string.len() {
664 let nch = self.string[nidx..].chars().next().unwrap();
665 Some(WB::of(nch))
666 } else {
667 None
668 }
669 }
670
671 #[inline]
672 fn get_prev_cat(&self, idx: usize) -> Option<WB> {
673 if idx > 0 {
674 let nch = self.string[..idx].chars().next_back().unwrap();
675 Some(WB::of(nch))
676 } else {
677 None
678 }
679 }
680}
681
682#[cfg(test)]
683mod tests {
684 use super::{WordBounds, Words};
685 use unic_ucd_common::is_alphanumeric;
686
687 #[test]
688 fn test_word_bounds() {
689 assert_eq!(
690 WordBounds::new("The quick (\"brown\") fox").collect::<Vec<&str>>(),
691 &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
692 );
693 }
694
695 #[test]
696 fn test_words() {
697 assert_eq!(
698 Words::new(
699 "The quick (\"brown\") fox can't jump 32.3 feet, right?",
700 |s: &&str| s.chars().any(is_alphanumeric),
701 )
702 .collect::<Vec<&str>>(),
703 &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
704 );
705 }
706}