unic_segment/
word.rs

1// Copyright 2012-2015 The Rust Project Developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12//! Unicode Words of a string.
13//!
14//! ## References
15//!
16//! * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
17
18use std::cmp;
19use std::iter::Filter;
20
21use unic_ucd_segment::WordBreak as WB;
22
23/// An iterator over the substrings of a string which, after splitting the string on [word
24/// boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries), contain any characters with
25/// the [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) property, or with
26/// [`General_Category=Number`](http://unicode.org/reports/tr44/#General_Category_Values).
27#[derive(Debug)]
28pub struct Words<'a> {
29    inner: Filter<WordBounds<'a>, fn(&&str) -> bool>,
30}
31
32impl<'a> Iterator for Words<'a> {
33    type Item = &'a str;
34
35    #[inline]
36    fn next(&mut self) -> Option<&'a str> {
37        self.inner.next()
38    }
39}
40
41impl<'a> DoubleEndedIterator for Words<'a> {
42    #[inline]
43    fn next_back(&mut self) -> Option<&'a str> {
44        self.inner.next_back()
45    }
46}
47
48impl<'a> Words<'a> {
49    /// Create new iterator for *words*.
50    #[inline]
51    pub fn new(s: &str, filter: fn(&&str) -> bool) -> Words<'_> {
52        Words {
53            inner: WordBounds::new(s).filter(filter),
54        }
55    }
56}
57
58/// External iterator for a string's
59/// [word boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries).
60#[derive(Clone, Debug)]
61pub struct WordBounds<'a> {
62    string: &'a str,
63    cat: Option<WB>,
64    catb: Option<WB>,
65}
66
67/// External iterator for word boundaries and byte offsets.
68#[derive(Clone, Debug)]
69pub struct WordBoundIndices<'a> {
70    start_offset: usize,
71    iter: WordBounds<'a>,
72}
73
74impl<'a> WordBoundIndices<'a> {
75    /// Create new iterator for *word boundries and their indices*.
76    #[inline]
77    pub fn new(s: &str) -> WordBoundIndices<'_> {
78        WordBoundIndices {
79            start_offset: s.as_ptr() as usize,
80            iter: WordBounds::new(s),
81        }
82    }
83
84    #[inline]
85    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
86    ///
87    /// ```rust
88    /// # use unic_segment::WordBoundIndices;
89    /// let mut iter = WordBoundIndices::new("Hello world");
90    /// assert_eq!(iter.as_str(), "Hello world");
91    ///
92    /// iter.next();
93    /// assert_eq!(iter.as_str(), " world");
94    ///
95    /// iter.next();
96    /// assert_eq!(iter.as_str(), "world");
97    /// ```
98    pub fn as_str(&self) -> &'a str {
99        self.iter.as_str()
100    }
101}
102
103impl<'a> Iterator for WordBoundIndices<'a> {
104    type Item = (usize, &'a str);
105
106    #[inline]
107    fn next(&mut self) -> Option<(usize, &'a str)> {
108        self.iter
109            .next()
110            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
111    }
112
113    #[inline]
114    fn size_hint(&self) -> (usize, Option<usize>) {
115        self.iter.size_hint()
116    }
117}
118
119impl<'a> DoubleEndedIterator for WordBoundIndices<'a> {
120    #[inline]
121    fn next_back(&mut self) -> Option<(usize, &'a str)> {
122        self.iter
123            .next_back()
124            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
125    }
126}
127
128// state machine for word boundary rules
129#[derive(Clone, Copy, Debug, Eq, PartialEq)]
130enum WordBoundsState {
131    Start,
132    Letter,
133    HLetter,
134    Numeric,
135    Katakana,
136    ExtendNumLet,
137    Regional(RegionalState),
138    FormatExtend(FormatExtendType),
139    Zwj,
140    Emoji,
141}
142
143// subtypes for FormatExtend state in WordBoundsState
144#[derive(Clone, Copy, Debug, Eq, PartialEq)]
145enum FormatExtendType {
146    AcceptAny,
147    AcceptNone,
148    RequireLetter,
149    RequireHLetter,
150    AcceptQLetter,
151    RequireNumeric,
152}
153
154#[derive(Clone, Copy, Debug, Eq, PartialEq)]
155enum RegionalState {
156    Half,
157    Full,
158    Unknown,
159}
160
161impl<'a> Iterator for WordBounds<'a> {
162    type Item = &'a str;
163
164    #[inline]
165    fn size_hint(&self) -> (usize, Option<usize>) {
166        let slen = self.string.len();
167        (cmp::min(slen, 1), Some(slen))
168    }
169
170    #[inline]
171    #[cfg_attr(feature = "cargo-clippy", allow(match_same_arms))]
172    fn next(&mut self) -> Option<&'a str> {
173        use self::FormatExtendType::*;
174        use self::WordBoundsState::*;
175
176        if self.string.is_empty() {
177            return None;
178        }
179
180        let mut take_curr = true;
181        let mut take_cat = true;
182        let mut idx = 0;
183        let mut saveidx = 0;
184        let mut state = Start;
185        let mut cat = WB::Other;
186        let mut savecat = WB::Other;
187
188        // Whether or not the previous category was ZWJ
189        // ZWJs get collapsed, so this handles precedence of WB3c over WB4
190        let mut prev_zwj;
191        for (curr, ch) in self.string.char_indices() {
192            idx = curr;
193            prev_zwj = cat == WB::ZWJ;
194            // if there's a category cached, grab it
195            cat = match self.cat {
196                None => WB::of(ch),
197                _ => self.cat.take().unwrap(),
198            };
199            take_cat = true;
200
201            // handle rule WB4
202            // just skip all format, extend, and zwj chars
203            // note that Start is a special case: if there's a bunch of Format | Extend
204            // characters at the beginning of a block of text, dump them out as one unit.
205            //
206            // (This is not obvious from the wording of UAX#29, but if you look at the
207            // test cases https://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
208            // then the "correct" interpretation of WB4 becomes apparent.)
209            if state != Start {
210                match cat {
211                    WB::Extend | WB::Format | WB::ZWJ => continue,
212                    _ => {}
213                }
214            }
215
216            // rule WB3c
217            // WB4 makes all ZWJs collapse into the previous state
218            // but you can still be in a Zwj state if you started with Zwj
219            //
220            // This means that Zwj + Extend will collapse into Zwj, which is wrong,
221            // since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
222            // and that rule (WB3c) has higher priority
223            //
224            // Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
225            // which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
226            //
227            // Thus, we separately keep track of whether or not the last character
228            // was a ZWJ. This is an additional bit of state tracked outside of the
229            // state enum; the state enum represents the last non-zwj state encountered.
230            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
231            // however we are in the previous state for the purposes of all other rules.
232            if prev_zwj {
233                match cat {
234                    WB::GlueAfterZwj => continue,
235                    WB::EBaseGAZ => {
236                        state = Emoji;
237                        continue;
238                    }
239                    _ => (),
240                }
241            }
242            // Don't use `continue` in this match without updating `cat`
243            state = match state {
244                Start if cat == WB::CR => {
245                    idx += match self.get_next_cat(idx) {
246                        Some(ncat) if ncat == WB::LF => 1, // rule WB3
247                        _ => 0,
248                    };
249                    break; // rule WB3a
250                }
251                Start => match cat {
252                    WB::ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
253                    WB::HebrewLetter => HLetter,      // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
254                    WB::Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
255                    WB::Katakana => Katakana,         // rule WB13, WB13a
256                    WB::ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
257                    WB::RegionalIndicator => Regional(RegionalState::Half), // rule WB13c
258                    WB::LF | WB::Newline => break,    // rule WB3a
259                    WB::ZWJ => Zwj,                   // rule WB3c
260                    WB::EBase | WB::EBaseGAZ => Emoji, // rule WB14
261                    _ => {
262                        if let Some(ncat) = self.get_next_cat(idx) {
263                            // rule WB4
264                            if ncat == WB::Format || ncat == WB::Extend || ncat == WB::ZWJ {
265                                state = FormatExtend(AcceptNone);
266                                self.cat = Some(ncat);
267                                continue;
268                            }
269                        }
270                        break; // rule WB999
271                    }
272                },
273                Zwj => {
274                    // We already handle WB3c above. At this point,
275                    // the current category is not GAZ or EBG,
276                    // or the previous character was not actually a ZWJ
277                    take_curr = false;
278                    break;
279                }
280                Letter | HLetter => match cat {
281                    WB::ALetter => Letter,            // rule WB5
282                    WB::HebrewLetter => HLetter,      // rule WB5
283                    WB::Numeric => Numeric,           // rule WB9
284                    WB::ExtendNumLet => ExtendNumLet, // rule WB13a
285                    WB::DoubleQuote if state == HLetter => {
286                        savecat = cat;
287                        saveidx = idx;
288                        FormatExtend(RequireHLetter) // rule WB7b
289                    }
290                    WB::SingleQuote if state == HLetter => {
291                        FormatExtend(AcceptQLetter) // rule WB7a
292                    }
293                    WB::MidLetter | WB::MidNumLet | WB::SingleQuote => {
294                        savecat = cat;
295                        saveidx = idx;
296                        FormatExtend(RequireLetter) // rule WB6
297                    }
298                    _ => {
299                        take_curr = false;
300                        break;
301                    }
302                },
303                Numeric => match cat {
304                    WB::Numeric => Numeric,           // rule WB8
305                    WB::ALetter => Letter,            // rule WB10
306                    WB::HebrewLetter => HLetter,      // rule WB10
307                    WB::ExtendNumLet => ExtendNumLet, // rule WB13a
308                    WB::MidNum | WB::MidNumLet | WB::SingleQuote => {
309                        savecat = cat;
310                        saveidx = idx;
311                        FormatExtend(RequireNumeric) // rule WB12
312                    }
313                    _ => {
314                        take_curr = false;
315                        break;
316                    }
317                },
318                Katakana => match cat {
319                    WB::Katakana => Katakana,         // rule WB13
320                    WB::ExtendNumLet => ExtendNumLet, // rule WB13a
321                    _ => {
322                        take_curr = false;
323                        break;
324                    }
325                },
326                ExtendNumLet => match cat {
327                    WB::ExtendNumLet => ExtendNumLet, // rule WB13a
328                    WB::ALetter => Letter,            // rule WB13b
329                    WB::HebrewLetter => HLetter,      // rule WB13b
330                    WB::Numeric => Numeric,           // rule WB13b
331                    WB::Katakana => Katakana,         // rule WB13b
332                    _ => {
333                        take_curr = false;
334                        break;
335                    }
336                },
337                Regional(RegionalState::Full) => {
338                    // if it reaches here we've gone too far,
339                    // a full flag can only compose with ZWJ/Extend/Format
340                    // proceeding it.
341                    take_curr = false;
342                    break;
343                }
344                Regional(RegionalState::Half) => match cat {
345                    WB::RegionalIndicator => Regional(RegionalState::Full), // rule WB13c
346                    _ => {
347                        take_curr = false;
348                        break;
349                    }
350                },
351                Regional(_) => {
352                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
353                }
354                Emoji => match cat {
355                    // rule WB14
356                    WB::EModifier => state,
357                    _ => {
358                        take_curr = false;
359                        break;
360                    }
361                },
362                FormatExtend(t) => match t {
363                    // handle FormatExtends depending on what type
364                    RequireNumeric if cat == WB::Numeric => Numeric, // rule WB11
365                    RequireLetter | AcceptQLetter if cat == WB::ALetter => Letter, // rule WB7
366                    RequireLetter | AcceptQLetter if cat == WB::HebrewLetter => HLetter, // WB7a
367                    RequireHLetter if cat == WB::HebrewLetter => HLetter, // rule WB7b
368                    AcceptNone | AcceptQLetter => {
369                        take_curr = false; // emit all the Format|Extend characters
370                        take_cat = false;
371                        break;
372                    }
373                    _ => break, // rewind (in if statement below)
374                },
375            }
376        }
377
378        if let FormatExtend(t) = state {
379            // we were looking for something and didn't find it; we have to back up
380            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
381                idx = saveidx;
382                cat = savecat;
383                take_curr = false;
384            }
385        }
386
387        self.cat = if take_curr {
388            idx += self.string[idx..].chars().next().unwrap().len_utf8();
389            None
390        } else if take_cat {
391            Some(cat)
392        } else {
393            None
394        };
395
396        let retstr = &self.string[..idx];
397        self.string = &self.string[idx..];
398        Some(retstr)
399    }
400}
401
402impl<'a> DoubleEndedIterator for WordBounds<'a> {
403    #[inline]
404    #[cfg_attr(feature = "cargo-clippy", allow(cyclomatic_complexity))]
405    fn next_back(&mut self) -> Option<&'a str> {
406        use self::FormatExtendType::*;
407        use self::WordBoundsState::*;
408        if self.string.is_empty() {
409            return None;
410        }
411
412        let mut take_curr = true;
413        let mut take_cat = true;
414        let mut idx = self.string.len();
415        idx -= self.string.chars().next_back().unwrap().len_utf8();
416        let mut previdx = idx;
417        let mut saveidx = idx;
418        let mut state = Start;
419        let mut savestate = Start;
420        let mut cat = WB::Other;
421
422        for (curr, ch) in self.string.char_indices().rev() {
423            previdx = idx;
424            idx = curr;
425
426            // if there's a category cached, grab it
427            cat = match self.catb {
428                None => WB::of(ch),
429                _ => self.catb.take().unwrap(),
430            };
431            take_cat = true;
432
433            // backward iterator over word boundaries. Mostly the same as the forward
434            // iterator, with two weirdnesses:
435            // (1) If we encounter a single quote in the Start state, we have to check for a
436            //     Hebrew Letter immediately before it.
437            // (2) Format and Extend char handling takes some gymnastics.
438
439            if cat == WB::Extend || cat == WB::Format || (cat == WB::ZWJ && state != Zwj) {
440                // WB3c has more priority so we should not
441                // fold in that case
442                if match state {
443                    FormatExtend(_) | Start => false,
444                    _ => true,
445                } {
446                    saveidx = previdx;
447                    savestate = state;
448                    state = FormatExtend(AcceptNone);
449                }
450
451                if state != Start {
452                    continue;
453                }
454            } else if state == FormatExtend(AcceptNone) {
455                // finished a scan of some Format|Extend chars, restore previous state
456                state = savestate;
457                previdx = saveidx;
458                take_cat = false;
459            }
460
461            // Don't use `continue` in this match without updating `catb`
462            state = match state {
463                Start | FormatExtend(AcceptAny) => match cat {
464                    WB::ALetter => Letter,            // rule WB5, WB7, WB10, WB13b
465                    WB::HebrewLetter => HLetter,      // rule WB5, WB7, WB7c, WB10, WB13b
466                    WB::Numeric => Numeric,           // rule WB8, WB9, WB11, WB13b
467                    WB::Katakana => Katakana,         // rule WB13, WB13b
468                    WB::ExtendNumLet => ExtendNumLet, // rule WB13a
469                    WB::RegionalIndicator => Regional(RegionalState::Unknown), // rule WB13c
470                    WB::GlueAfterZwj | WB::EBaseGAZ => Zwj, // rule WB3c
471                    // rule WB4:
472                    WB::Extend | WB::Format | WB::ZWJ => FormatExtend(AcceptAny),
473                    WB::SingleQuote => {
474                        saveidx = idx;
475                        FormatExtend(AcceptQLetter) // rule WB7a
476                    }
477                    WB::EModifier => Emoji, // rule WB14
478                    WB::CR | WB::LF | WB::Newline => {
479                        if state == Start {
480                            if cat == WB::LF {
481                                idx -= match self.get_prev_cat(idx) {
482                                    Some(pcat) if pcat == WB::CR => 1, // rule WB3
483                                    _ => 0,
484                                };
485                            }
486                        } else {
487                            take_curr = false;
488                        }
489                        break; // rule WB3a
490                    }
491                    _ => break, // rule WB999
492                },
493                Zwj => match cat {
494                    // rule WB3c
495                    WB::ZWJ => FormatExtend(AcceptAny),
496                    _ => {
497                        take_curr = false;
498                        break;
499                    }
500                },
501                Letter | HLetter => match cat {
502                    WB::ALetter => Letter,            // rule WB5
503                    WB::HebrewLetter => HLetter,      // rule WB5
504                    WB::Numeric => Numeric,           // rule WB10
505                    WB::ExtendNumLet => ExtendNumLet, // rule WB13b
506                    WB::DoubleQuote if state == HLetter => {
507                        saveidx = previdx;
508                        FormatExtend(RequireHLetter) // rule WB7c
509                    }
510                    WB::MidLetter | WB::MidNumLet | WB::SingleQuote => {
511                        saveidx = previdx;
512                        FormatExtend(RequireLetter) // rule WB7
513                    }
514                    _ => {
515                        take_curr = false;
516                        break;
517                    }
518                },
519                Numeric => match cat {
520                    WB::Numeric => Numeric,           // rule WB8
521                    WB::ALetter => Letter,            // rule WB9
522                    WB::HebrewLetter => HLetter,      // rule WB9
523                    WB::ExtendNumLet => ExtendNumLet, // rule WB13b
524                    WB::MidNum | WB::MidNumLet | WB::SingleQuote => {
525                        saveidx = previdx;
526                        FormatExtend(RequireNumeric) // rule WB11
527                    }
528                    _ => {
529                        take_curr = false;
530                        break;
531                    }
532                },
533                Katakana => match cat {
534                    WB::Katakana => Katakana,         // rule WB13
535                    WB::ExtendNumLet => ExtendNumLet, // rule WB13b
536                    _ => {
537                        take_curr = false;
538                        break;
539                    }
540                },
541                ExtendNumLet => match cat {
542                    WB::ExtendNumLet => ExtendNumLet, // rule WB13a
543                    WB::ALetter => Letter,            // rule WB13a
544                    WB::HebrewLetter => HLetter,      // rule WB13a
545                    WB::Numeric => Numeric,           // rule WB13a
546                    WB::Katakana => Katakana,         // rule WB13a
547                    _ => {
548                        take_curr = false;
549                        break;
550                    }
551                },
552                Regional(mut regional_state) => match cat {
553                    // rule WB13c
554                    WB::RegionalIndicator => {
555                        if regional_state == RegionalState::Unknown {
556                            let count = self.string[..previdx]
557                                .chars()
558                                .rev()
559                                .map(WB::of)
560                                .filter(|&c| !(c == WB::ZWJ || c == WB::Extend || c == WB::Format))
561                                .take_while(|&c| c == WB::RegionalIndicator)
562                                .count();
563                            regional_state = if count % 2 == 0 {
564                                RegionalState::Full
565                            } else {
566                                RegionalState::Half
567                            };
568                        }
569                        if regional_state == RegionalState::Full {
570                            take_curr = false;
571                            break;
572                        } else {
573                            Regional(RegionalState::Full)
574                        }
575                    }
576                    _ => {
577                        take_curr = false;
578                        break;
579                    }
580                },
581                Emoji => match cat {
582                    // rule WB14
583                    WB::EBase | WB::EBaseGAZ => Zwj,
584                    _ => {
585                        take_curr = false;
586                        break;
587                    }
588                },
589                FormatExtend(t) => match t {
590                    RequireNumeric if cat == WB::Numeric => Numeric, // rule WB12
591                    RequireLetter if cat == WB::ALetter => Letter,   // rule WB6
592                    RequireLetter if cat == WB::HebrewLetter => HLetter, // rule WB6
593                    AcceptQLetter if cat == WB::HebrewLetter => HLetter, // rule WB7a
594                    RequireHLetter if cat == WB::HebrewLetter => HLetter, // rule WB7b
595                    _ => break,                                      // backtrack will happens
596                },
597            }
598        }
599
600        if let FormatExtend(t) = state {
601            // if we required something but didn't find it, backtrack
602            if t == RequireLetter
603                || t == RequireHLetter
604                || t == RequireNumeric
605                || t == AcceptNone
606                || t == AcceptQLetter
607            {
608                previdx = saveidx;
609                take_cat = false;
610                take_curr = false;
611            }
612        }
613
614        self.catb = if take_curr {
615            None
616        } else {
617            idx = previdx;
618            if take_cat {
619                Some(cat)
620            } else {
621                None
622            }
623        };
624
625        let retstr = &self.string[idx..];
626        self.string = &self.string[..idx];
627        Some(retstr)
628    }
629}
630
631impl<'a> WordBounds<'a> {
632    /// Create new iterator for *word boundries*.
633    #[inline]
634    pub fn new(s: &str) -> WordBounds<'_> {
635        WordBounds {
636            string: s,
637            cat: None,
638            catb: None,
639        }
640    }
641
642    #[inline]
643    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
644    ///
645    /// ```rust
646    /// # use unic_segment::WordBounds;
647    /// let mut iter = WordBounds::new("Hello world");
648    /// assert_eq!(iter.as_str(), "Hello world");
649    ///
650    /// iter.next();
651    /// assert_eq!(iter.as_str(), " world");
652    ///
653    /// iter.next();
654    /// assert_eq!(iter.as_str(), "world");
655    /// ```
656    pub fn as_str(&self) -> &'a str {
657        self.string
658    }
659
660    #[inline]
661    fn get_next_cat(&self, idx: usize) -> Option<WB> {
662        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
663        if nidx < self.string.len() {
664            let nch = self.string[nidx..].chars().next().unwrap();
665            Some(WB::of(nch))
666        } else {
667            None
668        }
669    }
670
671    #[inline]
672    fn get_prev_cat(&self, idx: usize) -> Option<WB> {
673        if idx > 0 {
674            let nch = self.string[..idx].chars().next_back().unwrap();
675            Some(WB::of(nch))
676        } else {
677            None
678        }
679    }
680}
681
682#[cfg(test)]
683mod tests {
684    use super::{WordBounds, Words};
685    use unic_ucd_common::is_alphanumeric;
686
687    #[test]
688    fn test_word_bounds() {
689        assert_eq!(
690            WordBounds::new("The quick (\"brown\")  fox").collect::<Vec<&str>>(),
691            &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
692        );
693    }
694
695    #[test]
696    fn test_words() {
697        assert_eq!(
698            Words::new(
699                "The quick (\"brown\") fox can't jump 32.3 feet, right?",
700                |s: &&str| s.chars().any(is_alphanumeric),
701            )
702            .collect::<Vec<&str>>(),
703            &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
704        );
705    }
706}