aho_corasick/packed/
api.rs

1use crate::{
2    packed::{
3        pattern::Patterns,
4        rabinkarp::RabinKarp,
5        teddy::{self, Teddy},
6    },
7    util::search::{Match, Span},
8};
9
10/// This is a limit placed on the total number of patterns we're willing to try
11/// and match at once. As more sophisticated algorithms are added, this number
12/// may be increased.
13const PATTERN_LIMIT: usize = 128;
14
15/// A knob for controlling the match semantics of a packed multiple string
16/// searcher.
17///
18/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
19/// crate module in that it doesn't support "standard" match semantics,
20/// and instead only supports leftmost-first or leftmost-longest. Namely,
21/// "standard" semantics cannot be easily supported by packed searchers.
22///
23/// For more information on the distinction between leftmost-first and
24/// leftmost-longest, see the docs on the top-level `MatchKind` type.
25///
26/// Unlike the top-level `MatchKind` type, the default match semantics for this
27/// type are leftmost-first.
28#[derive(Clone, Copy, Debug, Eq, PartialEq)]
29#[non_exhaustive]
30pub enum MatchKind {
31    /// Use leftmost-first match semantics, which reports leftmost matches.
32    /// When there are multiple possible leftmost matches, the match
33    /// corresponding to the pattern that appeared earlier when constructing
34    /// the automaton is reported.
35    ///
36    /// This is the default.
37    LeftmostFirst,
38    /// Use leftmost-longest match semantics, which reports leftmost matches.
39    /// When there are multiple possible leftmost matches, the longest match
40    /// is chosen.
41    LeftmostLongest,
42}
43
44impl Default for MatchKind {
45    fn default() -> MatchKind {
46        MatchKind::LeftmostFirst
47    }
48}
49
50/// The configuration for a packed multiple pattern searcher.
51///
52/// The configuration is currently limited only to being able to select the
53/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
54/// future, more knobs may be made available.
55///
56/// A configuration produces a [`packed::Builder`](Builder), which in turn can
57/// be used to construct a [`packed::Searcher`](Searcher) for searching.
58///
59/// # Example
60///
61/// This example shows how to use leftmost-longest semantics instead of the
62/// default (leftmost-first).
63///
64/// ```
65/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
66///
67/// # fn example() -> Option<()> {
68/// let searcher = Config::new()
69///     .match_kind(MatchKind::LeftmostLongest)
70///     .builder()
71///     .add("foo")
72///     .add("foobar")
73///     .build()?;
74/// let matches: Vec<PatternID> = searcher
75///     .find_iter("foobar")
76///     .map(|mat| mat.pattern())
77///     .collect();
78/// assert_eq!(vec![PatternID::must(1)], matches);
79/// # Some(()) }
80/// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
81/// #     example().unwrap()
82/// # } else {
83/// #     assert!(example().is_none());
84/// # }
85/// ```
86#[derive(Clone, Debug)]
87pub struct Config {
88    kind: MatchKind,
89    force: Option<ForceAlgorithm>,
90    force_teddy_fat: Option<bool>,
91    force_avx: Option<bool>,
92}
93
94/// An internal option for forcing the use of a particular packed algorithm.
95///
96/// When an algorithm is forced, if a searcher could not be constructed for it,
97/// then no searcher will be returned even if an alternative algorithm would
98/// work.
99#[derive(Clone, Debug)]
100enum ForceAlgorithm {
101    Teddy,
102    RabinKarp,
103}
104
105impl Default for Config {
106    fn default() -> Config {
107        Config::new()
108    }
109}
110
111impl Config {
112    /// Create a new default configuration. A default configuration uses
113    /// leftmost-first match semantics.
114    pub fn new() -> Config {
115        Config {
116            kind: MatchKind::LeftmostFirst,
117            force: None,
118            force_teddy_fat: None,
119            force_avx: None,
120        }
121    }
122
123    /// Create a packed builder from this configuration. The builder can be
124    /// used to accumulate patterns and create a [`Searcher`] from them.
125    pub fn builder(&self) -> Builder {
126        Builder::from_config(self.clone())
127    }
128
129    /// Set the match semantics for this configuration.
130    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
131        self.kind = kind;
132        self
133    }
134
135    /// An undocumented method for forcing the use of the Teddy algorithm.
136    ///
137    /// This is only exposed for more precise testing and benchmarks. Callers
138    /// should not use it as it is not part of the API stability guarantees of
139    /// this crate.
140    #[doc(hidden)]
141    pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
142        if yes {
143            self.force = Some(ForceAlgorithm::Teddy);
144        } else {
145            self.force = None;
146        }
147        self
148    }
149
150    /// An undocumented method for forcing the use of the Fat Teddy algorithm.
151    ///
152    /// This is only exposed for more precise testing and benchmarks. Callers
153    /// should not use it as it is not part of the API stability guarantees of
154    /// this crate.
155    #[doc(hidden)]
156    pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
157        self.force_teddy_fat = yes;
158        self
159    }
160
161    /// An undocumented method for forcing the use of SSE (`Some(false)`) or
162    /// AVX (`Some(true)`) algorithms.
163    ///
164    /// This is only exposed for more precise testing and benchmarks. Callers
165    /// should not use it as it is not part of the API stability guarantees of
166    /// this crate.
167    #[doc(hidden)]
168    pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
169        self.force_avx = yes;
170        self
171    }
172
173    /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
174    ///
175    /// This is only exposed for more precise testing and benchmarks. Callers
176    /// should not use it as it is not part of the API stability guarantees of
177    /// this crate.
178    #[doc(hidden)]
179    pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
180        if yes {
181            self.force = Some(ForceAlgorithm::RabinKarp);
182        } else {
183            self.force = None;
184        }
185        self
186    }
187}
188
189/// A builder for constructing a packed searcher from a collection of patterns.
190///
191/// # Example
192///
193/// This example shows how to use a builder to construct a searcher. By
194/// default, leftmost-first match semantics are used.
195///
196/// ```
197/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
198///
199/// # fn example() -> Option<()> {
200/// let searcher = Builder::new()
201///     .add("foobar")
202///     .add("foo")
203///     .build()?;
204/// let matches: Vec<PatternID> = searcher
205///     .find_iter("foobar")
206///     .map(|mat| mat.pattern())
207///     .collect();
208/// assert_eq!(vec![PatternID::ZERO], matches);
209/// # Some(()) }
210/// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
211/// #     example().unwrap()
212/// # } else {
213/// #     assert!(example().is_none());
214/// # }
215/// ```
216#[derive(Clone, Debug)]
217pub struct Builder {
218    /// The configuration of this builder and subsequent matcher.
219    config: Config,
220    /// Set to true if the builder detects that a matcher cannot be built.
221    inert: bool,
222    /// The patterns provided by the caller.
223    patterns: Patterns,
224}
225
226impl Builder {
227    /// Create a new builder for constructing a multi-pattern searcher. This
228    /// constructor uses the default configuration.
229    pub fn new() -> Builder {
230        Builder::from_config(Config::new())
231    }
232
233    fn from_config(config: Config) -> Builder {
234        Builder { config, inert: false, patterns: Patterns::new() }
235    }
236
237    /// Build a searcher from the patterns added to this builder so far.
238    pub fn build(&self) -> Option<Searcher> {
239        if self.inert || self.patterns.is_empty() {
240            return None;
241        }
242        let mut patterns = self.patterns.clone();
243        patterns.set_match_kind(self.config.kind);
244        let rabinkarp = RabinKarp::new(&patterns);
245        // Effectively, we only want to return a searcher if we can use Teddy,
246        // since Teddy is our only fast packed searcher at the moment.
247        // Rabin-Karp is only used when searching haystacks smaller than what
248        // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
249        // is to force it using undocumented APIs (for tests/benchmarks).
250        let (search_kind, minimum_len) = match self.config.force {
251            None | Some(ForceAlgorithm::Teddy) => {
252                debug!("trying to build Teddy packed matcher");
253                let teddy = match self.build_teddy(&patterns) {
254                    None => return None,
255                    Some(teddy) => teddy,
256                };
257                let minimum_len = teddy.minimum_len();
258                (SearchKind::Teddy(teddy), minimum_len)
259            }
260            Some(ForceAlgorithm::RabinKarp) => {
261                debug!("using Rabin-Karp packed matcher");
262                (SearchKind::RabinKarp, 0)
263            }
264        };
265        Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
266    }
267
268    fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
269        teddy::Builder::new()
270            .avx(self.config.force_avx)
271            .fat(self.config.force_teddy_fat)
272            .build(&patterns)
273    }
274
275    /// Add the given pattern to this set to match.
276    ///
277    /// The order in which patterns are added is significant. Namely, when
278    /// using leftmost-first match semantics, then when multiple patterns can
279    /// match at a particular location, the pattern that was added first is
280    /// used as the match.
281    ///
282    /// If the number of patterns added exceeds the amount supported by packed
283    /// searchers, then the builder will stop accumulating patterns and render
284    /// itself inert. At this point, constructing a searcher will always return
285    /// `None`.
286    pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
287        if self.inert {
288            return self;
289        } else if self.patterns.len() >= PATTERN_LIMIT {
290            self.inert = true;
291            self.patterns.reset();
292            return self;
293        }
294        // Just in case PATTERN_LIMIT increases beyond u16::MAX.
295        assert!(self.patterns.len() <= core::u16::MAX as usize);
296
297        let pattern = pattern.as_ref();
298        if pattern.is_empty() {
299            self.inert = true;
300            self.patterns.reset();
301            return self;
302        }
303        self.patterns.add(pattern);
304        self
305    }
306
307    /// Add the given iterator of patterns to this set to match.
308    ///
309    /// The iterator must yield elements that can be converted into a `&[u8]`.
310    ///
311    /// The order in which patterns are added is significant. Namely, when
312    /// using leftmost-first match semantics, then when multiple patterns can
313    /// match at a particular location, the pattern that was added first is
314    /// used as the match.
315    ///
316    /// If the number of patterns added exceeds the amount supported by packed
317    /// searchers, then the builder will stop accumulating patterns and render
318    /// itself inert. At this point, constructing a searcher will always return
319    /// `None`.
320    pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
321    where
322        I: IntoIterator<Item = P>,
323        P: AsRef<[u8]>,
324    {
325        for p in patterns {
326            self.add(p);
327        }
328        self
329    }
330}
331
332impl Default for Builder {
333    fn default() -> Builder {
334        Builder::new()
335    }
336}
337
338/// A packed searcher for quickly finding occurrences of multiple patterns.
339///
340/// If callers need more flexible construction, or if one wants to change the
341/// match semantics (either leftmost-first or leftmost-longest), then one can
342/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
343///
344/// # Example
345///
346/// This example shows how to create a searcher from an iterator of patterns.
347/// By default, leftmost-first match semantics are used.
348///
349/// ```
350/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
351///
352/// # fn example() -> Option<()> {
353/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
354/// let matches: Vec<PatternID> = searcher
355///     .find_iter("foobar")
356///     .map(|mat| mat.pattern())
357///     .collect();
358/// assert_eq!(vec![PatternID::ZERO], matches);
359/// # Some(()) }
360/// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
361/// #     example().unwrap()
362/// # } else {
363/// #     assert!(example().is_none());
364/// # }
365/// ```
366#[derive(Clone, Debug)]
367pub struct Searcher {
368    patterns: Patterns,
369    rabinkarp: RabinKarp,
370    search_kind: SearchKind,
371    minimum_len: usize,
372}
373
374#[derive(Clone, Debug)]
375enum SearchKind {
376    Teddy(Teddy),
377    RabinKarp,
378}
379
380impl Searcher {
381    /// A convenience function for constructing a searcher from an iterator
382    /// of things that can be converted to a `&[u8]`.
383    ///
384    /// If a searcher could not be constructed (either because of an
385    /// unsupported CPU or because there are too many patterns), then `None`
386    /// is returned.
387    ///
388    /// # Example
389    ///
390    /// Basic usage:
391    ///
392    /// ```
393    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
394    ///
395    /// # fn example() -> Option<()> {
396    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
397    /// let matches: Vec<PatternID> = searcher
398    ///     .find_iter("foobar")
399    ///     .map(|mat| mat.pattern())
400    ///     .collect();
401    /// assert_eq!(vec![PatternID::ZERO], matches);
402    /// # Some(()) }
403    /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
404    /// #     example().unwrap()
405    /// # } else {
406    /// #     assert!(example().is_none());
407    /// # }
408    /// ```
409    pub fn new<I, P>(patterns: I) -> Option<Searcher>
410    where
411        I: IntoIterator<Item = P>,
412        P: AsRef<[u8]>,
413    {
414        Builder::new().extend(patterns).build()
415    }
416
417    /// A convenience function for calling `Config::new()`.
418    ///
419    /// This is useful for avoiding an additional import.
420    pub fn config() -> Config {
421        Config::new()
422    }
423
424    /// A convenience function for calling `Builder::new()`.
425    ///
426    /// This is useful for avoiding an additional import.
427    pub fn builder() -> Builder {
428        Builder::new()
429    }
430
431    /// Return the first occurrence of any of the patterns in this searcher,
432    /// according to its match semantics, in the given haystack. The `Match`
433    /// returned will include the identifier of the pattern that matched, which
434    /// corresponds to the index of the pattern (starting from `0`) in which it
435    /// was added.
436    ///
437    /// # Example
438    ///
439    /// Basic usage:
440    ///
441    /// ```
442    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
443    ///
444    /// # fn example() -> Option<()> {
445    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
446    /// let mat = searcher.find("foobar")?;
447    /// assert_eq!(PatternID::ZERO, mat.pattern());
448    /// assert_eq!(0, mat.start());
449    /// assert_eq!(6, mat.end());
450    /// # Some(()) }
451    /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
452    /// #     example().unwrap()
453    /// # } else {
454    /// #     assert!(example().is_none());
455    /// # }
456    /// ```
457    #[inline]
458    pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
459        let haystack = haystack.as_ref();
460        self.find_in(haystack, Span::from(0..haystack.len()))
461    }
462
463    /// Return the first occurrence of any of the patterns in this searcher,
464    /// according to its match semantics, in the given haystack starting from
465    /// the given position.
466    ///
467    /// The `Match` returned will include the identifier of the pattern that
468    /// matched, which corresponds to the index of the pattern (starting from
469    /// `0`) in which it was added. The offsets in the `Match` will be relative
470    /// to the start of `haystack` (and not `at`).
471    ///
472    /// # Example
473    ///
474    /// Basic usage:
475    ///
476    /// ```
477    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
478    ///
479    /// # fn example() -> Option<()> {
480    /// let haystack = "foofoobar";
481    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
482    /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
483    /// assert_eq!(PatternID::ZERO, mat.pattern());
484    /// assert_eq!(3, mat.start());
485    /// assert_eq!(9, mat.end());
486    /// # Some(()) }
487    /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
488    /// #     example().unwrap()
489    /// # } else {
490    /// #     assert!(example().is_none());
491    /// # }
492    /// ```
493    #[inline]
494    pub fn find_in<B: AsRef<[u8]>>(
495        &self,
496        haystack: B,
497        span: Span,
498    ) -> Option<Match> {
499        let haystack = haystack.as_ref();
500        match self.search_kind {
501            SearchKind::Teddy(ref teddy) => {
502                if haystack[span].len() < teddy.minimum_len() {
503                    return self.find_in_slow(haystack, span);
504                }
505                teddy.find_at(
506                    &self.patterns,
507                    &haystack[..span.end],
508                    span.start,
509                )
510            }
511            SearchKind::RabinKarp => self.rabinkarp.find_at(
512                &self.patterns,
513                &haystack[..span.end],
514                span.start,
515            ),
516        }
517    }
518
519    /// Return an iterator of non-overlapping occurrences of the patterns in
520    /// this searcher, according to its match semantics, in the given haystack.
521    ///
522    /// # Example
523    ///
524    /// Basic usage:
525    ///
526    /// ```
527    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
528    ///
529    /// # fn example() -> Option<()> {
530    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
531    /// let matches: Vec<PatternID> = searcher
532    ///     .find_iter("foobar fooba foofoo")
533    ///     .map(|mat| mat.pattern())
534    ///     .collect();
535    /// assert_eq!(vec![
536    ///     PatternID::must(0),
537    ///     PatternID::must(1),
538    ///     PatternID::must(1),
539    ///     PatternID::must(1),
540    /// ], matches);
541    /// # Some(()) }
542    /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
543    /// #     example().unwrap()
544    /// # } else {
545    /// #     assert!(example().is_none());
546    /// # }
547    /// ```
548    pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
549        &'a self,
550        haystack: &'b B,
551    ) -> FindIter<'a, 'b> {
552        let haystack = haystack.as_ref();
553        let span = Span::from(0..haystack.len());
554        FindIter { searcher: self, haystack, span }
555    }
556
557    /// Returns the match kind used by this packed searcher.
558    ///
559    /// # Examples
560    ///
561    /// Basic usage:
562    ///
563    /// ```
564    /// use aho_corasick::packed::{MatchKind, Searcher};
565    ///
566    /// # fn example() -> Option<()> {
567    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
568    /// // leftmost-first is the default.
569    /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
570    /// # Some(()) }
571    /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
572    /// #     example().unwrap()
573    /// # } else {
574    /// #     assert!(example().is_none());
575    /// # }
576    /// ```
577    pub fn match_kind(&self) -> &MatchKind {
578        self.patterns.match_kind()
579    }
580
581    /// Returns the minimum length of a haystack that is required in order for
582    /// packed searching to be effective.
583    ///
584    /// In some cases, the underlying packed searcher may not be able to search
585    /// very short haystacks. When that occurs, the implementation will defer
586    /// to a slower non-packed searcher (which is still generally faster than
587    /// Aho-Corasick for a small number of patterns). However, callers may
588    /// want to avoid ever using the slower variant, which one can do by
589    /// never passing a haystack shorter than the minimum length returned by
590    /// this method.
591    pub fn minimum_len(&self) -> usize {
592        self.minimum_len
593    }
594
595    /// Returns the approximate total amount of heap used by this searcher, in
596    /// units of bytes.
597    pub fn memory_usage(&self) -> usize {
598        self.patterns.memory_usage()
599            + self.rabinkarp.memory_usage()
600            + self.search_kind.memory_usage()
601    }
602
603    /// Use a slow (non-packed) searcher.
604    ///
605    /// This is useful when a packed searcher could be constructed, but could
606    /// not be used to search a specific haystack. For example, if Teddy was
607    /// built but the haystack is smaller than ~34 bytes, then Teddy might not
608    /// be able to run.
609    fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
610        self.rabinkarp.find_at(
611            &self.patterns,
612            &haystack[..span.end],
613            span.start,
614        )
615    }
616}
617
618impl SearchKind {
619    fn memory_usage(&self) -> usize {
620        match *self {
621            SearchKind::Teddy(ref ted) => ted.memory_usage(),
622            SearchKind::RabinKarp => 0,
623        }
624    }
625}
626
627/// An iterator over non-overlapping matches from a packed searcher.
628///
629/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
630/// while the lifetime `'h` refers to the lifetime of the haystack being
631/// searched.
632#[derive(Debug)]
633pub struct FindIter<'s, 'h> {
634    searcher: &'s Searcher,
635    haystack: &'h [u8],
636    span: Span,
637}
638
639impl<'s, 'h> Iterator for FindIter<'s, 'h> {
640    type Item = Match;
641
642    fn next(&mut self) -> Option<Match> {
643        if self.span.start > self.span.end {
644            return None;
645        }
646        match self.searcher.find_in(&self.haystack, self.span) {
647            None => None,
648            Some(m) => {
649                self.span.start = m.end();
650                Some(m)
651            }
652        }
653    }
654}