aho_corasick/
dfa.rs

1/*!
2Provides direct access to a DFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a DFA directly. Using an `DFA` directly is typically only necessary
7when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{vec, vec::Vec};
11
12use crate::{
13    automaton::Automaton,
14    nfa::noncontiguous,
15    util::{
16        alphabet::ByteClasses,
17        error::{BuildError, MatchError},
18        int::{Usize, U32},
19        prefilter::Prefilter,
20        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
21        search::{Anchored, MatchKind, StartKind},
22        special::Special,
23    },
24};
25
26/// A DFA implementation of Aho-Corasick.
27///
28/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
29/// this type directly. Using a `DFA` directly is typically only necessary when
30/// one needs access to the [`Automaton`] trait implementation.
31///
32/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
33/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
34/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
35///
36/// A DFA provides the best possible search performance (in this crate) via two
37/// mechanisms:
38///
39/// * All states use a dense representation for their transitions.
40/// * All failure transitions are pre-computed such that they are never
41/// explicitly handled at search time.
42///
43/// These two facts combined mean that every state transition is performed
44/// using a constant number of instructions. However, this comes at
45/// great cost. The memory usage of a DFA can be quite exorbitant.
46/// It is potentially multiple orders of magnitude greater than a
47/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
48/// a DFA will typically have better search speed than a `contiguous::NFA`, but
49/// not by orders of magnitude.
50///
51/// Unless you have a small number of patterns or memory usage is not a concern
52/// and search performance is critical, a DFA is usually not the best choice.
53///
54/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
55/// support for anchored and unanchored search configurations. Namely,
56/// since failure transitions are pre-computed, supporting both anchored
57/// and unanchored searches requires a duplication of the transition table,
58/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
59/// unconditionally support both anchored and unanchored searches because there
60/// is essentially no added cost for doing so.) It is for this reason that
61/// a DFA's support for anchored and unanchored searches can be configured
62/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
63/// searches.
64///
65/// # Example
66///
67/// This example shows how to build an `DFA` directly and use it to execute
68/// [`Automaton::try_find`]:
69///
70/// ```
71/// use aho_corasick::{
72///     automaton::Automaton,
73///     dfa::DFA,
74///     Input, Match,
75/// };
76///
77/// let patterns = &["b", "abc", "abcd"];
78/// let haystack = "abcd";
79///
80/// let nfa = DFA::new(patterns).unwrap();
81/// assert_eq!(
82///     Some(Match::must(0, 1..2)),
83///     nfa.try_find(&Input::new(haystack))?,
84/// );
85/// # Ok::<(), Box<dyn std::error::Error>>(())
86/// ```
87///
88/// It is also possible to implement your own version of `try_find`. See the
89/// [`Automaton`] documentation for an example.
90#[derive(Clone)]
91pub struct DFA {
92    /// The DFA transition table. IDs in this table are pre-multiplied. So
93    /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
94    /// 2*stride, 3*stride, ...
95    trans: Vec<StateID>,
96    /// The matches for every match state in this DFA. This is indexed by order
97    /// of match states in the DFA. Namely, as constructed, match states are
98    /// always laid out sequentially and contiguously in memory. Thus, after
99    /// converting a match state ID to a match state index, the indices are
100    /// all adjacent.
101    ///
102    /// More concretely, when a search enters a match state with id 'sid', then
103    /// the matching patterns are at 'matches[(sid >> stride2) - 2]'. The '- 2'
104    /// is to offset the first two states of a DFA: the dead and fail states.
105    matches: Vec<Vec<PatternID>>,
106    /// The amount of heap memory used, in bytes, by the inner Vecs of
107    /// 'matches'.
108    matches_memory_usage: usize,
109    /// The length of each pattern. This is used to compute the start offset
110    /// of a match.
111    pattern_lens: Vec<SmallIndex>,
112    /// A prefilter for accelerating searches, if one exists.
113    prefilter: Option<Prefilter>,
114    /// The match semantics built into this DFA.
115    match_kind: MatchKind,
116    /// The total number of states in this DFA.
117    state_len: usize,
118    /// The alphabet size, or total number of equivalence classes, for this
119    /// DFA. Note that the actual number of transitions in each state is
120    /// stride=2^stride2, where stride is the smallest power of 2 greater than
121    /// or equal to alphabet_len. We do things this way so that we can use
122    /// bitshifting to go from a state ID to an index into 'matches'.
123    alphabet_len: usize,
124    /// The exponent with a base 2, such that stride=2^stride2. Given a state
125    /// index 'i', its state identifier is 'i << stride2'. Given a state
126    /// identifier 'sid', its state index is 'sid >> stride2'.
127    stride2: usize,
128    /// The equivalence classes for this DFA. All transitions are defined on
129    /// equivalence classes and not on the 256 distinct byte values.
130    byte_classes: ByteClasses,
131    /// The length of the shortest pattern in this automaton.
132    min_pattern_len: usize,
133    /// The length of the longest pattern in this automaton.
134    max_pattern_len: usize,
135    /// The information required to deduce which states are "special" in this
136    /// DFA.
137    special: Special,
138}
139
140impl DFA {
141    /// Create a new Aho-Corasick DFA using the default configuration.
142    ///
143    /// Use a [`Builder`] if you want to change the configuration.
144    pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
145    where
146        I: IntoIterator<Item = P>,
147        P: AsRef<[u8]>,
148    {
149        DFA::builder().build(patterns)
150    }
151
152    /// A convenience method for returning a new Aho-Corasick DFA builder.
153    ///
154    /// This usually permits one to just import the `DFA` type.
155    pub fn builder() -> Builder {
156        Builder::new()
157    }
158}
159
160impl DFA {
161    /// A sentinel state ID indicating that a search should stop once it has
162    /// entered this state. When a search stops, it returns a match if one has
163    /// been found, otherwise no match. A DFA always has an actual dead state
164    /// at this ID.
165    ///
166    /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
167    /// Namely, the whole point of a DFA is that the FAIL state is completely
168    /// compiled away. That is, DFA construction involves pre-computing the
169    /// failure transitions everywhere, such that failure transitions are no
170    /// longer used at search time. This, combined with its uniformly dense
171    /// representation, are the two most important factors in why it's faster
172    /// than the NFAs in this crate.
173    const DEAD: StateID = StateID::new_unchecked(0);
174
175    /// Adds the given pattern IDs as matches to the given state and also
176    /// records the added memory usage.
177    fn set_matches(&mut self, sid: StateID, pids: &[PatternID]) {
178        use core::mem::size_of;
179
180        assert!(!pids.is_empty(), "match state must have non-empty pids");
181        let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
182        self.matches[index].extend_from_slice(pids);
183        self.matches_memory_usage += size_of::<PatternID>() * pids.len();
184    }
185}
186
187// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
188// returns a valid state ID given a valid state ID. We otherwise claim that
189// all other methods are correct as well.
190unsafe impl Automaton for DFA {
191    #[inline(always)]
192    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
193        // Either of the start state IDs can be DEAD, in which case, support
194        // for that type of search is not provided by this DFA. Which start
195        // state IDs are inactive depends on the 'StartKind' configuration at
196        // DFA construction time.
197        match anchored {
198            Anchored::No => {
199                let start = self.special.start_unanchored_id;
200                if start == DFA::DEAD {
201                    Err(MatchError::invalid_input_unanchored())
202                } else {
203                    Ok(start)
204                }
205            }
206            Anchored::Yes => {
207                let start = self.special.start_anchored_id;
208                if start == DFA::DEAD {
209                    Err(MatchError::invalid_input_anchored())
210                } else {
211                    Ok(start)
212                }
213            }
214        }
215    }
216
217    #[inline(always)]
218    fn next_state(
219        &self,
220        _anchored: Anchored,
221        sid: StateID,
222        byte: u8,
223    ) -> StateID {
224        let class = self.byte_classes.get(byte);
225        self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
226    }
227
228    #[inline(always)]
229    fn is_special(&self, sid: StateID) -> bool {
230        sid <= self.special.max_special_id
231    }
232
233    #[inline(always)]
234    fn is_dead(&self, sid: StateID) -> bool {
235        sid == DFA::DEAD
236    }
237
238    #[inline(always)]
239    fn is_match(&self, sid: StateID) -> bool {
240        !self.is_dead(sid) && sid <= self.special.max_match_id
241    }
242
243    #[inline(always)]
244    fn is_start(&self, sid: StateID) -> bool {
245        sid == self.special.start_unanchored_id
246            || sid == self.special.start_anchored_id
247    }
248
249    #[inline(always)]
250    fn match_kind(&self) -> MatchKind {
251        self.match_kind
252    }
253
254    #[inline(always)]
255    fn patterns_len(&self) -> usize {
256        self.pattern_lens.len()
257    }
258
259    #[inline(always)]
260    fn pattern_len(&self, pid: PatternID) -> usize {
261        self.pattern_lens[pid].as_usize()
262    }
263
264    #[inline(always)]
265    fn min_pattern_len(&self) -> usize {
266        self.min_pattern_len
267    }
268
269    #[inline(always)]
270    fn max_pattern_len(&self) -> usize {
271        self.max_pattern_len
272    }
273
274    #[inline(always)]
275    fn match_len(&self, sid: StateID) -> usize {
276        debug_assert!(self.is_match(sid));
277        let offset = (sid.as_usize() >> self.stride2) - 2;
278        self.matches[offset].len()
279    }
280
281    #[inline(always)]
282    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
283        debug_assert!(self.is_match(sid));
284        let offset = (sid.as_usize() >> self.stride2) - 2;
285        self.matches[offset][index]
286    }
287
288    #[inline(always)]
289    fn memory_usage(&self) -> usize {
290        use core::mem::size_of;
291
292        (self.trans.len() * size_of::<u32>())
293            + (self.matches.len() * size_of::<Vec<PatternID>>())
294            + self.matches_memory_usage
295            + (self.pattern_lens.len() * size_of::<SmallIndex>())
296            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
297    }
298
299    #[inline(always)]
300    fn prefilter(&self) -> Option<&Prefilter> {
301        self.prefilter.as_ref()
302    }
303}
304
305impl core::fmt::Debug for DFA {
306    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
307        use crate::{
308            automaton::{fmt_state_indicator, sparse_transitions},
309            util::debug::DebugByte,
310        };
311
312        writeln!(f, "dfa::DFA(")?;
313        for index in 0..self.state_len {
314            let sid = StateID::new_unchecked(index << self.stride2);
315            // While we do currently include the FAIL state in the transition
316            // table (to simplify construction), it is never actually used. It
317            // poses problems with the code below because it gets treated as
318            // a match state incidentally when it is, of course, not. So we
319            // special case it. The fail state is always the first state after
320            // the dead state.
321            //
322            // If the construction is changed to remove the fail state (it
323            // probably should be), then this special case should be updated.
324            if index == 1 {
325                writeln!(f, "F {:06}:", sid.as_usize())?;
326                continue;
327            }
328            fmt_state_indicator(f, self, sid)?;
329            write!(f, "{:06}: ", sid.as_usize())?;
330
331            let it = (0..self.byte_classes.alphabet_len()).map(|class| {
332                (class.as_u8(), self.trans[sid.as_usize() + class])
333            });
334            for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
335                if i > 0 {
336                    write!(f, ", ")?;
337                }
338                if start == end {
339                    write!(
340                        f,
341                        "{:?} => {:?}",
342                        DebugByte(start),
343                        next.as_usize()
344                    )?;
345                } else {
346                    write!(
347                        f,
348                        "{:?}-{:?} => {:?}",
349                        DebugByte(start),
350                        DebugByte(end),
351                        next.as_usize()
352                    )?;
353                }
354            }
355            write!(f, "\n")?;
356            if self.is_match(sid) {
357                write!(f, " matches: ")?;
358                for i in 0..self.match_len(sid) {
359                    if i > 0 {
360                        write!(f, ", ")?;
361                    }
362                    let pid = self.match_pattern(sid, i);
363                    write!(f, "{}", pid.as_usize())?;
364                }
365                write!(f, "\n")?;
366            }
367        }
368        writeln!(f, "match kind: {:?}", self.match_kind)?;
369        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
370        writeln!(f, "state length: {:?}", self.state_len)?;
371        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
372        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
373        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
374        writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
375        writeln!(f, "stride: {:?}", 1 << self.stride2)?;
376        writeln!(f, "byte classes: {:?}", self.byte_classes)?;
377        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
378        writeln!(f, ")")?;
379        Ok(())
380    }
381}
382
383/// A builder for configuring an Aho-Corasick DFA.
384///
385/// This builder has a subset of the options available to a
386/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
387/// their behavior is identical.
388#[derive(Clone, Debug)]
389pub struct Builder {
390    noncontiguous: noncontiguous::Builder,
391    start_kind: StartKind,
392    byte_classes: bool,
393}
394
395impl Default for Builder {
396    fn default() -> Builder {
397        Builder {
398            noncontiguous: noncontiguous::Builder::new(),
399            start_kind: StartKind::Unanchored,
400            byte_classes: true,
401        }
402    }
403}
404
405impl Builder {
406    /// Create a new builder for configuring an Aho-Corasick DFA.
407    pub fn new() -> Builder {
408        Builder::default()
409    }
410
411    /// Build an Aho-Corasick DFA from the given iterator of patterns.
412    ///
413    /// A builder may be reused to create more DFAs.
414    pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
415    where
416        I: IntoIterator<Item = P>,
417        P: AsRef<[u8]>,
418    {
419        let nnfa = self.noncontiguous.build(patterns)?;
420        self.build_from_noncontiguous(&nnfa)
421    }
422
423    /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
424    ///
425    /// Note that when this method is used, only the `start_kind` and
426    /// `byte_classes` settings on this builder are respected. The other
427    /// settings only apply to the initial construction of the Aho-Corasick
428    /// automaton. Since using this method requires that initial construction
429    /// has already completed, all settings impacting only initial construction
430    /// are no longer relevant.
431    pub fn build_from_noncontiguous(
432        &self,
433        nnfa: &noncontiguous::NFA,
434    ) -> Result<DFA, BuildError> {
435        debug!("building DFA");
436        let byte_classes = if self.byte_classes {
437            nnfa.byte_classes().clone()
438        } else {
439            ByteClasses::singletons()
440        };
441        let state_len = match self.start_kind {
442            StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
443            StartKind::Both => {
444                // These unwraps are OK because we know that the number of
445                // NFA states is < StateID::LIMIT which is in turn less than
446                // i32::MAX. Thus, there is always room to multiply by 2.
447                // Finally, the number of states is always at least 4 in the
448                // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
449                // subtraction of 4 is okay.
450                //
451                // Note that we subtract 4 because the "anchored" part of
452                // the DFA duplicates the unanchored part (without failure
453                // transitions), but reuses the DEAD, FAIL and START states.
454                nnfa.states()
455                    .len()
456                    .checked_mul(2)
457                    .unwrap()
458                    .checked_sub(4)
459                    .unwrap()
460            }
461        };
462        let trans_len =
463            match state_len.checked_shl(byte_classes.stride2().as_u32()) {
464                Some(trans_len) => trans_len,
465                None => {
466                    return Err(BuildError::state_id_overflow(
467                        StateID::MAX.as_u64(),
468                        usize::MAX.as_u64(),
469                    ))
470                }
471            };
472        StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
473            .map_err(|e| {
474                BuildError::state_id_overflow(
475                    StateID::MAX.as_u64(),
476                    e.attempted(),
477                )
478            })?;
479        let num_match_states = match self.start_kind {
480            StartKind::Unanchored | StartKind::Anchored => {
481                nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
482            }
483            StartKind::Both => nnfa
484                .special()
485                .max_match_id
486                .as_usize()
487                .checked_sub(1)
488                .unwrap()
489                .checked_mul(2)
490                .unwrap(),
491        };
492        let mut dfa = DFA {
493            trans: vec![DFA::DEAD; trans_len],
494            matches: vec![vec![]; num_match_states],
495            matches_memory_usage: 0,
496            pattern_lens: nnfa.pattern_lens_raw().to_vec(),
497            prefilter: nnfa.prefilter().map(|p| p.clone()),
498            match_kind: nnfa.match_kind(),
499            state_len,
500            alphabet_len: byte_classes.alphabet_len(),
501            stride2: byte_classes.stride2(),
502            byte_classes,
503            min_pattern_len: nnfa.min_pattern_len(),
504            max_pattern_len: nnfa.max_pattern_len(),
505            // The special state IDs are set later.
506            special: Special::zero(),
507        };
508        match self.start_kind {
509            StartKind::Both => {
510                self.finish_build_both_starts(nnfa, &mut dfa);
511            }
512            StartKind::Unanchored => {
513                self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
514            }
515            StartKind::Anchored => {
516                self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
517            }
518        }
519        debug!(
520            "DFA built, <states: {:?}, size: {:?}, \
521             alphabet len: {:?}, stride: {:?}>",
522            dfa.state_len,
523            dfa.memory_usage(),
524            dfa.byte_classes.alphabet_len(),
525            dfa.byte_classes.stride(),
526        );
527        Ok(dfa)
528    }
529
530    /// Finishes building a DFA for either unanchored or anchored searches,
531    /// but NOT both.
532    fn finish_build_one_start(
533        &self,
534        anchored: Anchored,
535        nnfa: &noncontiguous::NFA,
536        dfa: &mut DFA,
537    ) {
538        // This function always succeeds because we check above that all of the
539        // states in the NFA can be mapped to DFA state IDs.
540        let stride2 = dfa.stride2;
541        let old2new = |oldsid: StateID| {
542            StateID::new_unchecked(oldsid.as_usize() << stride2)
543        };
544        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
545            let newsid = old2new(oldsid);
546            if !state.matches.is_empty() {
547                dfa.set_matches(newsid, &state.matches);
548            }
549            sparse_iter(
550                state,
551                &dfa.byte_classes,
552                |byte, class, mut oldnextsid| {
553                    if oldnextsid == noncontiguous::NFA::FAIL {
554                        if anchored.is_anchored() {
555                            oldnextsid = noncontiguous::NFA::DEAD;
556                        } else {
557                            oldnextsid = nnfa.next_state(
558                                Anchored::No,
559                                state.fail,
560                                byte,
561                            );
562                        }
563                    }
564                    dfa.trans[newsid.as_usize() + usize::from(class)] =
565                        old2new(oldnextsid);
566                },
567            );
568        }
569        // Now that we've remapped all the IDs in our states, all that's left
570        // is remapping the special state IDs.
571        let old = nnfa.special();
572        let mut new = &mut dfa.special;
573        new.max_special_id = old2new(old.max_special_id);
574        new.max_match_id = old2new(old.max_match_id);
575        if anchored.is_anchored() {
576            new.start_unanchored_id = DFA::DEAD;
577            new.start_anchored_id = old2new(old.start_anchored_id);
578        } else {
579            new.start_unanchored_id = old2new(old.start_unanchored_id);
580            new.start_anchored_id = DFA::DEAD;
581        }
582    }
583
584    /// Finishes building a DFA that supports BOTH unanchored and anchored
585    /// searches. It works by inter-leaving unanchored states with anchored
586    /// states in the same transition table. This way, we avoid needing to
587    /// re-shuffle states afterward to ensure that our states still look like
588    /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
589    ///
590    /// Honestly this is pretty inscrutable... Simplifications are most
591    /// welcome.
592    fn finish_build_both_starts(
593        &self,
594        nnfa: &noncontiguous::NFA,
595        dfa: &mut DFA,
596    ) {
597        let stride2 = dfa.stride2;
598        let stride = 1 << stride2;
599        let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
600        let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
601        let mut is_anchored = vec![false; dfa.state_len];
602        let mut newsid = DFA::DEAD;
603        let next_dfa_id =
604            |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
605        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
606            if oldsid == noncontiguous::NFA::DEAD
607                || oldsid == noncontiguous::NFA::FAIL
608            {
609                remap_unanchored[oldsid] = newsid;
610                remap_anchored[oldsid] = newsid;
611                newsid = next_dfa_id(newsid);
612            } else if oldsid == nnfa.special().start_unanchored_id
613                || oldsid == nnfa.special().start_anchored_id
614            {
615                if oldsid == nnfa.special().start_unanchored_id {
616                    remap_unanchored[oldsid] = newsid;
617                    remap_anchored[oldsid] = DFA::DEAD;
618                } else {
619                    remap_unanchored[oldsid] = DFA::DEAD;
620                    remap_anchored[oldsid] = newsid;
621                    is_anchored[newsid.as_usize() >> stride2] = true;
622                }
623                if !state.matches.is_empty() {
624                    dfa.set_matches(newsid, &state.matches);
625                }
626                sparse_iter(
627                    state,
628                    &dfa.byte_classes,
629                    |_, class, oldnextsid| {
630                        let class = usize::from(class);
631                        if oldnextsid == noncontiguous::NFA::FAIL {
632                            dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
633                        } else {
634                            dfa.trans[newsid.as_usize() + class] = oldnextsid;
635                        }
636                    },
637                );
638                newsid = next_dfa_id(newsid);
639            } else {
640                let unewsid = newsid;
641                newsid = next_dfa_id(newsid);
642                let anewsid = newsid;
643                newsid = next_dfa_id(newsid);
644
645                remap_unanchored[oldsid] = unewsid;
646                remap_anchored[oldsid] = anewsid;
647                is_anchored[anewsid.as_usize() >> stride2] = true;
648                if !state.matches.is_empty() {
649                    dfa.set_matches(unewsid, &state.matches);
650                    dfa.set_matches(anewsid, &state.matches);
651                }
652                sparse_iter(
653                    state,
654                    &dfa.byte_classes,
655                    |byte, class, oldnextsid| {
656                        let class = usize::from(class);
657                        if oldnextsid == noncontiguous::NFA::FAIL {
658                            dfa.trans[unewsid.as_usize() + class] = nnfa
659                                .next_state(Anchored::No, state.fail, byte);
660                        } else {
661                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
662                            dfa.trans[anewsid.as_usize() + class] = oldnextsid;
663                        }
664                    },
665                );
666            }
667        }
668        for i in 0..dfa.state_len {
669            let sid = i << stride2;
670            if is_anchored[i] {
671                for next in dfa.trans[sid..][..stride].iter_mut() {
672                    *next = remap_anchored[*next];
673                }
674            } else {
675                for next in dfa.trans[sid..][..stride].iter_mut() {
676                    *next = remap_unanchored[*next];
677                }
678            }
679        }
680        // Now that we've remapped all the IDs in our states, all that's left
681        // is remapping the special state IDs.
682        let old = nnfa.special();
683        let mut new = &mut dfa.special;
684        new.max_special_id = remap_anchored[old.max_special_id];
685        new.max_match_id = remap_anchored[old.max_match_id];
686        new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
687        new.start_anchored_id = remap_anchored[old.start_anchored_id];
688    }
689
690    /// Set the desired match semantics.
691    ///
692    /// This only applies when using [`Builder::build`] and not
693    /// [`Builder::build_from_noncontiguous`].
694    ///
695    /// See
696    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
697    /// for more documentation and examples.
698    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
699        self.noncontiguous.match_kind(kind);
700        self
701    }
702
703    /// Enable ASCII-aware case insensitive matching.
704    ///
705    /// This only applies when using [`Builder::build`] and not
706    /// [`Builder::build_from_noncontiguous`].
707    ///
708    /// See
709    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
710    /// for more documentation and examples.
711    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
712        self.noncontiguous.ascii_case_insensitive(yes);
713        self
714    }
715
716    /// Enable heuristic prefilter optimizations.
717    ///
718    /// This only applies when using [`Builder::build`] and not
719    /// [`Builder::build_from_noncontiguous`].
720    ///
721    /// See
722    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
723    /// for more documentation and examples.
724    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
725        self.noncontiguous.prefilter(yes);
726        self
727    }
728
729    /// Sets the starting state configuration for the automaton.
730    ///
731    /// See
732    /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
733    /// for more documentation and examples.
734    pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
735        self.start_kind = kind;
736        self
737    }
738
739    /// A debug setting for whether to attempt to shrink the size of the
740    /// automaton's alphabet or not.
741    ///
742    /// This should never be enabled unless you're debugging an automaton.
743    /// Namely, disabling byte classes makes transitions easier to reason
744    /// about, since they use the actual bytes instead of equivalence classes.
745    /// Disabling this confers no performance benefit at search time.
746    ///
747    /// See
748    /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
749    /// for more documentation and examples.
750    pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
751        self.byte_classes = yes;
752        self
753    }
754}
755
756/// Iterate over all possible equivalence class transitions in this state.
757/// The closure is called for all transitions with a distinct equivalence
758/// class, even those not explicitly represented in this sparse state. For
759/// any implicitly defined transitions, the given closure is called with
760/// the fail state ID.
761///
762/// The closure is guaranteed to be called precisely
763/// `byte_classes.alphabet_len()` times, once for every possible class in
764/// ascending order.
765fn sparse_iter<F: FnMut(u8, u8, StateID)>(
766    state: &noncontiguous::State,
767    classes: &ByteClasses,
768    mut f: F,
769) {
770    let mut prev_class = None;
771    let mut byte = 0usize;
772    for &(b, id) in state.trans.iter() {
773        while byte < usize::from(b) {
774            let rep = byte.as_u8();
775            let class = classes.get(rep);
776            byte += 1;
777            if prev_class != Some(class) {
778                f(rep, class, noncontiguous::NFA::FAIL);
779                prev_class = Some(class);
780            }
781        }
782        let rep = b;
783        let class = classes.get(rep);
784        byte += 1;
785        if prev_class != Some(class) {
786            f(rep, class, id);
787            prev_class = Some(class);
788        }
789    }
790    for b in byte..=255 {
791        let rep = b.as_u8();
792        let class = classes.get(rep);
793        if prev_class != Some(class) {
794            f(rep, class, noncontiguous::NFA::FAIL);
795            prev_class = Some(class);
796        }
797    }
798}
aho_corasick/dfa.rs

aho_corasick/
dfa.rs