aho_corasick/packed/teddy/
runtime.rs

1// See the README in this directory for an explanation of the Teddy algorithm.
2// It is strongly recommended to peruse the README before trying to grok this
3// code, as its use of SIMD is pretty opaque, although I tried to add comments
4// where appropriate.
5//
6// Moreover, while there is a lot of code in this file, most of it is
7// repeated variants of the same thing. Specifically, there are three Teddy
8// variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets)
9// and Fat 256-bit Teddy (16 buckets). For each variant, there are three
10// implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to
11// a total of nine variants. Each one is structured roughly the same:
12//
13//     while at <= len(haystack) - CHUNK_SIZE:
14//         let candidate = find_candidate_in_chunk(haystack, at)
15//         if not all zeroes(candidate):
16//             if match = verify(haystack, at, candidate):
17//                 return match
18//
19// For the most part, this remains unchanged. The parts that vary are the
20// verification routine (for slim vs fat Teddy) and the candidate extraction
21// (based on the number of masks).
22//
23// In the code below, a "candidate" corresponds to a single vector with 8-bit
24// lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the
25// jth lane if and only if the byte occurring at position `j` is in the
26// bucket `i` (where the `j`th position is the position in the current window
27// of the haystack, which is always 16 or 32 bytes). Note to be careful here:
28// the ith bit and the jth lane correspond to the least significant bits of the
29// vector. So when visualizing how the current window of bytes is stored in a
30// vector, you often need to flip it around. For example, the text `abcd` in a
31// 4-byte vector would look like this:
32//
33//     01100100 01100011 01100010 01100001
34//         d        c        b        a
35//
36// When the mask length is 1, then finding the candidate is pretty straight
37// forward: you just apply the shuffle indices (from the haystack window) to
38// the masks, and then AND them together, as described in the README. But for
39// masks of length 2 and 3, you need to keep a little state. Specifically,
40// you need to store the final 1 (for mask length 2) or 2 (for mask length 3)
41// bytes of the candidate for use when searching the next window. This is for
42// handling matches that span two windows.
43//
44// With respect to the repeated code, it would likely be possible to reduce
45// the number of copies of code below using polymorphism, but I find this
46// formulation clearer instead of needing to reason through generics. However,
47// I admit, there may be a simpler generic construction that I'm missing.
48//
49// All variants are fairly heavily tested in src/packed/tests.rs.
50
51use core::{arch::x86_64::*, mem};
52
53use alloc::vec::Vec;
54
55use crate::{
56    packed::{
57        pattern::{PatternID, Patterns},
58        teddy::compile,
59        vector,
60    },
61    util::search::Match,
62};
63
64/// The Teddy runtime.
65///
66/// A Teddy runtime can be used to quickly search for occurrences of one or
67/// more patterns. While it does not scale to an arbitrary number of patterns
68/// like Aho-Corasick, it does find occurrences for a small set of patterns
69/// much more quickly than Aho-Corasick.
70///
71/// Teddy cannot run on small haystacks below a certain size, which is
72/// dependent on the type of matcher used. This size can be queried via the
73/// `minimum_len` method. Violating this will result in a panic.
74///
75/// Finally, when callers use a Teddy runtime, they must provide precisely the
76/// patterns used to construct the Teddy matcher. Violating this will result
77/// in either a panic or incorrect results, but will never sacrifice memory
78/// safety.
79#[derive(Clone, Debug)]
80pub struct Teddy {
81    /// The allocation of patterns in buckets. This only contains the IDs of
82    /// patterns. In order to do full verification, callers must provide the
83    /// actual patterns when using Teddy.
84    pub buckets: Vec<Vec<PatternID>>,
85    /// The maximum identifier of a pattern. This is used as a sanity check to
86    /// ensure that the patterns provided by the caller are the same as the
87    /// patterns that were used to compile the matcher. This sanity check
88    /// permits safely eliminating bounds checks regardless of what patterns
89    /// are provided by the caller.
90    ///
91    /// Note that users of the aho-corasick crate cannot get this wrong. Only
92    /// code internal to this crate can get it wrong, since neither `Patterns`
93    /// type nor the Teddy runtime are public API items.
94    pub max_pattern_id: PatternID,
95    /// The actual runtime to use.
96    pub exec: Exec,
97}
98
99impl Teddy {
100    /// Return the first occurrence of a match in the given haystack after or
101    /// starting at `at`.
102    ///
103    /// The patterns provided must be precisely the same patterns given to the
104    /// Teddy builder, otherwise this may panic or produce incorrect results.
105    ///
106    /// All matches are consistent with the match semantics (leftmost-first or
107    /// leftmost-longest) set on `pats`.
108    pub fn find_at(
109        &self,
110        pats: &Patterns,
111        haystack: &[u8],
112        at: usize,
113    ) -> Option<Match> {
114        // This assert is a bit subtle, but it's an important guarantee.
115        // Namely, if the maximum pattern ID seen by Teddy is the same as the
116        // one in the patterns given, then we are guaranteed that every pattern
117        // ID in all Teddy buckets are valid indices into `pats`. While this
118        // is nominally true, there is no guarantee that callers provide the
119        // same `pats` to both the Teddy builder and the searcher, which would
120        // otherwise make `find_at` unsafe to call. But this assert lets us
121        // keep this routine safe and eliminate an important bounds check in
122        // verification.
123        assert_eq!(
124            self.max_pattern_id,
125            pats.max_pattern_id(),
126            "teddy must be called with same patterns it was built with",
127        );
128        // SAFETY: The haystack must have at least a minimum number of bytes
129        // for Teddy to be able to work. The minimum number varies depending on
130        // which matcher is used below. If this is violated, then it's possible
131        // for searching to do out-of-bounds writes.
132        assert!(haystack[at..].len() >= self.minimum_len());
133        // SAFETY: The various Teddy matchers are always safe to call because
134        // the Teddy builder guarantees that a particular Exec variant is
135        // built only when it can be run the current CPU. That is, the Teddy
136        // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is
137        // enabled. That is, our dynamic CPU feature detection is performed
138        // once in the builder, and we rely on the type system to avoid needing
139        // to do it again.
140        unsafe {
141            match self.exec {
142                Exec::TeddySlim1Mask128(ref e) => {
143                    e.find_at(pats, self, haystack, at)
144                }
145                Exec::TeddySlim1Mask256(ref e) => {
146                    e.find_at(pats, self, haystack, at)
147                }
148                Exec::TeddyFat1Mask256(ref e) => {
149                    e.find_at(pats, self, haystack, at)
150                }
151                Exec::TeddySlim2Mask128(ref e) => {
152                    e.find_at(pats, self, haystack, at)
153                }
154                Exec::TeddySlim2Mask256(ref e) => {
155                    e.find_at(pats, self, haystack, at)
156                }
157                Exec::TeddyFat2Mask256(ref e) => {
158                    e.find_at(pats, self, haystack, at)
159                }
160                Exec::TeddySlim3Mask128(ref e) => {
161                    e.find_at(pats, self, haystack, at)
162                }
163                Exec::TeddySlim3Mask256(ref e) => {
164                    e.find_at(pats, self, haystack, at)
165                }
166                Exec::TeddyFat3Mask256(ref e) => {
167                    e.find_at(pats, self, haystack, at)
168                }
169                Exec::TeddySlim4Mask128(ref e) => {
170                    e.find_at(pats, self, haystack, at)
171                }
172                Exec::TeddySlim4Mask256(ref e) => {
173                    e.find_at(pats, self, haystack, at)
174                }
175                Exec::TeddyFat4Mask256(ref e) => {
176                    e.find_at(pats, self, haystack, at)
177                }
178            }
179        }
180    }
181
182    /// Returns the minimum length of a haystack that must be provided by
183    /// callers to this Teddy searcher. Providing a haystack shorter than this
184    /// will result in a panic, but will never violate memory safety.
185    pub fn minimum_len(&self) -> usize {
186        // SAFETY: These values must be correct in order to ensure safety.
187        // The Teddy runtime assumes their haystacks have at least these
188        // lengths. Violating this will sacrifice memory safety.
189        match self.exec {
190            Exec::TeddySlim1Mask128(_) => 16,
191            Exec::TeddySlim1Mask256(_) => 32,
192            Exec::TeddyFat1Mask256(_) => 16,
193            Exec::TeddySlim2Mask128(_) => 17,
194            Exec::TeddySlim2Mask256(_) => 33,
195            Exec::TeddyFat2Mask256(_) => 17,
196            Exec::TeddySlim3Mask128(_) => 18,
197            Exec::TeddySlim3Mask256(_) => 34,
198            Exec::TeddyFat3Mask256(_) => 18,
199            Exec::TeddySlim4Mask128(_) => 19,
200            Exec::TeddySlim4Mask256(_) => 35,
201            Exec::TeddyFat4Mask256(_) => 19,
202        }
203    }
204
205    /// Returns the approximate total amount of heap used by this searcher, in
206    /// units of bytes.
207    pub fn memory_usage(&self) -> usize {
208        let num_patterns = self.max_pattern_id as usize + 1;
209        self.buckets.len() * mem::size_of::<Vec<PatternID>>()
210            + num_patterns * mem::size_of::<PatternID>()
211    }
212
213    /// Runs the verification routine for Slim 128-bit Teddy.
214    ///
215    /// The candidate given should be a collection of 8-bit bitsets (one bitset
216    /// per lane), where the ith bit is set in the jth lane if and only if the
217    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
218    ///
219    /// This is not safe to call unless the SSSE3 target feature is enabled.
220    /// The `target_feature` attribute is not applied since this function is
221    /// always forcefully inlined.
222    #[inline(always)]
223    unsafe fn verify128(
224        &self,
225        pats: &Patterns,
226        haystack: &[u8],
227        at: usize,
228        cand: __m128i,
229    ) -> Option<Match> {
230        debug_assert!(!vector::is_all_zeroes128(cand));
231        debug_assert_eq!(8, self.buckets.len());
232
233        // Convert the candidate into 64-bit chunks, and then verify each of
234        // those chunks.
235        let parts = vector::unpack64x128(cand);
236        for (i, &part) in parts.iter().enumerate() {
237            let pos = at + i * 8;
238            if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
239                return Some(m);
240            }
241        }
242        None
243    }
244
245    /// Runs the verification routine for Slim 256-bit Teddy.
246    ///
247    /// The candidate given should be a collection of 8-bit bitsets (one bitset
248    /// per lane), where the ith bit is set in the jth lane if and only if the
249    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
250    ///
251    /// This is not safe to call unless the AVX2 target feature is enabled.
252    /// The `target_feature` attribute is not applied since this function is
253    /// always forcefully inlined.
254    #[inline(always)]
255    unsafe fn verify256(
256        &self,
257        pats: &Patterns,
258        haystack: &[u8],
259        at: usize,
260        cand: __m256i,
261    ) -> Option<Match> {
262        debug_assert!(!vector::is_all_zeroes256(cand));
263        debug_assert_eq!(8, self.buckets.len());
264
265        // Convert the candidate into 64-bit chunks, and then verify each of
266        // those chunks.
267        let parts = vector::unpack64x256(cand);
268        let mut pos = at;
269        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[0]) {
270            return Some(m);
271        }
272        pos += 8;
273        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[1]) {
274            return Some(m);
275        }
276        pos += 8;
277        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[2]) {
278            return Some(m);
279        }
280        pos += 8;
281        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[3]) {
282            return Some(m);
283        }
284        None
285    }
286
287    /// Runs the verification routine for Fat 256-bit Teddy.
288    ///
289    /// The candidate given should be a collection of 8-bit bitsets (one bitset
290    /// per lane), where the ith bit is set in the jth lane if and only if the
291    /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
292    /// bucket `j < 16 ? i : i + 8`.
293    ///
294    /// This is not safe to call unless the AVX2 target feature is enabled.
295    /// The `target_feature` attribute is not applied since this function is
296    /// always forcefully inlined.
297    #[inline(always)]
298    unsafe fn verify_fat256(
299        &self,
300        pats: &Patterns,
301        haystack: &[u8],
302        at: usize,
303        cand: __m256i,
304    ) -> Option<Match> {
305        debug_assert!(!vector::is_all_zeroes256(cand));
306        debug_assert_eq!(16, self.buckets.len());
307
308        // This is a bit tricky, but we basically want to convert our
309        // candidate, which looks like this
310        //
311        //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
312        //
313        // where each a(i) is an 8-bit bitset corresponding to the activated
314        // buckets, to this
315        //
316        //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
317        //
318        // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
319        // to the same bytes in the haystack in the low 128-bits (so we only
320        // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
321        //
322        // The verification routine wants to look at all potentially matching
323        // buckets before moving on to the next lane. So for example, both
324        // a16 and a00 both correspond to the first byte in our window; a00
325        // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
326        // a16 should be checked before a01. So the transformation shown above
327        // allows us to use our normal verification procedure with one small
328        // change: we treat each bitset as 16 bits instead of 8 bits.
329
330        // Swap the 128-bit lanes in the candidate vector.
331        let swap = _mm256_permute4x64_epi64(cand, 0x4E);
332        // Interleave the bytes from the low 128-bit lanes, starting with
333        // cand first.
334        let r1 = _mm256_unpacklo_epi8(cand, swap);
335        // Interleave the bytes from the high 128-bit lanes, starting with
336        // cand first.
337        let r2 = _mm256_unpackhi_epi8(cand, swap);
338        // Now just take the 2 low 64-bit integers from both r1 and r2. We
339        // can drop the high 64-bit integers because they are a mirror image
340        // of the low 64-bit integers. All we care about are the low 128-bit
341        // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
342        // laid out in the desired order, as described above.
343        let parts = vector::unpacklo64x256(r1, r2);
344        for (i, &part) in parts.iter().enumerate() {
345            let pos = at + i * 4;
346            if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
347                return Some(m);
348            }
349        }
350        None
351    }
352
353    /// Verify whether there are any matches starting at or after `at` in the
354    /// given `haystack`. The candidate given should correspond to either 8-bit
355    /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
356    #[inline(always)]
357    fn verify64(
358        &self,
359        pats: &Patterns,
360        bucket_count: usize,
361        haystack: &[u8],
362        at: usize,
363        mut cand: u64,
364    ) -> Option<Match> {
365        // N.B. While the bucket count is known from self.buckets.len(),
366        // requiring it as a parameter makes it easier for the optimizer to
367        // know its value, and thus produce more efficient codegen.
368        debug_assert!(bucket_count == 8 || bucket_count == 16);
369        while cand != 0 {
370            let bit = cand.trailing_zeros() as usize;
371            cand &= !(1 << bit);
372
373            let at = at + (bit / bucket_count);
374            let bucket = bit % bucket_count;
375            if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
376                return Some(m);
377            }
378        }
379        None
380    }
381
382    /// Verify whether there are any matches starting at `at` in the given
383    /// `haystack` corresponding only to patterns in the given bucket.
384    #[inline(always)]
385    fn verify_bucket(
386        &self,
387        pats: &Patterns,
388        haystack: &[u8],
389        bucket: usize,
390        at: usize,
391    ) -> Option<Match> {
392        // Forcing this function to not inline and be "cold" seems to help
393        // the codegen for Teddy overall. Interestingly, this is good for a
394        // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
395        // others). Overall, this seems like a problem with codegen, since
396        // creating the Match itself is a very small amount of code.
397        #[cold]
398        #[inline(never)]
399        fn match_from_span(
400            pati: PatternID,
401            start: usize,
402            end: usize,
403        ) -> Match {
404            Match::must(pati as usize, start..end)
405        }
406
407        // N.B. The bounds check for this bucket lookup *should* be elided
408        // since we assert the number of buckets in each `find_at` routine,
409        // and the compiler can prove that the `% 8` (or `% 16`) in callers
410        // of this routine will always be in bounds.
411        for &pati in &self.buckets[bucket] {
412            // SAFETY: This is safe because we are guaranteed that every
413            // index in a Teddy bucket is a valid index into `pats`. This
414            // guarantee is upheld by the assert checking `max_pattern_id` in
415            // the beginning of `find_at` above.
416            //
417            // This explicit bounds check elision is (amazingly) good for a
418            // 25-50% boost in some benchmarks, particularly ones with a lot
419            // of short literals.
420            let pat = unsafe { pats.get_unchecked(pati) };
421            if pat.is_prefix(&haystack[at..]) {
422                return Some(match_from_span(pati, at, at + pat.len()));
423            }
424        }
425        None
426    }
427}
428
429/// Exec represents the different search strategies supported by the Teddy
430/// runtime.
431///
432/// This enum is an important safety abstraction. Namely, callers should only
433/// construct a variant in this enum if it is safe to execute its corresponding
434/// target features on the current CPU. The 128-bit searchers require SSSE3,
435/// while the 256-bit searchers require AVX2.
436#[derive(Clone, Debug)]
437pub enum Exec {
438    TeddySlim1Mask128(TeddySlim1Mask128),
439    TeddySlim1Mask256(TeddySlim1Mask256),
440    TeddyFat1Mask256(TeddyFat1Mask256),
441    TeddySlim2Mask128(TeddySlim2Mask128),
442    TeddySlim2Mask256(TeddySlim2Mask256),
443    TeddyFat2Mask256(TeddyFat2Mask256),
444    TeddySlim3Mask128(TeddySlim3Mask128),
445    TeddySlim3Mask256(TeddySlim3Mask256),
446    TeddyFat3Mask256(TeddyFat3Mask256),
447    TeddySlim4Mask128(TeddySlim4Mask128),
448    TeddySlim4Mask256(TeddySlim4Mask256),
449    TeddyFat4Mask256(TeddyFat4Mask256),
450}
451
452// Most of the code below remains undocumented because they are effectively
453// repeated versions of themselves. The general structure is described in the
454// README and in the comments above.
455
456#[derive(Clone, Debug)]
457pub struct TeddySlim1Mask128 {
458    pub mask1: Mask128,
459}
460
461impl TeddySlim1Mask128 {
462    #[target_feature(enable = "ssse3")]
463    unsafe fn find_at(
464        &self,
465        pats: &Patterns,
466        teddy: &Teddy,
467        haystack: &[u8],
468        mut at: usize,
469    ) -> Option<Match> {
470        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
471        // This assert helps eliminate bounds checks for bucket lookups in
472        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
473        assert_eq!(8, teddy.buckets.len());
474
475        let len = haystack.len();
476        while at <= len - 16 {
477            let c = self.candidate(haystack, at);
478            if !vector::is_all_zeroes128(c) {
479                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
480                    return Some(m);
481                }
482            }
483            at += 16;
484        }
485        if at < len {
486            at = len - 16;
487            let c = self.candidate(haystack, at);
488            if !vector::is_all_zeroes128(c) {
489                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
490                    return Some(m);
491                }
492            }
493        }
494        None
495    }
496
497    #[inline(always)]
498    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
499        debug_assert!(haystack[at..].len() >= 16);
500
501        let chunk = vector::loadu128(haystack, at);
502        members1m128(chunk, self.mask1)
503    }
504}
505
506#[derive(Clone, Debug)]
507pub struct TeddySlim1Mask256 {
508    pub mask1: Mask256,
509}
510
511impl TeddySlim1Mask256 {
512    #[target_feature(enable = "avx2")]
513    unsafe fn find_at(
514        &self,
515        pats: &Patterns,
516        teddy: &Teddy,
517        haystack: &[u8],
518        mut at: usize,
519    ) -> Option<Match> {
520        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
521        // This assert helps eliminate bounds checks for bucket lookups in
522        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
523        assert_eq!(8, teddy.buckets.len());
524
525        let len = haystack.len();
526        while at <= len - 32 {
527            let c = self.candidate(haystack, at);
528            if !vector::is_all_zeroes256(c) {
529                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
530                    return Some(m);
531                }
532            }
533            at += 32;
534        }
535        if at < len {
536            at = len - 32;
537            let c = self.candidate(haystack, at);
538            if !vector::is_all_zeroes256(c) {
539                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
540                    return Some(m);
541                }
542            }
543        }
544        None
545    }
546
547    #[inline(always)]
548    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
549        debug_assert!(haystack[at..].len() >= 32);
550
551        let chunk = vector::loadu256(haystack, at);
552        members1m256(chunk, self.mask1)
553    }
554}
555
556#[derive(Clone, Debug)]
557pub struct TeddyFat1Mask256 {
558    pub mask1: Mask256,
559}
560
561impl TeddyFat1Mask256 {
562    #[target_feature(enable = "avx2")]
563    unsafe fn find_at(
564        &self,
565        pats: &Patterns,
566        teddy: &Teddy,
567        haystack: &[u8],
568        mut at: usize,
569    ) -> Option<Match> {
570        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
571        // This assert helps eliminate bounds checks for bucket lookups in
572        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
573        assert_eq!(16, teddy.buckets.len());
574
575        let len = haystack.len();
576        while at <= len - 16 {
577            let c = self.candidate(haystack, at);
578            if !vector::is_all_zeroes256(c) {
579                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
580                    return Some(m);
581                }
582            }
583            at += 16;
584        }
585        if at < len {
586            at = len - 16;
587            let c = self.candidate(haystack, at);
588            if !vector::is_all_zeroes256(c) {
589                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
590                    return Some(m);
591                }
592            }
593        }
594        None
595    }
596
597    #[inline(always)]
598    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
599        debug_assert!(haystack[at..].len() >= 16);
600
601        let chunk =
602            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
603        members1m256(chunk, self.mask1)
604    }
605}
606
607#[derive(Clone, Debug)]
608pub struct TeddySlim2Mask128 {
609    pub mask1: Mask128,
610    pub mask2: Mask128,
611}
612
613impl TeddySlim2Mask128 {
614    #[target_feature(enable = "ssse3")]
615    unsafe fn find_at(
616        &self,
617        pats: &Patterns,
618        teddy: &Teddy,
619        haystack: &[u8],
620        mut at: usize,
621    ) -> Option<Match> {
622        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
623        // This assert helps eliminate bounds checks for bucket lookups in
624        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
625        assert_eq!(8, teddy.buckets.len());
626
627        at += 1;
628        let len = haystack.len();
629        let mut prev0 = vector::ones128();
630        while at <= len - 16 {
631            let c = self.candidate(haystack, at, &mut prev0);
632            if !vector::is_all_zeroes128(c) {
633                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
634                    return Some(m);
635                }
636            }
637            at += 16;
638        }
639        if at < len {
640            at = len - 16;
641            prev0 = vector::ones128();
642
643            let c = self.candidate(haystack, at, &mut prev0);
644            if !vector::is_all_zeroes128(c) {
645                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
646                    return Some(m);
647                }
648            }
649        }
650        None
651    }
652
653    #[inline(always)]
654    unsafe fn candidate(
655        &self,
656        haystack: &[u8],
657        at: usize,
658        prev0: &mut __m128i,
659    ) -> __m128i {
660        debug_assert!(haystack[at..].len() >= 16);
661
662        let chunk = vector::loadu128(haystack, at);
663        let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
664        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
665        _mm_and_si128(res0prev0, res1)
666    }
667}
668
669#[derive(Clone, Debug)]
670pub struct TeddySlim2Mask256 {
671    pub mask1: Mask256,
672    pub mask2: Mask256,
673}
674
675impl TeddySlim2Mask256 {
676    #[target_feature(enable = "avx2")]
677    unsafe fn find_at(
678        &self,
679        pats: &Patterns,
680        teddy: &Teddy,
681        haystack: &[u8],
682        mut at: usize,
683    ) -> Option<Match> {
684        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
685        // This assert helps eliminate bounds checks for bucket lookups in
686        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
687        assert_eq!(8, teddy.buckets.len());
688
689        at += 1;
690        let len = haystack.len();
691        let mut prev0 = vector::ones256();
692        while at <= len - 32 {
693            let c = self.candidate(haystack, at, &mut prev0);
694            if !vector::is_all_zeroes256(c) {
695                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
696                    return Some(m);
697                }
698            }
699            at += 32;
700        }
701        if at < len {
702            at = len - 32;
703            prev0 = vector::ones256();
704
705            let c = self.candidate(haystack, at, &mut prev0);
706            if !vector::is_all_zeroes256(c) {
707                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
708                    return Some(m);
709                }
710            }
711        }
712        None
713    }
714
715    #[inline(always)]
716    unsafe fn candidate(
717        &self,
718        haystack: &[u8],
719        at: usize,
720        prev0: &mut __m256i,
721    ) -> __m256i {
722        debug_assert!(haystack[at..].len() >= 32);
723
724        let chunk = vector::loadu256(haystack, at);
725        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
726        let res0prev0 = vector::alignr256_15(res0, *prev0);
727        let res = _mm256_and_si256(res0prev0, res1);
728        *prev0 = res0;
729        res
730    }
731}
732
733#[derive(Clone, Debug)]
734pub struct TeddyFat2Mask256 {
735    pub mask1: Mask256,
736    pub mask2: Mask256,
737}
738
739impl TeddyFat2Mask256 {
740    #[target_feature(enable = "avx2")]
741    unsafe fn find_at(
742        &self,
743        pats: &Patterns,
744        teddy: &Teddy,
745        haystack: &[u8],
746        mut at: usize,
747    ) -> Option<Match> {
748        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
749        // This assert helps eliminate bounds checks for bucket lookups in
750        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
751        assert_eq!(16, teddy.buckets.len());
752
753        at += 1;
754        let len = haystack.len();
755        let mut prev0 = vector::ones256();
756        while at <= len - 16 {
757            let c = self.candidate(haystack, at, &mut prev0);
758            if !vector::is_all_zeroes256(c) {
759                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
760                {
761                    return Some(m);
762                }
763            }
764            at += 16;
765        }
766        if at < len {
767            at = len - 16;
768            prev0 = vector::ones256();
769
770            let c = self.candidate(haystack, at, &mut prev0);
771            if !vector::is_all_zeroes256(c) {
772                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
773                {
774                    return Some(m);
775                }
776            }
777        }
778        None
779    }
780
781    #[inline(always)]
782    unsafe fn candidate(
783        &self,
784        haystack: &[u8],
785        at: usize,
786        prev0: &mut __m256i,
787    ) -> __m256i {
788        debug_assert!(haystack[at..].len() >= 16);
789
790        let chunk =
791            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
792        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
793        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
794        let res = _mm256_and_si256(res0prev0, res1);
795        *prev0 = res0;
796        res
797    }
798}
799
800#[derive(Clone, Debug)]
801pub struct TeddySlim3Mask128 {
802    pub mask1: Mask128,
803    pub mask2: Mask128,
804    pub mask3: Mask128,
805}
806
807impl TeddySlim3Mask128 {
808    #[target_feature(enable = "ssse3")]
809    unsafe fn find_at(
810        &self,
811        pats: &Patterns,
812        teddy: &Teddy,
813        haystack: &[u8],
814        mut at: usize,
815    ) -> Option<Match> {
816        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
817        // This assert helps eliminate bounds checks for bucket lookups in
818        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
819        assert_eq!(8, teddy.buckets.len());
820
821        at += 2;
822        let len = haystack.len();
823        let (mut prev0, mut prev1) = (vector::ones128(), vector::ones128());
824        while at <= len - 16 {
825            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
826            if !vector::is_all_zeroes128(c) {
827                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
828                    return Some(m);
829                }
830            }
831            at += 16;
832        }
833        if at < len {
834            at = len - 16;
835            prev0 = vector::ones128();
836            prev1 = vector::ones128();
837
838            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
839            if !vector::is_all_zeroes128(c) {
840                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
841                    return Some(m);
842                }
843            }
844        }
845        None
846    }
847
848    #[inline(always)]
849    unsafe fn candidate(
850        &self,
851        haystack: &[u8],
852        at: usize,
853        prev0: &mut __m128i,
854        prev1: &mut __m128i,
855    ) -> __m128i {
856        debug_assert!(haystack[at..].len() >= 16);
857
858        let chunk = vector::loadu128(haystack, at);
859        let (res0, res1, res2) =
860            members3m128(chunk, self.mask1, self.mask2, self.mask3);
861        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
862        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
863        let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
864        *prev0 = res0;
865        *prev1 = res1;
866        res
867    }
868}
869
870#[derive(Clone, Debug)]
871pub struct TeddySlim3Mask256 {
872    pub mask1: Mask256,
873    pub mask2: Mask256,
874    pub mask3: Mask256,
875}
876
877impl TeddySlim3Mask256 {
878    #[target_feature(enable = "avx2")]
879    unsafe fn find_at(
880        &self,
881        pats: &Patterns,
882        teddy: &Teddy,
883        haystack: &[u8],
884        mut at: usize,
885    ) -> Option<Match> {
886        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
887        // This assert helps eliminate bounds checks for bucket lookups in
888        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
889        assert_eq!(8, teddy.buckets.len());
890
891        at += 2;
892        let len = haystack.len();
893        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
894        while at <= len - 32 {
895            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
896            if !vector::is_all_zeroes256(c) {
897                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
898                    return Some(m);
899                }
900            }
901            at += 32;
902        }
903        if at < len {
904            at = len - 32;
905            prev0 = vector::ones256();
906            prev1 = vector::ones256();
907
908            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
909            if !vector::is_all_zeroes256(c) {
910                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
911                    return Some(m);
912                }
913            }
914        }
915        None
916    }
917
918    #[inline(always)]
919    unsafe fn candidate(
920        &self,
921        haystack: &[u8],
922        at: usize,
923        prev0: &mut __m256i,
924        prev1: &mut __m256i,
925    ) -> __m256i {
926        debug_assert!(haystack[at..].len() >= 32);
927
928        let chunk = vector::loadu256(haystack, at);
929        let (res0, res1, res2) =
930            members3m256(chunk, self.mask1, self.mask2, self.mask3);
931        let res0prev0 = vector::alignr256_14(res0, *prev0);
932        let res1prev1 = vector::alignr256_15(res1, *prev1);
933        let res =
934            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
935        *prev0 = res0;
936        *prev1 = res1;
937        res
938    }
939}
940
941#[derive(Clone, Debug)]
942pub struct TeddyFat3Mask256 {
943    pub mask1: Mask256,
944    pub mask2: Mask256,
945    pub mask3: Mask256,
946}
947
948impl TeddyFat3Mask256 {
949    #[target_feature(enable = "avx2")]
950    unsafe fn find_at(
951        &self,
952        pats: &Patterns,
953        teddy: &Teddy,
954        haystack: &[u8],
955        mut at: usize,
956    ) -> Option<Match> {
957        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
958        // This assert helps eliminate bounds checks for bucket lookups in
959        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
960        assert_eq!(16, teddy.buckets.len());
961
962        at += 2;
963        let len = haystack.len();
964        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
965        while at <= len - 16 {
966            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
967            if !vector::is_all_zeroes256(c) {
968                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
969                {
970                    return Some(m);
971                }
972            }
973            at += 16;
974        }
975        if at < len {
976            at = len - 16;
977            prev0 = vector::ones256();
978            prev1 = vector::ones256();
979
980            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
981            if !vector::is_all_zeroes256(c) {
982                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
983                {
984                    return Some(m);
985                }
986            }
987        }
988        None
989    }
990
991    #[inline(always)]
992    unsafe fn candidate(
993        &self,
994        haystack: &[u8],
995        at: usize,
996        prev0: &mut __m256i,
997        prev1: &mut __m256i,
998    ) -> __m256i {
999        debug_assert!(haystack[at..].len() >= 16);
1000
1001        let chunk =
1002            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
1003        let (res0, res1, res2) =
1004            members3m256(chunk, self.mask1, self.mask2, self.mask3);
1005        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
1006        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
1007        let res =
1008            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
1009        *prev0 = res0;
1010        *prev1 = res1;
1011        res
1012    }
1013}
1014
1015#[derive(Clone, Debug)]
1016pub struct TeddySlim4Mask128 {
1017    pub mask1: Mask128,
1018    pub mask2: Mask128,
1019    pub mask3: Mask128,
1020    pub mask4: Mask128,
1021}
1022
1023impl TeddySlim4Mask128 {
1024    #[target_feature(enable = "ssse3")]
1025    unsafe fn find_at(
1026        &self,
1027        pats: &Patterns,
1028        teddy: &Teddy,
1029        haystack: &[u8],
1030        mut at: usize,
1031    ) -> Option<Match> {
1032        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
1033        // This assert helps eliminate bounds checks for bucket lookups in
1034        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
1035        assert_eq!(8, teddy.buckets.len());
1036
1037        at += 3;
1038        let len = haystack.len();
1039        let mut prev0 = vector::ones128();
1040        let mut prev1 = vector::ones128();
1041        let mut prev2 = vector::ones128();
1042        while at <= len - 16 {
1043            let c = self
1044                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
1045            if !vector::is_all_zeroes128(c) {
1046                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
1047                    return Some(m);
1048                }
1049            }
1050            at += 16;
1051        }
1052        if at < len {
1053            at = len - 16;
1054            prev0 = vector::ones128();
1055            prev1 = vector::ones128();
1056            prev2 = vector::ones128();
1057
1058            let c = self
1059                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
1060            if !vector::is_all_zeroes128(c) {
1061                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
1062                    return Some(m);
1063                }
1064            }
1065        }
1066        None
1067    }
1068
1069    #[inline(always)]
1070    unsafe fn candidate(
1071        &self,
1072        haystack: &[u8],
1073        at: usize,
1074        prev0: &mut __m128i,
1075        prev1: &mut __m128i,
1076        prev2: &mut __m128i,
1077    ) -> __m128i {
1078        debug_assert!(haystack[at..].len() >= 16);
1079
1080        let chunk = vector::loadu128(haystack, at);
1081        let (res0, res1, res2, res3) = members4m128(
1082            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
1083        );
1084        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 13);
1085        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 14);
1086        let res2prev2 = _mm_alignr_epi8(res2, *prev2, 15);
1087        let res = _mm_and_si128(
1088            _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2prev2),
1089            res3,
1090        );
1091        *prev0 = res0;
1092        *prev1 = res1;
1093        *prev2 = res2;
1094        res
1095    }
1096}
1097
1098#[derive(Clone, Debug)]
1099pub struct TeddySlim4Mask256 {
1100    pub mask1: Mask256,
1101    pub mask2: Mask256,
1102    pub mask3: Mask256,
1103    pub mask4: Mask256,
1104}
1105
1106impl TeddySlim4Mask256 {
1107    #[target_feature(enable = "avx2")]
1108    unsafe fn find_at(
1109        &self,
1110        pats: &Patterns,
1111        teddy: &Teddy,
1112        haystack: &[u8],
1113        mut at: usize,
1114    ) -> Option<Match> {
1115        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
1116        // This assert helps eliminate bounds checks for bucket lookups in
1117        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
1118        assert_eq!(8, teddy.buckets.len());
1119
1120        at += 3;
1121        let len = haystack.len();
1122        let mut prev0 = vector::ones256();
1123        let mut prev1 = vector::ones256();
1124        let mut prev2 = vector::ones256();
1125        while at <= len - 32 {
1126            let c = self
1127                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
1128            if !vector::is_all_zeroes256(c) {
1129                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
1130                    return Some(m);
1131                }
1132            }
1133            at += 32;
1134        }
1135        if at < len {
1136            at = len - 32;
1137            prev0 = vector::ones256();
1138            prev1 = vector::ones256();
1139            prev2 = vector::ones256();
1140
1141            let c = self
1142                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
1143            if !vector::is_all_zeroes256(c) {
1144                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
1145                    return Some(m);
1146                }
1147            }
1148        }
1149        None
1150    }
1151
1152    #[inline(always)]
1153    unsafe fn candidate(
1154        &self,
1155        haystack: &[u8],
1156        at: usize,
1157        prev0: &mut __m256i,
1158        prev1: &mut __m256i,
1159        prev2: &mut __m256i,
1160    ) -> __m256i {
1161        debug_assert!(haystack[at..].len() >= 32);
1162
1163        let chunk = vector::loadu256(haystack, at);
1164        let (res0, res1, res2, res3) = members4m256(
1165            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
1166        );
1167        let res0prev0 = vector::alignr256_13(res0, *prev0);
1168        let res1prev1 = vector::alignr256_14(res1, *prev1);
1169        let res2prev2 = vector::alignr256_15(res2, *prev2);
1170        let res = _mm256_and_si256(
1171            _mm256_and_si256(
1172                _mm256_and_si256(res0prev0, res1prev1),
1173                res2prev2,
1174            ),
1175            res3,
1176        );
1177        *prev0 = res0;
1178        *prev1 = res1;
1179        *prev2 = res2;
1180        res
1181    }
1182}
1183
1184#[derive(Clone, Debug)]
1185pub struct TeddyFat4Mask256 {
1186    pub mask1: Mask256,
1187    pub mask2: Mask256,
1188    pub mask3: Mask256,
1189    pub mask4: Mask256,
1190}
1191
1192impl TeddyFat4Mask256 {
1193    #[target_feature(enable = "avx2")]
1194    unsafe fn find_at(
1195        &self,
1196        pats: &Patterns,
1197        teddy: &Teddy,
1198        haystack: &[u8],
1199        mut at: usize,
1200    ) -> Option<Match> {
1201        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
1202        // This assert helps eliminate bounds checks for bucket lookups in
1203        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
1204        assert_eq!(16, teddy.buckets.len());
1205
1206        at += 3;
1207        let len = haystack.len();
1208        let mut prev0 = vector::ones256();
1209        let mut prev1 = vector::ones256();
1210        let mut prev2 = vector::ones256();
1211        while at <= len - 16 {
1212            let c = self
1213                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
1214            if !vector::is_all_zeroes256(c) {
1215                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
1216                {
1217                    return Some(m);
1218                }
1219            }
1220            at += 16;
1221        }
1222        if at < len {
1223            at = len - 16;
1224            prev0 = vector::ones256();
1225            prev1 = vector::ones256();
1226            prev2 = vector::ones256();
1227
1228            let c = self
1229                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
1230            if !vector::is_all_zeroes256(c) {
1231                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
1232                {
1233                    return Some(m);
1234                }
1235            }
1236        }
1237        None
1238    }
1239
1240    #[inline(always)]
1241    unsafe fn candidate(
1242        &self,
1243        haystack: &[u8],
1244        at: usize,
1245        prev0: &mut __m256i,
1246        prev1: &mut __m256i,
1247        prev2: &mut __m256i,
1248    ) -> __m256i {
1249        debug_assert!(haystack[at..].len() >= 16);
1250
1251        let chunk =
1252            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
1253        let (res0, res1, res2, res3) = members4m256(
1254            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
1255        );
1256        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 13);
1257        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 14);
1258        let res2prev2 = _mm256_alignr_epi8(res2, *prev2, 15);
1259        let res = _mm256_and_si256(
1260            _mm256_and_si256(
1261                _mm256_and_si256(res0prev0, res1prev1),
1262                res2prev2,
1263            ),
1264            res3,
1265        );
1266        *prev0 = res0;
1267        *prev1 = res1;
1268        *prev2 = res2;
1269        res
1270    }
1271}
1272
1273/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
1274/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
1275/// the nybble `j` is in the bucket `i` at a particular position.
1276#[derive(Clone, Copy, Debug)]
1277pub struct Mask128 {
1278    lo: __m128i,
1279    hi: __m128i,
1280}
1281
1282impl Mask128 {
1283    /// Create a new SIMD mask from the mask produced by the Teddy builder.
1284    pub fn new(mask: compile::Mask) -> Mask128 {
1285        // SAFETY: This is safe since [u8; 16] has the same representation
1286        // as __m128i.
1287        unsafe {
1288            Mask128 {
1289                lo: mem::transmute(mask.lo128()),
1290                hi: mem::transmute(mask.hi128()),
1291            }
1292        }
1293    }
1294}
1295
1296/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
1297/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
1298/// the nybble `j` is in the bucket `i` at a particular position.
1299///
1300/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
1301/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
1302/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
1303/// time. (Remember, the nybbles in the haystack are used as indices into these
1304/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
1305///
1306/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
1307/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets
1308/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
1309/// if it's in the higher 128 bits.
1310#[derive(Clone, Copy, Debug)]
1311pub struct Mask256 {
1312    lo: __m256i,
1313    hi: __m256i,
1314}
1315
1316impl Mask256 {
1317    /// Create a new SIMD mask from the mask produced by the Teddy builder.
1318    pub fn new(mask: compile::Mask) -> Mask256 {
1319        // SAFETY: This is safe since [u8; 32] has the same representation
1320        // as __m256i.
1321        unsafe {
1322            Mask256 {
1323                lo: mem::transmute(mask.lo256()),
1324                hi: mem::transmute(mask.hi256()),
1325            }
1326        }
1327    }
1328}
1329
1330// The "members" routines below are responsible for taking a chunk of bytes,
1331// a number of nybble masks and returning the result of using the masks to
1332// lookup bytes in the chunk. The results of the high and low nybble masks are
1333// AND'ed together, such that each candidate returned is a vector, with byte
1334// sized lanes, and where each lane is an 8-bit bitset corresponding to the
1335// buckets that contain the corresponding byte.
1336//
1337// In the case of masks of length greater than 1, callers will need to keep
1338// the results from the previous haystack's window, and then shift the vectors
1339// so that they all line up. Then they can be AND'ed together.
1340
1341/// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a
1342/// 16-byte window of the haystack (where the least significant byte
1343/// corresponds to the start of the window), and `mask1` corresponds to a
1344/// low/high mask for the first byte of all patterns that are being searched.
1345#[target_feature(enable = "ssse3")]
1346unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i {
1347    let lomask = _mm_set1_epi8(0xF);
1348    let hlo = _mm_and_si128(chunk, lomask);
1349    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
1350    _mm_and_si128(
1351        _mm_shuffle_epi8(mask1.lo, hlo),
1352        _mm_shuffle_epi8(mask1.hi, hhi),
1353    )
1354}
1355
1356/// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a
1357/// 32-byte window of the haystack (where the least significant byte
1358/// corresponds to the start of the window), and `mask1` corresponds to a
1359/// low/high mask for the first byte of all patterns that are being searched.
1360///
1361/// Note that this can also be used for Fat Teddy, where the high 128 bits in
1362/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1363/// window in the haystack.
1364#[target_feature(enable = "avx2")]
1365unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i {
1366    let lomask = _mm256_set1_epi8(0xF);
1367    let hlo = _mm256_and_si256(chunk, lomask);
1368    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
1369    _mm256_and_si256(
1370        _mm256_shuffle_epi8(mask1.lo, hlo),
1371        _mm256_shuffle_epi8(mask1.hi, hhi),
1372    )
1373}
1374
1375/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
1376/// to a 16-byte window of the haystack (where the least significant byte
1377/// corresponds to the start of the window), and the masks correspond to a
1378/// low/high mask for the first and second bytes of all patterns that are being
1379/// searched. The vectors returned correspond to candidates for the first and
1380/// second bytes in the patterns represented by the masks.
1381#[target_feature(enable = "ssse3")]
1382unsafe fn members2m128(
1383    chunk: __m128i,
1384    mask1: Mask128,
1385    mask2: Mask128,
1386) -> (__m128i, __m128i) {
1387    let lomask = _mm_set1_epi8(0xF);
1388    let hlo = _mm_and_si128(chunk, lomask);
1389    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
1390    let res0 = _mm_and_si128(
1391        _mm_shuffle_epi8(mask1.lo, hlo),
1392        _mm_shuffle_epi8(mask1.hi, hhi),
1393    );
1394    let res1 = _mm_and_si128(
1395        _mm_shuffle_epi8(mask2.lo, hlo),
1396        _mm_shuffle_epi8(mask2.hi, hhi),
1397    );
1398    (res0, res1)
1399}
1400
1401/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
1402/// to a 32-byte window of the haystack (where the least significant byte
1403/// corresponds to the start of the window), and the masks correspond to a
1404/// low/high mask for the first and second bytes of all patterns that are being
1405/// searched. The vectors returned correspond to candidates for the first and
1406/// second bytes in the patterns represented by the masks.
1407///
1408/// Note that this can also be used for Fat Teddy, where the high 128 bits in
1409/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1410/// window in the haystack.
1411#[target_feature(enable = "avx2")]
1412unsafe fn members2m256(
1413    chunk: __m256i,
1414    mask1: Mask256,
1415    mask2: Mask256,
1416) -> (__m256i, __m256i) {
1417    let lomask = _mm256_set1_epi8(0xF);
1418    let hlo = _mm256_and_si256(chunk, lomask);
1419    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
1420    let res0 = _mm256_and_si256(
1421        _mm256_shuffle_epi8(mask1.lo, hlo),
1422        _mm256_shuffle_epi8(mask1.hi, hhi),
1423    );
1424    let res1 = _mm256_and_si256(
1425        _mm256_shuffle_epi8(mask2.lo, hlo),
1426        _mm256_shuffle_epi8(mask2.hi, hhi),
1427    );
1428    (res0, res1)
1429}
1430
1431/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
1432/// to a 16-byte window of the haystack (where the least significant byte
1433/// corresponds to the start of the window), and the masks correspond to a
1434/// low/high mask for the first, second and third bytes of all patterns that
1435/// are being searched. The vectors returned correspond to candidates for the
1436/// first, second and third bytes in the patterns represented by the masks.
1437#[target_feature(enable = "ssse3")]
1438unsafe fn members3m128(
1439    chunk: __m128i,
1440    mask1: Mask128,
1441    mask2: Mask128,
1442    mask3: Mask128,
1443) -> (__m128i, __m128i, __m128i) {
1444    let lomask = _mm_set1_epi8(0xF);
1445    let hlo = _mm_and_si128(chunk, lomask);
1446    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
1447    let res0 = _mm_and_si128(
1448        _mm_shuffle_epi8(mask1.lo, hlo),
1449        _mm_shuffle_epi8(mask1.hi, hhi),
1450    );
1451    let res1 = _mm_and_si128(
1452        _mm_shuffle_epi8(mask2.lo, hlo),
1453        _mm_shuffle_epi8(mask2.hi, hhi),
1454    );
1455    let res2 = _mm_and_si128(
1456        _mm_shuffle_epi8(mask3.lo, hlo),
1457        _mm_shuffle_epi8(mask3.hi, hhi),
1458    );
1459    (res0, res1, res2)
1460}
1461
1462/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
1463/// to a 32-byte window of the haystack (where the least significant byte
1464/// corresponds to the start of the window), and the masks correspond to a
1465/// low/high mask for the first, second and third bytes of all patterns that
1466/// are being searched. The vectors returned correspond to candidates for the
1467/// first, second and third bytes in the patterns represented by the masks.
1468///
1469/// Note that this can also be used for Fat Teddy, where the high 128 bits in
1470/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1471/// window in the haystack.
1472#[target_feature(enable = "avx2")]
1473unsafe fn members3m256(
1474    chunk: __m256i,
1475    mask1: Mask256,
1476    mask2: Mask256,
1477    mask3: Mask256,
1478) -> (__m256i, __m256i, __m256i) {
1479    let lomask = _mm256_set1_epi8(0xF);
1480    let hlo = _mm256_and_si256(chunk, lomask);
1481    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
1482    let res0 = _mm256_and_si256(
1483        _mm256_shuffle_epi8(mask1.lo, hlo),
1484        _mm256_shuffle_epi8(mask1.hi, hhi),
1485    );
1486    let res1 = _mm256_and_si256(
1487        _mm256_shuffle_epi8(mask2.lo, hlo),
1488        _mm256_shuffle_epi8(mask2.hi, hhi),
1489    );
1490    let res2 = _mm256_and_si256(
1491        _mm256_shuffle_epi8(mask3.lo, hlo),
1492        _mm256_shuffle_epi8(mask3.hi, hhi),
1493    );
1494    (res0, res1, res2)
1495}
1496
1497/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
1498/// to a 16-byte window of the haystack (where the least significant byte
1499/// corresponds to the start of the window), and the masks correspond to a
1500/// low/high mask for the first, second, third and fourth bytes of all patterns
1501/// that are being searched. The vectors returned correspond to candidates for
1502/// the first, second, third and fourth bytes in the patterns represented by
1503/// the masks.
1504#[target_feature(enable = "ssse3")]
1505unsafe fn members4m128(
1506    chunk: __m128i,
1507    mask1: Mask128,
1508    mask2: Mask128,
1509    mask3: Mask128,
1510    mask4: Mask128,
1511) -> (__m128i, __m128i, __m128i, __m128i) {
1512    let lomask = _mm_set1_epi8(0xF);
1513    let hlo = _mm_and_si128(chunk, lomask);
1514    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
1515    let res0 = _mm_and_si128(
1516        _mm_shuffle_epi8(mask1.lo, hlo),
1517        _mm_shuffle_epi8(mask1.hi, hhi),
1518    );
1519    let res1 = _mm_and_si128(
1520        _mm_shuffle_epi8(mask2.lo, hlo),
1521        _mm_shuffle_epi8(mask2.hi, hhi),
1522    );
1523    let res2 = _mm_and_si128(
1524        _mm_shuffle_epi8(mask3.lo, hlo),
1525        _mm_shuffle_epi8(mask3.hi, hhi),
1526    );
1527    let res3 = _mm_and_si128(
1528        _mm_shuffle_epi8(mask4.lo, hlo),
1529        _mm_shuffle_epi8(mask4.hi, hhi),
1530    );
1531    (res0, res1, res2, res3)
1532}
1533
1534/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
1535/// to a 32-byte window of the haystack (where the least significant byte
1536/// corresponds to the start of the window), and the masks correspond to a
1537/// low/high mask for the first, second, third and fourth bytes of all patterns
1538/// that are being searched. The vectors returned correspond to candidates for
1539/// the first, second, third and fourth bytes in the patterns represented by
1540/// the masks.
1541///
1542/// Note that this can also be used for Fat Teddy, where the high 128 bits in
1543/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1544/// window in the haystack.
1545#[target_feature(enable = "avx2")]
1546unsafe fn members4m256(
1547    chunk: __m256i,
1548    mask1: Mask256,
1549    mask2: Mask256,
1550    mask3: Mask256,
1551    mask4: Mask256,
1552) -> (__m256i, __m256i, __m256i, __m256i) {
1553    let lomask = _mm256_set1_epi8(0xF);
1554    let hlo = _mm256_and_si256(chunk, lomask);
1555    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
1556    let res0 = _mm256_and_si256(
1557        _mm256_shuffle_epi8(mask1.lo, hlo),
1558        _mm256_shuffle_epi8(mask1.hi, hhi),
1559    );
1560    let res1 = _mm256_and_si256(
1561        _mm256_shuffle_epi8(mask2.lo, hlo),
1562        _mm256_shuffle_epi8(mask2.hi, hhi),
1563    );
1564    let res2 = _mm256_and_si256(
1565        _mm256_shuffle_epi8(mask3.lo, hlo),
1566        _mm256_shuffle_epi8(mask3.hi, hhi),
1567    );
1568    let res3 = _mm256_and_si256(
1569        _mm256_shuffle_epi8(mask4.lo, hlo),
1570        _mm256_shuffle_epi8(mask4.hi, hhi),
1571    );
1572    (res0, res1, res2, res3)
1573}