bstr/search/
prefilter.rs

1use core::mem;
2
3use bstr::BStr;
4use search::byte_frequencies::BYTE_FREQUENCIES;
5
6/// PrefilterState tracks state associated with the effectiveness of a
7/// prefilter. It is used to track how many bytes, on average, are skipped by
8/// the prefilter. If this average dips below a certain threshold over time,
9/// then the state renders the prefilter inert and stops using it.
10///
11/// A prefilter state should be created for each search. (Where creating an
12/// iterator via, e.g., `find_iter`, is treated as a single search.)
13#[derive(Clone, Debug)]
14pub struct PrefilterState {
15    /// The number of skips that has been executed.
16    skips: usize,
17    /// The total number of bytes that have been skipped.
18    skipped: usize,
19    /// The maximum length of a match. This is used to help determine how many
20    /// bytes on average should be skipped in order for a prefilter to be
21    /// effective.
22    max_match_len: usize,
23    /// Once this heuristic has been deemed ineffective, it will be inert
24    /// throughout the rest of its lifetime. This serves as a cheap way to
25    /// check inertness.
26    inert: bool,
27}
28
29impl PrefilterState {
30    /// The minimum number of skip attempts to try before considering whether
31    /// a prefilter is effective or not.
32    const MIN_SKIPS: usize = 10;
33
34    /// The minimum amount of bytes that skipping must average, expressed as
35    /// a factor of the multiple of the length of a possible match.
36    ///
37    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
38    /// skipped ever falls below MIN_AVG_FACTOR, then this searcher is rendered
39    /// inert.
40    const MIN_AVG_FACTOR: usize = 4;
41
42    /// Create a fresh prefilter state.
43    pub fn new(max_match_len: usize) -> PrefilterState {
44        if max_match_len == 0 {
45            return PrefilterState::inert();
46        }
47        PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false }
48    }
49
50    /// Create a fresh prefilter state that is always inert.
51    fn inert() -> PrefilterState {
52        PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true }
53    }
54
55    /// Update this state with the number of bytes skipped on the last
56    /// invocation of the prefilter.
57    #[inline]
58    pub fn update(&mut self, skipped: usize) {
59        self.skips += 1;
60        self.skipped += skipped;
61    }
62
63    /// Return true if and only if this state indicates that a prefilter is
64    /// still effective.
65    #[inline]
66    pub fn is_effective(&mut self) -> bool {
67        if self.inert {
68            return false;
69        }
70        if self.skips < PrefilterState::MIN_SKIPS {
71            return true;
72        }
73
74        let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
75        if self.skipped >= min_avg * self.skips {
76            return true;
77        }
78
79        // We're inert.
80        self.inert = true;
81        false
82    }
83}
84
85/// A heuristic frequency based prefilter for searching a single needle.
86///
87/// This prefilter attempts to pick out the byte in a needle that is predicted
88/// to occur least frequently, and search for that using fast vectorized
89/// routines. If a rare enough byte could not be found, then this prefilter's
90/// constructors will return `None`.
91///
92/// This can be combined with `PrefilterState` to dynamically render this
93/// prefilter inert if it proves to ineffective.
94#[derive(Clone, Debug)]
95pub struct Freqy {
96    /// Whether this prefilter should be used or not.
97    inert: bool,
98    /// The length of the needle we're searching for.
99    needle_len: usize,
100    /// The rarest byte in the needle, according to pre-computed frequency
101    /// analysis.
102    rare1: u8,
103    /// The leftmost offset of the rarest byte in the needle.
104    rare1i: usize,
105    /// The second rarest byte in the needle, according to pre-computed
106    /// frequency analysis. (This may be equivalent to the rarest byte.)
107    ///
108    /// The second rarest byte is used as a type of guard for quickly detecting
109    /// a mismatch after memchr locates an instance of the rarest byte. This
110    /// is a hedge against pathological cases where the pre-computed frequency
111    /// analysis may be off. (But of course, does not prevent *all*
112    /// pathological cases.)
113    rare2: u8,
114    /// The leftmost offset of the second rarest byte in the needle.
115    rare2i: usize,
116}
117
118impl Freqy {
119    /// The maximum frequency rank permitted. If the rarest byte in the needle
120    /// has a frequency rank above this value, then Freqy is not used.
121    const MAX_RANK: usize = 200;
122
123    /// Return a fresh prefilter state that can be used with this prefilter. A
124    /// prefilter state is used to track the effectiveness of a prefilter for
125    /// speeding up searches. Therefore, the prefilter state should generally
126    /// be reused on subsequent searches (such as in an iterator). For searches
127    /// on a different haystack, then a new prefilter state should be used.
128    pub fn prefilter_state(&self) -> PrefilterState {
129        if self.inert {
130            PrefilterState::inert()
131        } else {
132            PrefilterState::new(self.needle_len)
133        }
134    }
135
136    /// Returns a valid but inert prefilter. This is valid for both the forward
137    /// and reverse direction.
138    ///
139    /// It is never correct to use an inert prefilter. The results of finding
140    /// the next (or previous) candidate are unspecified.
141    fn inert() -> Freqy {
142        Freqy {
143            inert: true,
144            needle_len: 0,
145            rare1: 0,
146            rare1i: 0,
147            rare2: 0,
148            rare2i: 0,
149        }
150    }
151
152    /// Return search info for the given needle in the forward direction.
153    pub fn forward(needle: &BStr) -> Freqy {
154        if needle.is_empty() {
155            return Freqy::inert();
156        }
157
158        // Find the rarest two bytes. Try to make them distinct (but it's not
159        // required).
160        let (mut rare1, mut rare1i) = (needle[0], 0);
161        let (mut rare2, mut rare2i) = (needle[0], 0);
162        if needle.len() >= 2 {
163            rare2 = needle[1];
164            rare2i = 1;
165        }
166        if Freqy::rank(rare2) < Freqy::rank(rare1) {
167            mem::swap(&mut rare1, &mut rare2);
168            mem::swap(&mut rare1i, &mut rare2i);
169        }
170        for (i, b) in needle.bytes().enumerate().skip(2) {
171            if Freqy::rank(b) < Freqy::rank(rare1) {
172                rare2 = rare1;
173                rare2i = rare1i;
174                rare1 = b;
175                rare1i = i;
176            } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
177                rare2 = b;
178                rare2i = i;
179            }
180        }
181        if Freqy::rank(rare1) > Freqy::MAX_RANK {
182            return Freqy::inert();
183        }
184        let needle_len = needle.len();
185        Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
186    }
187
188    /// Return search info for the given needle in the reverse direction.
189    pub fn reverse(needle: &BStr) -> Freqy {
190        if needle.is_empty() {
191            return Freqy::inert();
192        }
193
194        // Find the rarest two bytes. Try to make them distinct (but it's not
195        // required). In reverse, the offsets correspond to the number of bytes
196        // from the end of the needle. So `0` is the last byte in the needle.
197        let (mut rare1i, mut rare2i) = (0, 0);
198        if needle.len() >= 2 {
199            rare2i += 1;
200        }
201        let mut rare1 = needle[needle.len() - rare1i - 1];
202        let mut rare2 = needle[needle.len() - rare2i - 1];
203        if Freqy::rank(rare2) < Freqy::rank(rare1) {
204            mem::swap(&mut rare1, &mut rare2);
205            mem::swap(&mut rare1i, &mut rare2i);
206        }
207        for (i, b) in needle.bytes().rev().enumerate().skip(2) {
208            if Freqy::rank(b) < Freqy::rank(rare1) {
209                rare2 = rare1;
210                rare2i = rare1i;
211                rare1 = b;
212                rare1i = i;
213            } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
214                rare2 = b;
215                rare2i = i;
216            }
217        }
218        if Freqy::rank(rare1) > Freqy::MAX_RANK {
219            return Freqy::inert();
220        }
221        let needle_len = needle.len();
222        Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
223    }
224
225    /// Look for a possible occurrence of needle. The position returned
226    /// corresponds to the beginning of the occurrence, if one exists.
227    ///
228    /// Callers may assume that this never returns false negatives (i.e., it
229    /// never misses an actual occurrence), but must check that the returned
230    /// position corresponds to a match. That is, it can return false
231    /// positives.
232    ///
233    /// This should only be used when Freqy is constructed for forward
234    /// searching.
235    pub fn find_candidate(
236        &self,
237        prestate: &mut PrefilterState,
238        haystack: &BStr,
239    ) -> Option<usize> {
240        debug_assert!(!self.inert);
241
242        let mut i = 0;
243        while prestate.is_effective() {
244            // Use a fast vectorized implementation to skip to the next
245            // occurrence of the rarest byte (heuristically chosen) in the
246            // needle.
247            i += match haystack[i..].find_byte(self.rare1) {
248                None => return None,
249                Some(found) => {
250                    prestate.update(found);
251                    found
252                }
253            };
254
255            // If we can't align our first match with the haystack, then a
256            // match is impossible.
257            if i < self.rare1i {
258                i += 1;
259                continue;
260            }
261
262            // Align our rare2 byte with the haystack. A mismatch means that
263            // a match is impossible.
264            let aligned_rare2i = i - self.rare1i + self.rare2i;
265            if haystack.get(aligned_rare2i) != Some(&self.rare2) {
266                i += 1;
267                continue;
268            }
269
270            // We've done what we can. There might be a match here.
271            return Some(i - self.rare1i);
272        }
273        // The only way we get here is if we believe our skipping heuristic
274        // has become ineffective. We're allowed to return false positives,
275        // so return the position at which we advanced to, aligned to the
276        // haystack.
277        Some(i.saturating_sub(self.rare1i))
278    }
279
280    /// Look for a possible occurrence of needle, in reverse, starting from the
281    /// end of the given haystack. The position returned corresponds to the
282    /// position immediately after the end of the occurrence, if one exists.
283    ///
284    /// Callers may assume that this never returns false negatives (i.e., it
285    /// never misses an actual occurrence), but must check that the returned
286    /// position corresponds to a match. That is, it can return false
287    /// positives.
288    ///
289    /// This should only be used when Freqy is constructed for reverse
290    /// searching.
291    pub fn rfind_candidate(
292        &self,
293        prestate: &mut PrefilterState,
294        haystack: &BStr,
295    ) -> Option<usize> {
296        debug_assert!(!self.inert);
297
298        let mut i = haystack.len();
299        while prestate.is_effective() {
300            // Use a fast vectorized implementation to skip to the next
301            // occurrence of the rarest byte (heuristically chosen) in the
302            // needle.
303            i = match haystack[..i].rfind_byte(self.rare1) {
304                None => return None,
305                Some(found) => {
306                    prestate.update(i - found);
307                    found
308                }
309            };
310
311            // If we can't align our first match with the haystack, then a
312            // match is impossible.
313            if i + self.rare1i + 1 > haystack.len() {
314                continue;
315            }
316
317            // Align our rare2 byte with the haystack. A mismatch means that
318            // a match is impossible.
319            let aligned = match (i + self.rare1i).checked_sub(self.rare2i) {
320                None => continue,
321                Some(aligned) => aligned,
322            };
323            if haystack.get(aligned) != Some(&self.rare2) {
324                continue;
325            }
326
327            // We've done what we can. There might be a match here.
328            return Some(i + self.rare1i + 1);
329        }
330        // The only way we get here is if we believe our skipping heuristic
331        // has become ineffective. We're allowed to return false positives,
332        // so return the position at which we advanced to, aligned to the
333        // haystack.
334        Some(i + self.rare1i + 1)
335    }
336
337    /// Return the heuristical frequency rank of the given byte. A lower rank
338    /// means the byte is believed to occur less frequently.
339    fn rank(b: u8) -> usize {
340        BYTE_FREQUENCIES[b as usize] as usize
341    }
342}
343
344#[cfg(test)]
345mod tests {
346    use bstr::B;
347    use super::*;
348
349    #[test]
350    fn freqy_forward() {
351        // N.B. We sometimes use uppercase here since that mostly ensures freqy
352        // will be constructable. Lowercase letters may be too common for freqy
353        // to work.
354
355        let s = Freqy::forward(B("BAR"));
356        let mut pre = s.prefilter_state();
357        assert_eq!(Some(0), s.find_candidate(&mut pre, B("BARFOO")));
358
359        let s = Freqy::forward(B("BAR"));
360        let mut pre = s.prefilter_state();
361        assert_eq!(Some(3), s.find_candidate(&mut pre, B("FOOBAR")));
362
363        let s = Freqy::forward(B("zyzy"));
364        let mut pre = s.prefilter_state();
365        assert_eq!(Some(0), s.find_candidate(&mut pre, B("zyzz")));
366
367        let s = Freqy::forward(B("zyzy"));
368        let mut pre = s.prefilter_state();
369        assert_eq!(Some(2), s.find_candidate(&mut pre, B("zzzy")));
370
371        let s = Freqy::forward(B("zyzy"));
372        let mut pre = s.prefilter_state();
373        assert_eq!(None, s.find_candidate(&mut pre, B("zazb")));
374
375        let s = Freqy::forward(B("yzyz"));
376        let mut pre = s.prefilter_state();
377        assert_eq!(Some(0), s.find_candidate(&mut pre, B("yzyy")));
378
379        let s = Freqy::forward(B("yzyz"));
380        let mut pre = s.prefilter_state();
381        assert_eq!(Some(2), s.find_candidate(&mut pre, B("yyyz")));
382
383        let s = Freqy::forward(B("yzyz"));
384        let mut pre = s.prefilter_state();
385        assert_eq!(None, s.find_candidate(&mut pre, B("yayb")));
386    }
387
388    #[test]
389    fn freqy_reverse() {
390        // N.B. We sometimes use uppercase here since that mostly ensures freqy
391        // will be constructable. Lowercase letters may be too common for freqy
392        // to work.
393
394        let s = Freqy::reverse(B("BAR"));
395        let mut pre = s.prefilter_state();
396        assert_eq!(Some(3), s.rfind_candidate(&mut pre, B("BARFOO")));
397
398        let s = Freqy::reverse(B("BAR"));
399        let mut pre = s.prefilter_state();
400        assert_eq!(Some(6), s.rfind_candidate(&mut pre, B("FOOBAR")));
401
402        let s = Freqy::reverse(B("zyzy"));
403        let mut pre = s.prefilter_state();
404        assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("zyzz")));
405
406        let s = Freqy::reverse(B("zyzy"));
407        let mut pre = s.prefilter_state();
408        assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("zzzy")));
409
410        let s = Freqy::reverse(B("zyzy"));
411        let mut pre = s.prefilter_state();
412        assert_eq!(None, s.rfind_candidate(&mut pre, B("zazb")));
413
414        let s = Freqy::reverse(B("yzyz"));
415        let mut pre = s.prefilter_state();
416        assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("yzyy")));
417
418        let s = Freqy::reverse(B("yzyz"));
419        let mut pre = s.prefilter_state();
420        assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("yyyz")));
421
422        let s = Freqy::reverse(B("yzyz"));
423        let mut pre = s.prefilter_state();
424        assert_eq!(None, s.rfind_candidate(&mut pre, B("yayb")));
425    }
426}