bstr/search/prefilter.rs
1use core::mem;
2
3use bstr::BStr;
4use search::byte_frequencies::BYTE_FREQUENCIES;
5
6/// PrefilterState tracks state associated with the effectiveness of a
7/// prefilter. It is used to track how many bytes, on average, are skipped by
8/// the prefilter. If this average dips below a certain threshold over time,
9/// then the state renders the prefilter inert and stops using it.
10///
11/// A prefilter state should be created for each search. (Where creating an
12/// iterator via, e.g., `find_iter`, is treated as a single search.)
13#[derive(Clone, Debug)]
14pub struct PrefilterState {
15 /// The number of skips that has been executed.
16 skips: usize,
17 /// The total number of bytes that have been skipped.
18 skipped: usize,
19 /// The maximum length of a match. This is used to help determine how many
20 /// bytes on average should be skipped in order for a prefilter to be
21 /// effective.
22 max_match_len: usize,
23 /// Once this heuristic has been deemed ineffective, it will be inert
24 /// throughout the rest of its lifetime. This serves as a cheap way to
25 /// check inertness.
26 inert: bool,
27}
28
29impl PrefilterState {
30 /// The minimum number of skip attempts to try before considering whether
31 /// a prefilter is effective or not.
32 const MIN_SKIPS: usize = 10;
33
34 /// The minimum amount of bytes that skipping must average, expressed as
35 /// a factor of the multiple of the length of a possible match.
36 ///
37 /// That is, after MIN_SKIPS have occurred, if the average number of bytes
38 /// skipped ever falls below MIN_AVG_FACTOR, then this searcher is rendered
39 /// inert.
40 const MIN_AVG_FACTOR: usize = 4;
41
42 /// Create a fresh prefilter state.
43 pub fn new(max_match_len: usize) -> PrefilterState {
44 if max_match_len == 0 {
45 return PrefilterState::inert();
46 }
47 PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false }
48 }
49
50 /// Create a fresh prefilter state that is always inert.
51 fn inert() -> PrefilterState {
52 PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true }
53 }
54
55 /// Update this state with the number of bytes skipped on the last
56 /// invocation of the prefilter.
57 #[inline]
58 pub fn update(&mut self, skipped: usize) {
59 self.skips += 1;
60 self.skipped += skipped;
61 }
62
63 /// Return true if and only if this state indicates that a prefilter is
64 /// still effective.
65 #[inline]
66 pub fn is_effective(&mut self) -> bool {
67 if self.inert {
68 return false;
69 }
70 if self.skips < PrefilterState::MIN_SKIPS {
71 return true;
72 }
73
74 let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
75 if self.skipped >= min_avg * self.skips {
76 return true;
77 }
78
79 // We're inert.
80 self.inert = true;
81 false
82 }
83}
84
85/// A heuristic frequency based prefilter for searching a single needle.
86///
87/// This prefilter attempts to pick out the byte in a needle that is predicted
88/// to occur least frequently, and search for that using fast vectorized
89/// routines. If a rare enough byte could not be found, then this prefilter's
90/// constructors will return `None`.
91///
92/// This can be combined with `PrefilterState` to dynamically render this
93/// prefilter inert if it proves to ineffective.
94#[derive(Clone, Debug)]
95pub struct Freqy {
96 /// Whether this prefilter should be used or not.
97 inert: bool,
98 /// The length of the needle we're searching for.
99 needle_len: usize,
100 /// The rarest byte in the needle, according to pre-computed frequency
101 /// analysis.
102 rare1: u8,
103 /// The leftmost offset of the rarest byte in the needle.
104 rare1i: usize,
105 /// The second rarest byte in the needle, according to pre-computed
106 /// frequency analysis. (This may be equivalent to the rarest byte.)
107 ///
108 /// The second rarest byte is used as a type of guard for quickly detecting
109 /// a mismatch after memchr locates an instance of the rarest byte. This
110 /// is a hedge against pathological cases where the pre-computed frequency
111 /// analysis may be off. (But of course, does not prevent *all*
112 /// pathological cases.)
113 rare2: u8,
114 /// The leftmost offset of the second rarest byte in the needle.
115 rare2i: usize,
116}
117
118impl Freqy {
119 /// The maximum frequency rank permitted. If the rarest byte in the needle
120 /// has a frequency rank above this value, then Freqy is not used.
121 const MAX_RANK: usize = 200;
122
123 /// Return a fresh prefilter state that can be used with this prefilter. A
124 /// prefilter state is used to track the effectiveness of a prefilter for
125 /// speeding up searches. Therefore, the prefilter state should generally
126 /// be reused on subsequent searches (such as in an iterator). For searches
127 /// on a different haystack, then a new prefilter state should be used.
128 pub fn prefilter_state(&self) -> PrefilterState {
129 if self.inert {
130 PrefilterState::inert()
131 } else {
132 PrefilterState::new(self.needle_len)
133 }
134 }
135
136 /// Returns a valid but inert prefilter. This is valid for both the forward
137 /// and reverse direction.
138 ///
139 /// It is never correct to use an inert prefilter. The results of finding
140 /// the next (or previous) candidate are unspecified.
141 fn inert() -> Freqy {
142 Freqy {
143 inert: true,
144 needle_len: 0,
145 rare1: 0,
146 rare1i: 0,
147 rare2: 0,
148 rare2i: 0,
149 }
150 }
151
152 /// Return search info for the given needle in the forward direction.
153 pub fn forward(needle: &BStr) -> Freqy {
154 if needle.is_empty() {
155 return Freqy::inert();
156 }
157
158 // Find the rarest two bytes. Try to make them distinct (but it's not
159 // required).
160 let (mut rare1, mut rare1i) = (needle[0], 0);
161 let (mut rare2, mut rare2i) = (needle[0], 0);
162 if needle.len() >= 2 {
163 rare2 = needle[1];
164 rare2i = 1;
165 }
166 if Freqy::rank(rare2) < Freqy::rank(rare1) {
167 mem::swap(&mut rare1, &mut rare2);
168 mem::swap(&mut rare1i, &mut rare2i);
169 }
170 for (i, b) in needle.bytes().enumerate().skip(2) {
171 if Freqy::rank(b) < Freqy::rank(rare1) {
172 rare2 = rare1;
173 rare2i = rare1i;
174 rare1 = b;
175 rare1i = i;
176 } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
177 rare2 = b;
178 rare2i = i;
179 }
180 }
181 if Freqy::rank(rare1) > Freqy::MAX_RANK {
182 return Freqy::inert();
183 }
184 let needle_len = needle.len();
185 Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
186 }
187
188 /// Return search info for the given needle in the reverse direction.
189 pub fn reverse(needle: &BStr) -> Freqy {
190 if needle.is_empty() {
191 return Freqy::inert();
192 }
193
194 // Find the rarest two bytes. Try to make them distinct (but it's not
195 // required). In reverse, the offsets correspond to the number of bytes
196 // from the end of the needle. So `0` is the last byte in the needle.
197 let (mut rare1i, mut rare2i) = (0, 0);
198 if needle.len() >= 2 {
199 rare2i += 1;
200 }
201 let mut rare1 = needle[needle.len() - rare1i - 1];
202 let mut rare2 = needle[needle.len() - rare2i - 1];
203 if Freqy::rank(rare2) < Freqy::rank(rare1) {
204 mem::swap(&mut rare1, &mut rare2);
205 mem::swap(&mut rare1i, &mut rare2i);
206 }
207 for (i, b) in needle.bytes().rev().enumerate().skip(2) {
208 if Freqy::rank(b) < Freqy::rank(rare1) {
209 rare2 = rare1;
210 rare2i = rare1i;
211 rare1 = b;
212 rare1i = i;
213 } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
214 rare2 = b;
215 rare2i = i;
216 }
217 }
218 if Freqy::rank(rare1) > Freqy::MAX_RANK {
219 return Freqy::inert();
220 }
221 let needle_len = needle.len();
222 Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
223 }
224
225 /// Look for a possible occurrence of needle. The position returned
226 /// corresponds to the beginning of the occurrence, if one exists.
227 ///
228 /// Callers may assume that this never returns false negatives (i.e., it
229 /// never misses an actual occurrence), but must check that the returned
230 /// position corresponds to a match. That is, it can return false
231 /// positives.
232 ///
233 /// This should only be used when Freqy is constructed for forward
234 /// searching.
235 pub fn find_candidate(
236 &self,
237 prestate: &mut PrefilterState,
238 haystack: &BStr,
239 ) -> Option<usize> {
240 debug_assert!(!self.inert);
241
242 let mut i = 0;
243 while prestate.is_effective() {
244 // Use a fast vectorized implementation to skip to the next
245 // occurrence of the rarest byte (heuristically chosen) in the
246 // needle.
247 i += match haystack[i..].find_byte(self.rare1) {
248 None => return None,
249 Some(found) => {
250 prestate.update(found);
251 found
252 }
253 };
254
255 // If we can't align our first match with the haystack, then a
256 // match is impossible.
257 if i < self.rare1i {
258 i += 1;
259 continue;
260 }
261
262 // Align our rare2 byte with the haystack. A mismatch means that
263 // a match is impossible.
264 let aligned_rare2i = i - self.rare1i + self.rare2i;
265 if haystack.get(aligned_rare2i) != Some(&self.rare2) {
266 i += 1;
267 continue;
268 }
269
270 // We've done what we can. There might be a match here.
271 return Some(i - self.rare1i);
272 }
273 // The only way we get here is if we believe our skipping heuristic
274 // has become ineffective. We're allowed to return false positives,
275 // so return the position at which we advanced to, aligned to the
276 // haystack.
277 Some(i.saturating_sub(self.rare1i))
278 }
279
280 /// Look for a possible occurrence of needle, in reverse, starting from the
281 /// end of the given haystack. The position returned corresponds to the
282 /// position immediately after the end of the occurrence, if one exists.
283 ///
284 /// Callers may assume that this never returns false negatives (i.e., it
285 /// never misses an actual occurrence), but must check that the returned
286 /// position corresponds to a match. That is, it can return false
287 /// positives.
288 ///
289 /// This should only be used when Freqy is constructed for reverse
290 /// searching.
291 pub fn rfind_candidate(
292 &self,
293 prestate: &mut PrefilterState,
294 haystack: &BStr,
295 ) -> Option<usize> {
296 debug_assert!(!self.inert);
297
298 let mut i = haystack.len();
299 while prestate.is_effective() {
300 // Use a fast vectorized implementation to skip to the next
301 // occurrence of the rarest byte (heuristically chosen) in the
302 // needle.
303 i = match haystack[..i].rfind_byte(self.rare1) {
304 None => return None,
305 Some(found) => {
306 prestate.update(i - found);
307 found
308 }
309 };
310
311 // If we can't align our first match with the haystack, then a
312 // match is impossible.
313 if i + self.rare1i + 1 > haystack.len() {
314 continue;
315 }
316
317 // Align our rare2 byte with the haystack. A mismatch means that
318 // a match is impossible.
319 let aligned = match (i + self.rare1i).checked_sub(self.rare2i) {
320 None => continue,
321 Some(aligned) => aligned,
322 };
323 if haystack.get(aligned) != Some(&self.rare2) {
324 continue;
325 }
326
327 // We've done what we can. There might be a match here.
328 return Some(i + self.rare1i + 1);
329 }
330 // The only way we get here is if we believe our skipping heuristic
331 // has become ineffective. We're allowed to return false positives,
332 // so return the position at which we advanced to, aligned to the
333 // haystack.
334 Some(i + self.rare1i + 1)
335 }
336
337 /// Return the heuristical frequency rank of the given byte. A lower rank
338 /// means the byte is believed to occur less frequently.
339 fn rank(b: u8) -> usize {
340 BYTE_FREQUENCIES[b as usize] as usize
341 }
342}
343
344#[cfg(test)]
345mod tests {
346 use bstr::B;
347 use super::*;
348
349 #[test]
350 fn freqy_forward() {
351 // N.B. We sometimes use uppercase here since that mostly ensures freqy
352 // will be constructable. Lowercase letters may be too common for freqy
353 // to work.
354
355 let s = Freqy::forward(B("BAR"));
356 let mut pre = s.prefilter_state();
357 assert_eq!(Some(0), s.find_candidate(&mut pre, B("BARFOO")));
358
359 let s = Freqy::forward(B("BAR"));
360 let mut pre = s.prefilter_state();
361 assert_eq!(Some(3), s.find_candidate(&mut pre, B("FOOBAR")));
362
363 let s = Freqy::forward(B("zyzy"));
364 let mut pre = s.prefilter_state();
365 assert_eq!(Some(0), s.find_candidate(&mut pre, B("zyzz")));
366
367 let s = Freqy::forward(B("zyzy"));
368 let mut pre = s.prefilter_state();
369 assert_eq!(Some(2), s.find_candidate(&mut pre, B("zzzy")));
370
371 let s = Freqy::forward(B("zyzy"));
372 let mut pre = s.prefilter_state();
373 assert_eq!(None, s.find_candidate(&mut pre, B("zazb")));
374
375 let s = Freqy::forward(B("yzyz"));
376 let mut pre = s.prefilter_state();
377 assert_eq!(Some(0), s.find_candidate(&mut pre, B("yzyy")));
378
379 let s = Freqy::forward(B("yzyz"));
380 let mut pre = s.prefilter_state();
381 assert_eq!(Some(2), s.find_candidate(&mut pre, B("yyyz")));
382
383 let s = Freqy::forward(B("yzyz"));
384 let mut pre = s.prefilter_state();
385 assert_eq!(None, s.find_candidate(&mut pre, B("yayb")));
386 }
387
388 #[test]
389 fn freqy_reverse() {
390 // N.B. We sometimes use uppercase here since that mostly ensures freqy
391 // will be constructable. Lowercase letters may be too common for freqy
392 // to work.
393
394 let s = Freqy::reverse(B("BAR"));
395 let mut pre = s.prefilter_state();
396 assert_eq!(Some(3), s.rfind_candidate(&mut pre, B("BARFOO")));
397
398 let s = Freqy::reverse(B("BAR"));
399 let mut pre = s.prefilter_state();
400 assert_eq!(Some(6), s.rfind_candidate(&mut pre, B("FOOBAR")));
401
402 let s = Freqy::reverse(B("zyzy"));
403 let mut pre = s.prefilter_state();
404 assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("zyzz")));
405
406 let s = Freqy::reverse(B("zyzy"));
407 let mut pre = s.prefilter_state();
408 assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("zzzy")));
409
410 let s = Freqy::reverse(B("zyzy"));
411 let mut pre = s.prefilter_state();
412 assert_eq!(None, s.rfind_candidate(&mut pre, B("zazb")));
413
414 let s = Freqy::reverse(B("yzyz"));
415 let mut pre = s.prefilter_state();
416 assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("yzyy")));
417
418 let s = Freqy::reverse(B("yzyz"));
419 let mut pre = s.prefilter_state();
420 assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("yyyz")));
421
422 let s = Freqy::reverse(B("yzyz"));
423 let mut pre = s.prefilter_state();
424 assert_eq!(None, s.rfind_candidate(&mut pre, B("yayb")));
425 }
426}