// ppv-lite86 — x86_64 SSE2 vector implementations (x86_64/sse2.rs)

1use crate::soft::{x2, x4};
2use crate::types::*;
3use crate::vec128_storage;
4use crate::x86_64::Avx2Machine;
5use crate::x86_64::SseMachine as Machine86;
6use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7use core::arch::x86_64::*;
8use core::marker::PhantomData;
9use core::ops::{
10    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11};
12
/// Implements a binary-operator trait for a SIMD wrapper type by handing
/// both operands' raw `__m128i` values to the named SSE2 intrinsic.
macro_rules! impl_binop {
    ($ty:ident, $op_trait:ident, $op_fn:ident, $intrinsic:ident) => {
        impl<S3, S4, NI> $op_trait for $ty<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $op_fn(self, rhs: Self) -> Self::Output {
                // SAFETY: these wrapper types are only constructed on
                // machines where the SSE2 intrinsics are available.
                let raw = unsafe { $intrinsic(self.x, rhs.x) };
                Self::new(raw)
            }
        }
    };
}
24
/// Implements an `*Assign` operator trait in terms of the matching
/// by-value operator: `a op= b` becomes `a = a.op(b)`.
macro_rules! impl_binop_assign {
    ($ty:ident, $assign_trait:ident, $assign_fn:ident, $base_fn:ident) => {
        impl<S3, S4, NI> $assign_trait for $ty<S3, S4, NI>
        where
            $ty<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $assign_fn(&mut self, rhs: Self) {
                *self = self.$base_fn(rhs);
            }
        }
    };
}
38
/// Defines a 128-bit SIMD wrapper type named `$vec` around a raw
/// `__m128i`, together with storage conversion, unaligned byte I/O, and
/// bitwise-operator impls. (`$word` names the scalar lane type; it is not
/// used by the expansion itself.)
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            // Zero-sized markers recording instruction-set availability in
            // the type: S3 = SSSE3, S4 = SSE4.1, NI = AES-NI (see the
            // YesS3/NoS3 etc. markers imported from crate::x86_64).
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                // Read the `sse2` variant of the storage union.
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            /// Wraps a raw `__m128i` in this vector type.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                // Exactly one vector's worth of bytes; unaligned load.
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                // Big-endian input: load as-is, then byte-swap each lane.
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                // Byte-swap lanes first so the stored bytes come out
                // big-endian.
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                // All lanes zero.
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                // Bitwise NOT expressed as XOR with all-ones.
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                // `_mm_andnot_si128(a, b)` computes `!a & b`.
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
137
/// Marks `$ty` as supporting the 32-bit word bit-operation set, provided
/// it can rotate its 32-bit words.
macro_rules! impl_bitops32 {
    ($ty:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $ty<S3, S4, NI>
        where
            $ty<S3, S4, NI>: RotateEachWord32,
        {
        }
    };
}
146
/// Marks `$ty` as supporting 64-bit word bit operations (which also
/// implies the 32-bit set, wired up via `impl_bitops32!`).
macro_rules! impl_bitops64 {
    ($ty:ident) => {
        impl_bitops32!($ty);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $ty<S3, S4, NI>
        where
            $ty<S3, S4, NI>: RotateEachWord64 + RotateEachWord32,
        {
        }
    };
}
156
/// Marks `$ty` as supporting 128-bit word bit operations (and, via
/// `impl_bitops64!`, the 64- and 32-bit sets as well).
macro_rules! impl_bitops128 {
    ($ty:ident) => {
        impl_bitops64!($ty);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $ty<S3, S4, NI>
        where
            $ty<S3, S4, NI>: RotateEachWord128,
        {
        }
    };
}
166
/// Rotation of each 32-bit lane by a whole number of bytes, done as a
/// single SSSE3 byte shuffle; `$k0`/`$k1` are the high/low 64-bit halves
/// of the `_mm_shuffle_epi8` control mask.
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Rotation of each 32-bit lane right by `$i` bits using the classic
/// SSE2-only shift-and-OR formulation: `(x >> i) | (x << (32 - i))`.
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    // With SSSE3 available, rotations by whole bytes (8, 16, 24 bits) are
    // a single `pshufb`; the remaining amounts use shift-and-OR.
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f0e0d_080b0a09,
        0x04070605_00030201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c0f0e_09080b0a,
        0x05040706_01000302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d0c0f_0a09080b,
        0x06050407_02010003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    // SSE2-only fallback. Rotating a 32-bit word by 16 is exactly a swap
    // of its two 16-bit halves, which `swap16_s2` performs with 16-bit
    // lane shuffles; everything else is shift-and-OR.
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
223
/// Rotation of each 64-bit lane by a whole number of bytes via a single
/// SSSE3 byte shuffle; `$k0`/`$k1` are the high/low halves of the
/// `_mm_shuffle_epi8` control mask.
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Rotation of each 64-bit lane right by `$i` bits using SSE2-only
/// shift-and-OR: `(x >> i) | (x << (64 - i))`.
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    // Rotations of each 64-bit lane; whole-byte amounts (8, 16, 24 bits)
    // compile to a single `pshufb`, the rest use shift-and-OR.
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
267impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
268    rotr_64!(rotate_each_word_right7, 7);
269    rotr_64!(rotate_each_word_right8, 8);
270    rotr_64!(rotate_each_word_right11, 11);
271    rotr_64!(rotate_each_word_right12, 12);
272    #[inline(always)]
273    fn rotate_each_word_right16(self) -> Self {
274        Self::new(swap16_s2(self.x))
275    }
276    rotr_64!(rotate_each_word_right20, 20);
277    rotr_64!(rotate_each_word_right24, 24);
278    rotr_64!(rotate_each_word_right25, 25);
279}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // Rotating a 64-bit lane by 32 just swaps its two 32-bit halves:
        // shuffle control 0b10_11_00_01 maps dwords [d0,d1,d2,d3] to
        // [d1,d0,d3,d2].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
286
/// Rotation of the full 128-bit value right by `$i` bits.
///
/// BUG FIX: the previous SSE2 formulation used `_mm_srli_si128` /
/// `_mm_slli_si128`, which shift by *bytes* (not bits) and zero the
/// register entirely for shift counts above 15 — so e.g.
/// `rotate_each_word_right7` shifted by 56 bits and OR-ed in zeros
/// instead of performing a 7-bit rotation. Implemented here by
/// round-tripping through a scalar `u128` rotate.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            // SAFETY: `__m128i` and `u128` are both 16 bytes; on
            // little-endian x86_64 the transmute preserves the integer
            // value, so the scalar rotate matches the intended vector
            // semantics.
            Self::new(unsafe {
                let x: u128 = core::mem::transmute(self.x);
                core::mem::transmute(x.rotate_right($i as u32))
            })
        }
    };
}
// TODO: completely unoptimized
// Rotations of the single 128-bit word; all delegate to `rotr_128!`.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
// RotateEachWord128 adds no methods beyond the inherited rotations.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
316
// Instantiate the three public 128-bit SSE2 vector types.
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
320
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // Read the vector as two 64-bit halves (SSE4.1 `pextrq` for
            // the upper one), then split each half into its 32-bit lanes.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into 64-bit halves, then place the upper
            // half with SSE4.1 `pinsrq`.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // SSE2-only: fetch the upper half by shuffling it down
            // (control 0b11_10_11_10 duplicates dwords 2,3 into 0,1)
            // instead of using SSE4.1 `pextrq`.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Build each 64-bit half in the low lane, shift the upper half
            // into place by 8 bytes, and OR them together.
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            // Lane 0 via `movq`, lane 1 via SSE4.1 `pextrq`.
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            // `movq` for lane 0, SSE4.1 `pinsrq` for lane 1.
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            // SSE2-only: bring the high lane down with an 8-byte shift
            // instead of SSE4.1 `pextrq`.
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            // Place each value in the low lane, shift the second up by
            // 8 bytes, and combine with OR.
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    // Deliberately left unimplemented; both methods panic if called.
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
407
408impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
409where
410    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
411{
412    #[inline(always)]
413    fn to_lanes(self) -> [u64; 4] {
414        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
415        [a[0], a[1], b[0], b[1]]
416    }
417    #[inline(always)]
418    fn from_lanes(xs: [u64; 4]) -> Self {
419        let (a, b) = (
420            u64x2_sse2::from_lanes([xs[0], xs[1]]),
421            u64x2_sse2::from_lanes([xs[2], xs[3]]),
422        );
423        x2::new([a, b])
424    }
425}
426
/// Generates a `From` impl that reinterprets one 128-bit wrapper type as
/// another; the underlying `__m128i` is carried over bit-for-bit.
macro_rules! impl_into {
    ($src:ident, $dst:ident) => {
        impl<S3, S4, NI> From<$src<S3, S4, NI>> for $dst<S3, S4, NI> {
            #[inline(always)]
            fn from(value: $src<S3, S4, NI>) -> Self {
                $dst::new(value.x)
            }
        }
    };
}
437
// u128x1 may be freely reinterpreted as either narrower-lane vector type.
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

// Wire the three vector types into the bit-operation trait hierarchy.
impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);
444
// ArithOps marker impls (bounds mirror what the trait's users require),
// plus lane-wise wrapping addition via the SSE2 add intrinsics.
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
457
// Register the SSE2 vector types as the Machine-level vector types of the
// generic SSE machine; these empty marker impls only assert that each
// type provides the capabilities the Machine traits demand.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

// The AVX2 machine reuses the fully-featured (SSSE3 + SSE4.1) SSE2 types
// for its 128-bit vectors.
impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
501
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        // `_mm_set_epi32` takes its arguments highest-lane-first, so the
        // array is passed in reverse to keep xs[0] in lane 0.
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
513
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        // `_mm_insert_epi32` (SSE4.1) requires a constant lane index, so
        // the runtime index is dispatched through an exhaustive match.
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        // SSE2-only lane insertion: each arm rearranges lanes so the new
        // value can be OR-ed into lane 0, then restores the lane order.
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (andnot with a lane-0 mask), OR in v.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    // [w0,w2,w3,_] -> shift up -> [v,w0,w2,w3] ->
                    // reshuffle to [w0,v,w2,w3].
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    // Same trick, permuted to yield [w0,w1,v,w3].
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    // Drop w3 by shifting, OR v into lane 0, then rotate
                    // the lanes to get [w0,w1,w2,v].
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
573
// For a native 4×u32 vector the "lane words" are exactly the SIMD words,
// so these simply forward to the Words4 shuffles.
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}
588
impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Result lanes (low to high): [w2, w3, w0, w1].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Result lanes (low to high): [w3, w0, w1, w2].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Result lanes (low to high): [w1, w2, w3, w0].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
603
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swapping the two u64x2 halves exchanges word pairs (0,1)<->(2,3).
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // SSSE3 `palignr` extracts the crossing pairs directly:
        // [w1,w2] then [w3,w0].
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // As above with halves exchanged: [w3,w0] then [w1,w2].
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Without SSSE3's `palignr`, build the crossing pairs from 8-byte
        // shifts and ORs: da = [w1,w2], bc = [w3,w0].
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Same building blocks, emitted in the opposite order:
        // bc = [w3,w0] first, then da = [w1,w2].
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
658
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        // `_mm_set_epi64x` takes the high lane first, so pass in reverse.
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
665
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        // Constant-index intrinsics dispatched over the runtime index:
        // `movq` for lane 0, SSE4.1 `pextrq` for lane 1.
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        // SSE4.1 `pinsrq` also requires a constant lane index.
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Shuffle the high lane down, then `movq` it out.
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear lane 0 with an andnot mask, then OR in the value.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // `movq` keeps lane 0 and zeroes lane 1; OR in the value
                // shifted up into the high lane.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
716
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // `pshufb` mask reversing the byte order within each 32-bit
            // lane (bytes 3,2,1,0 / 7,6,5,4 / ...).
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
/// SSE2-only byte swap of each 32-bit lane: widen the bytes into 16-bit
/// lanes (interleaved with zero), reverse each group of four lanes with
/// the 16-bit shuffles, then pack back down. The `packus` saturation is
/// lossless here because every 16-bit lane holds a value <= 0xff.
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        // Control 0b00_01_10_11 reverses the four 16-bit lanes of each half.
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // SSE2-only fallback; see `bswap32_s2`.
        Self::new(bswap32_s2(self.x))
    }
}
744
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // `pshufb` mask reversing the byte order within each 64-bit
            // lane (bytes 7..0, then 15..8).
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // Swap the 32-bit halves of each 64-bit lane, then byte-swap each
        // 32-bit word: the combination is a full 64-bit byte reversal.
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
760
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // `pshufb` mask reversing all 16 bytes of the register.
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // No SSE2-only implementation provided; panics if called.
        unimplemented!()
    }
}
776
/// Swaps adjacent `$i`-bit groups throughout the vector. `$k` is the
/// repeated byte mask selecting the *high* group of each pair (0xaa, 0xcc,
/// 0xf0 for 1-, 2-, 4-bit groups): high groups are shifted down, low
/// groups shifted up and re-masked, then the two halves are OR-ed.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
/// Swaps the two 16-bit halves of every 32-bit word (SSE2-only helper):
/// shuffle control 0b10_11_00_01 maps 16-bit lanes [h0,h1,h2,h3] to
/// [h1,h0,h3,h2] in each half of the register.
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    // Sub-byte swaps use the mask-shift-OR helper; byte-and-larger swaps
    // use a single SSSE3 `pshufb` or a dword shuffle.
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            // pshufb mask swapping adjacent bytes.
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            // pshufb mask swapping adjacent 16-bit units.
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // Swap adjacent 32-bit lanes: [d1,d0,d3,d2].
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // Swap the two 64-bit halves: [d2,d3,d0,d1].
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    // SSE2-only variant: identical to the SSSE3 impl except that the
    // byte and 16-bit swaps avoid `pshufb`.
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Swap adjacent bytes via paired 8-bit shifts of each 16-bit lane.
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
861
// Type-level "group" tags distinguishing otherwise identical `x2`
// instantiations below (e.g. u64x2x2 vs u64x4 wrap the same registers but
// must remain distinct types).
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

// Wider vectors are built from the 128-bit primitives with the generic
// `x2`/`x4` combinators from `crate::soft`.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
882
impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        // Reinterpret the four 16-byte registers as sixteen u32 scalars.
        // NOTE(review): assumes `x4<u32x4_sse2>` is exactly 64 bytes with
        // no padding and a memory-order lane layout — confirm against
        // `crate::soft::x4`'s definition.
        unsafe {
            core::mem::transmute(self)
        }
    }
}
891
892impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
893where
894    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
895    Machine86<S3, S4, NI>: Machine,
896    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
897    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
898{
899}
900impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
901where
902    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
903    Machine86<S3, S4, NI>: Machine,
904    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
905    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
906{
907}
908impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
909where
910    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
911    Machine86<S3, S4, NI>: Machine,
912    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
913{
914}
915impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
916where
917    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
918    Machine86<S3, S4, NI>: Machine,
919    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
920    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
921    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
922    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
923    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
924{
925}
926
// Same marker for the AVX2 machine. The `YesS3`/`YesS4` parameters are fixed
// because the AVX2 machine always has the SSSE3/SSE4.1 specializations.
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
// Marker impl: `u64x2x2_sse2` (with SSSE3/SSE4.1 fixed on) as the AVX2
// machine's `u64x2x2` type.
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
// Marker impl: `u64x4_sse2` (SSSE3/SSE4.1 fixed on) as the AVX2 machine's
// `u64x4` type.
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
// Marker impl: `u128x2_sse2` (SSSE3/SSE4.1 fixed on) as the AVX2 machine's
// `u128x2`; the `Into` bounds provide the reinterpreting conversions.
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
961
962impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
963where
964    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
965{
966    #[inline(always)]
967    fn extract(self, i: u32) -> u64 {
968        match i {
969            0 => self.0[0].extract(0),
970            1 => self.0[0].extract(1),
971            2 => self.0[1].extract(0),
972            3 => self.0[1].extract(1),
973            _ => panic!(),
974        }
975    }
976    #[inline(always)]
977    fn insert(mut self, w: u64, i: u32) -> Self {
978        match i {
979            0 => self.0[0] = self.0[0].insert(w, 0),
980            1 => self.0[0] = self.0[0].insert(w, 1),
981            2 => self.0[1] = self.0[1].insert(w, 0),
982            3 => self.0[1] = self.0[1].insert(w, 1),
983            _ => panic!(),
984        };
985        self
986    }
987}
988
// Marker impl: `u32x4x4_sse2` as the SSE machine's `u32x4x4`; `Vec4Ext` adds
// the 4-way transpose and `Vector<[u32; 16]>` the flat-scalar view.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
// Marker impl: `u64x2x4_sse2` as the SSE machine's `u64x2x4`.
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
// Marker impl: `u128x4_sse2` as the SSE machine's `u128x4`, reinterpretable as
// `u32x4x4` or `u64x2x4` via the `Into` bounds.
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}
1017
// Marker impl: the soft 4-lane SSE type still satisfies the AVX2 machine's
// `u32x4x4` contract (the native AVX2 type lives in the `avx2` module below).
impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4Ext<<Avx2Machine<NI> as Machine>::u32x4>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vector<[u32; 16]>,
{
}
// Marker impl: `u64x2x4_sse2` (SSSE3/SSE4.1 fixed on) for the AVX2 machine.
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
// Marker impl: `u128x4_sse2` (SSSE3/SSE4.1 fixed on) for the AVX2 machine.
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1046
// Generates `From` conversions between the soft (`x2`/`x4`) wrappers of two
// SSE2 vector types by converting each constituent 128-bit lane. Invoked below
// to view pairs/quads of `u128x1` registers as `u64x2` or `u32x4` vectors.
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        // x2: convert both halves; the "group" marker type (`Gf` -> `Gt`) may
        // change across the conversion.
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        // x4: convert all four lanes.
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
1072
1073///// Debugging
1074
1075use core::fmt::{Debug, Formatter, Result};
1076
1077impl<W: PartialEq, G> PartialEq for x2<W, G> {
1078    #[inline(always)]
1079    fn eq(&self, rhs: &Self) -> bool {
1080        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1081    }
1082}
1083
// 128-bit bitwise equality using the SSE4.1 `_mm_cmpeq_epi64` (which fills
// each 64-bit lane with all-ones iff the lanes match).
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    // Shuffle imm 0b1100_0110 puts dword 2 (from the high lane's mask) and
    // dword 1 (from the low lane's mask) into the low 64 bits, so one scalar
    // extract tests both lanes at once: it is -1 iff both lanes compared equal.
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}
1090
// 128-bit bitwise equality using only SSE2: compare 32-bit lanes (all-ones per
// equal lane), then AND the high and low 64-bit halves of the mask. The result
// is -1 iff all four lanes matched, i.e. the full 128 bits are equal.
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}
1098
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // Bitwise 128-bit comparison; only needs SSE2, so it is valid for any
        // S3/S4 specialization.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1105impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
1106where
1107    Self: Copy + MultiLane<[u32; 4]>,
1108{
1109    #[cold]
1110    fn fmt(&self, fmt: &mut Formatter) -> Result {
1111        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1112    }
1113}
1114
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // The 32-bit lane-wise compare is sound here: all four 32-bit lanes
        // equal is the same condition as both 64-bit lanes equal.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1121impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
1122where
1123    Self: Copy + MultiLane<[u64; 2]>,
1124{
1125    #[cold]
1126    fn fmt(&self, fmt: &mut Formatter) -> Result {
1127        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1128    }
1129}
1130
1131impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1132where
1133    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1134{
1135    #[cold]
1136    fn fmt(&self, fmt: &mut Formatter) -> Result {
1137        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1138        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1139    }
1140}
1141
/// Tests comparing the portable SSE2 code paths against the SSSE3/SSE4.1
/// specializations: both must produce bit-identical results, so each pair of
/// results is cross-checked (via transmute, since the types share a layout)
/// in addition to being checked against the expected constant.
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        // Was `assert_eq!(x_s3, transmute(x_s3))`, a tautology that verified
        // nothing; cross-check the SSSE3 result against the SSE2 one instead.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        // Cross-check SSSE3 vs SSE2 (previously a tautological self-compare).
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        // shuffle1230 undoes shuffle3012, returning to the original order.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        // Cross-check SSSE3 vs SSE2 (previously a tautological self-compare).
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        // shuffle1230 undoes shuffle3012, returning to the original order.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
1387
1388pub mod avx2 {
1389    #![allow(non_camel_case_types)]
1390    use crate::soft::x4;
1391    use crate::types::*;
1392    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
1393    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1394    use core::arch::x86_64::*;
1395    use core::marker::PhantomData;
1396    use core::ops::*;
1397
1398    #[derive(Copy, Clone)]
1399    pub struct u32x4x4_avx2<NI> {
1400        x: [__m256i; 2],
1401        ni: PhantomData<NI>,
1402    }
1403
1404    impl<NI> u32x4x4_avx2<NI> {
1405        #[inline(always)]
1406        fn new(x: [__m256i; 2]) -> Self {
1407            Self { x, ni: PhantomData }
1408        }
1409    }
1410
1411    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
1412    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
1413        #[inline(always)]
1414        unsafe fn unpack(p: vec512_storage) -> Self {
1415            Self::new([p.avx[0].avx, p.avx[1].avx])
1416        }
1417    }
1418    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
1419        #[inline(always)]
1420        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
1421            unsafe {
1422                [
1423                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
1424                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
1425                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
1426                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
1427                ]
1428            }
1429        }
1430        #[inline(always)]
1431        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
1432            Self::new(unsafe {
1433                [
1434                    _mm256_setr_m128i(x[0].x, x[1].x),
1435                    _mm256_setr_m128i(x[2].x, x[3].x),
1436                ]
1437            })
1438        }
1439    }
1440    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1441        #[inline(always)]
1442        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1443            unsafe {
1444                match i {
1445                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
1446                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
1447                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
1448                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
1449                    _ => panic!(),
1450                }
1451            }
1452        }
1453        #[inline(always)]
1454        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1455            Self::new(unsafe {
1456                match i {
1457                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
1458                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
1459                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
1460                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
1461                    _ => panic!(),
1462                }
1463            })
1464        }
1465    }
1466    impl<NI> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1467        #[inline(always)]
1468        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
1469            /*
1470             * a00:a01 a10:a11
1471             * b00:b01 b10:b11
1472             * c00:c01 c10:c11
1473             * d00:d01 d10:d11
1474             *       =>
1475             * a00:b00 c00:d00
1476             * a01:b01 c01:d01
1477             * a10:b10 c10:d10
1478             * a11:b11 c11:d11
1479             */
1480            unsafe {
1481                let ab00 = _mm256_permute2x128_si256(a.x[0], b.x[0], 0x20);
1482                let ab01 = _mm256_permute2x128_si256(a.x[0], b.x[0], 0x31);
1483                let ab10 = _mm256_permute2x128_si256(a.x[1], b.x[1], 0x20);
1484                let ab11 = _mm256_permute2x128_si256(a.x[1], b.x[1], 0x31);
1485                let cd00 = _mm256_permute2x128_si256(c.x[0], d.x[0], 0x20);
1486                let cd01 = _mm256_permute2x128_si256(c.x[0], d.x[0], 0x31);
1487                let cd10 = _mm256_permute2x128_si256(c.x[1], d.x[1], 0x20);
1488                let cd11 = _mm256_permute2x128_si256(c.x[1], d.x[1], 0x31);
1489                (
1490                    Self { x: [ab00, cd00], ni: a.ni },
1491                    Self { x: [ab01, cd01], ni: a.ni },
1492                    Self { x: [ab10, cd10], ni: a.ni },
1493                    Self { x: [ab11, cd11], ni: a.ni },
1494                )
1495            }
1496        }
1497    }
1498    impl<NI> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
1499        #[inline(always)]
1500        fn to_scalars(self) -> [u32; 16] {
1501            unsafe {
1502                core::mem::transmute(self)
1503            }
1504        }
1505    }
1506    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
1507        #[inline(always)]
1508        fn shuffle_lane_words1230(self) -> Self {
1509            Self::new(unsafe {
1510                [
1511                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
1512                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
1513                ]
1514            })
1515        }
1516        #[inline(always)]
1517        fn shuffle_lane_words2301(self) -> Self {
1518            Self::new(unsafe {
1519                [
1520                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
1521                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
1522                ]
1523            })
1524        }
1525        #[inline(always)]
1526        fn shuffle_lane_words3012(self) -> Self {
1527            Self::new(unsafe {
1528                [
1529                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
1530                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
1531                ]
1532            })
1533        }
1534    }
1535    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
1536    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
1537    macro_rules! shuf_lane_bytes {
1538        ($name:ident, $k0:expr, $k1:expr) => {
1539            #[inline(always)]
1540            fn $name(self) -> Self {
1541                Self::new(unsafe {
1542                    [
1543                        _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
1544                        _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
1545                    ]
1546                })
1547            }
1548        };
1549    }
1550    macro_rules! rotr_32 {
1551        ($name:ident, $i:expr) => {
1552            #[inline(always)]
1553            fn $name(self) -> Self {
1554                Self::new(unsafe {
1555                    [
1556                        _mm256_or_si256(
1557                            _mm256_srli_epi32(self.x[0], $i as i32),
1558                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
1559                        ),
1560                        _mm256_or_si256(
1561                            _mm256_srli_epi32(self.x[1], $i as i32),
1562                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
1563                        ),
1564                    ]
1565                })
1566            }
1567        };
1568    }
1569    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
1570        rotr_32!(rotate_each_word_right7, 7);
1571        shuf_lane_bytes!(
1572            rotate_each_word_right8,
1573            0x0c0f0e0d_080b0a09,
1574            0x04070605_00030201
1575        );
1576        rotr_32!(rotate_each_word_right11, 11);
1577        rotr_32!(rotate_each_word_right12, 12);
1578        shuf_lane_bytes!(
1579            rotate_each_word_right16,
1580            0x0d0c0f0e_09080b0a,
1581            0x05040706_01000302
1582        );
1583        rotr_32!(rotate_each_word_right20, 20);
1584        shuf_lane_bytes!(
1585            rotate_each_word_right24,
1586            0x0e0d0c0f_0a09080b,
1587            0x06050407_02010003
1588        );
1589        rotr_32!(rotate_each_word_right25, 25);
1590    }
1591    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
1592    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
1593        #[inline(always)]
1594        fn from(x: u32x4x4_avx2<NI>) -> Self {
1595            Self {
1596                avx: [
1597                    vec256_storage { avx: x.x[0] },
1598                    vec256_storage { avx: x.x[1] },
1599                ],
1600            }
1601        }
1602    }
1603
1604    macro_rules! impl_assign {
1605        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
1606            impl<NI> $Assign for $vec<NI>
1607            where
1608                NI: Copy,
1609            {
1610                #[inline(always)]
1611                fn $assign_fn(&mut self, rhs: Self) {
1612                    *self = self.$bin_fn(rhs);
1613                }
1614            }
1615        };
1616    }
1617    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
1618    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
1619    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
1620    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
1621
1622    macro_rules! impl_bitop_x2 {
1623        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
1624            impl<NI> $Op for $vec<NI> {
1625                type Output = Self;
1626                #[inline(always)]
1627                fn $op_fn(self, rhs: Self) -> Self::Output {
1628                    Self::new(unsafe {
1629                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
1630                    })
1631                }
1632            }
1633        };
1634    }
1635    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
1636    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
1637    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
1638    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
1639    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
1640
1641    impl<NI> Not for u32x4x4_avx2<NI> {
1642        type Output = Self;
1643        #[inline(always)]
1644        fn not(self) -> Self::Output {
1645            unsafe {
1646                let f = _mm256_set1_epi8(-0x7f);
1647                Self::new([f, f]) ^ self
1648            }
1649        }
1650    }
1651
1652    impl<NI> BSwap for u32x4x4_avx2<NI> {
1653        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
1654    }
1655
1656    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
1657    where
1658        NI: Copy,
1659    {
1660        #[inline(always)]
1661        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
1662            Self::new(unsafe {
1663                [
1664                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
1665                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
1666                ]
1667            })
1668        }
1669    }
1670}