1use crate::soft::{x2, x4};
2use crate::types::*;
3use crate::vec128_storage;
4use crate::x86_64::Avx2Machine;
5use crate::x86_64::SseMachine as Machine86;
6use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7use core::arch::x86_64::*;
8use core::marker::PhantomData;
9use core::ops::{
10 Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11};
12
/// Implements a binary-operator trait (`BitAnd`, `Add`, …) for an SSE2 vector
/// wrapper by delegating to a raw `__m128i` intrinsic `$impl_fn`.
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                // SAFETY: callers pass an SSE2-level intrinsic that is valid
                // for arbitrary 128-bit lane contents.
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}
24
/// Implements a compound-assignment trait (`BitAndAssign`, `AddAssign`, …) in
/// terms of the corresponding by-value operator method `$fn`.
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
38
/// Defines a 128-bit vector type `$vec` wrapping `__m128i`, together with its
/// storage conversions, endian-aware load/store, `Default`, `Not`, and the
/// bitwise operator impls. `S3`/`S4`/`NI` are zero-sized marker parameters
/// recording instruction-set availability (SSSE3 / SSE4.1 / AES-NI) and are
/// carried only as `PhantomData`. `$word` names the scalar lane type but is
/// not otherwise used inside this macro.
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                // Reads the `sse2` union member of the generic storage type.
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            // Internal constructor; the markers carry no runtime data.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                // Unaligned 16-byte load; x86 is little-endian, so no swap.
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                // Big-endian read = little-endian load + per-lane byte swap.
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                // Bitwise NOT as XOR against an all-ones mask.
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                // `_mm_andnot_si128(a, b)` computes `!a & b`, so this is
                // `!self & rhs` — matching the AndNot trait's contract here.
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
137
/// Marker-trait ladder: `BitOps32/64/128` are empty umbrella traits whose only
/// requirements are the rotate traits; each wider macro also invokes the
/// narrower one so a single call wires up the whole chain.
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
166
/// Rotate-right of each 32-bit lane by a multiple of 8 bits, via SSSE3
/// `pshufb` with a constant byte-permutation mask (`$k0`/`$k1` are the high
/// and low 64 bits of the mask). Only valid on machines with SSSE3 (`YesS3`).
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic rotate-right of each 32-bit lane by `$i` bits: `(x >> i) | (x << (32 - i))`
/// built from SSE2 per-lane shifts. Works for any 0 < $i < 32.
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
// With SSSE3 available, byte-granular rotations (8/16/24) use a single
// `pshufb`; the remaining amounts fall back to the shift-and-or form.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f0e0d_080b0a09,
        0x04070605_00030201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c0f0e_09080b0a,
        0x05040706_01000302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d0c0f_0a09080b,
        0x06050407_02010003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
// SSE2-only fallback: all rotations use shift-and-or, except rotate-by-16,
// which is a pair of 16-bit lane shuffles (`swap16_s2`).
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
223
/// Rotate-right of each 64-bit lane by a multiple of 8 bits via SSSE3
/// `pshufb`; `$k0`/`$k1` form the byte-permutation mask.
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic rotate-right of each 64-bit lane by `$i` bits using SSE2 per-lane
/// shifts: `(x >> i) | (x << (64 - i))`.
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
// 64-bit lanes with SSSE3: byte-multiple rotations (8/16/24) via `pshufb`,
// everything else via shift-and-or.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
// SSE2-only fallback for 64-bit lanes; rotate-by-16 swaps adjacent 16-bit
// halves with `swap16_s2`, which is a valid 16-bit rotation within each lane.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // Rotating a 64-bit lane by 32 = swapping its two 32-bit halves;
        // shuffle control 0b10_11_00_01 swaps dwords within each qword.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
286
/// Rotate-right of the whole 128-bit value.
///
/// NOTE(review): `_mm_srli_si128`/`_mm_slli_si128` shift by *bytes*, not bits,
/// and any immediate > 15 yields zero — so `128 - $i` always zeroes the left
/// operand and `$i` acts as a byte count. This does not look like a correct
/// `$i`-bit rotate; it appears these u128x1 word-rotations are never exercised.
/// Verify against callers before relying on or "fixing" this.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
// For u128x1 the "word" is the whole 128-bit register; all rotations go
// through `rotr_128` (see the review note on that macro).
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
// Marker impl: u128x1 supports 128-bit word rotation (no extra methods).
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
316
// Instantiate the three SSE2 vector types: 4×32-bit, 2×64-bit, 1×128-bit.
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
320
// Lane access with SSE4.1 available: `_mm_extract_epi64`/`_mm_insert_epi64`
// are SSE4.1 instructions, hence the `YesS4` bound.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // Pull both 64-bit halves, then split each into two u32 lanes.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into 64-bit halves, then assemble the register.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
// SSE2-only lane access: the upper half is reached by shuffling it down
// instead of using the SSE4.1 extract/insert instructions.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            // 0b11101110 broadcasts the high qword into the low position.
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            // Place the high half with a byte shift, then OR the halves.
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// 64-bit lane access using SSE4.1 extract/insert.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
// SSE2-only 64-bit lane access: byte shifts stand in for extract/insert.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                // Shift the high qword down by 8 bytes to read it.
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// Intentionally unimplemented: no caller currently needs u128 lane access,
// so these panic if ever reached.
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
407
408impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
409where
410 u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
411{
412 #[inline(always)]
413 fn to_lanes(self) -> [u64; 4] {
414 let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
415 [a[0], a[1], b[0], b[1]]
416 }
417 #[inline(always)]
418 fn from_lanes(xs: [u64; 4]) -> Self {
419 let (a, b) = (
420 u64x2_sse2::from_lanes([xs[0], xs[1]]),
421 u64x2_sse2::from_lanes([xs[2], xs[3]]),
422 );
423 x2::new([a, b])
424 }
425}
426
/// Bit-preserving reinterpretation between vector types with the same 128-bit
/// payload but different lane structure (no data transformation).
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

// u128x1 reinterprets freely as either narrower-lane type.
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

// Wire up the bitwise-ops marker-trait ladders for all three types.
impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);
444
// ArithOps is an umbrella marker; the concrete arithmetic comes from the
// lane-width-specific add intrinsics below.
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
// Wrapping per-lane addition (32-bit and 64-bit lanes respectively).
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
457
// Empty marker impls registering the SSE2 vector types as the `u32x4`/`u64x2`/
// `u128x1` associated types of the generic SSE machine, and (for the
// YesS3+YesS4 instantiations) of the AVX2 machine as well. All behavior comes
// from the where-clause supertraits.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
501
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        // `_mm_set_epi32` takes arguments high-lane first, so pass xs[3]..xs[0]
        // to keep xs[0] in lane 0.
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
513
// Indexed lane get/set with SSE4.1. `_mm_insert_epi32` requires a constant
// lane index, so the runtime index is dispatched through a match.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only lane insert: rotate the target lane into position 0 with a dword
// shuffle, clear/overwrite it via shift + OR, then shuffle lanes back.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (andnot against a lane-0 all-ones mask),
                    // then OR in the new value.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    // Lane 3 needs no pre-shuffle: a 4-byte left shift drops it.
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
573
// For a single 128-bit register, "lane word" shuffles coincide with the
// whole-vector word shuffles, so this just delegates to `Words4`.
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}
588
// Dword permutations via `pshufd`; the method name gives the resulting lane
// order (high..low), encoded in the shuffle-control immediate.
impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
603
// Qword permutations across the two halves of a u64x4. With SSSE3, cross-half
// rotations use `palignr` to splice adjacent registers.
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swapping lane pairs (2,3)<->(0,1) is just swapping the two halves.
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
// SSE2-only version: emulate `palignr` by splitting each half into its two
// qwords with byte shifts and recombining with OR.
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            // a = high qword of half 0, b = low qword of half 0 (moved up),
            // c = high qword of half 1, d = low qword of half 1 (moved up).
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            // Same building blocks as shuffle3012, opposite half order.
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
658
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        // `_mm_set_epi64x` takes the high lane first; xs[0] lands in lane 0.
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
665
// Indexed 64-bit lane get/set with SSE4.1 extract/insert; constant lane index
// required by the intrinsics, hence the match on `i`.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only 64-bit lane get/set, built from shuffles, masks and byte shifts.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Broadcast the high qword down, then read lane 0.
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear lane 0 via andnot against a lane-0 all-ones mask,
                // then OR in the new value.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // `_mm_move_epi64` keeps lane 0 and zeroes lane 1; OR in the
                // new value shifted into the high qword.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
716
// Byte-swap each 32-bit lane. With SSSE3 this is one `pshufb`.
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // Byte-permutation mask reversing each group of 4 bytes.
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
/// SSE2-only per-dword byte swap: widen bytes to 16-bit lanes, reverse them
/// with the 16-bit half shuffles, then pack back down to bytes.
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        // Values are all in 0..=255, so the saturating pack is lossless.
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}
744
// Byte-swap each 64-bit lane.
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // `pshufb` mask reversing each group of 8 bytes.
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // Swap the dwords within each qword, then byte-swap each dword:
        // together that reverses all 8 bytes of each 64-bit lane.
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
760
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // NOTE(review): this mask (0x0f..0x00 in order) is the identity
            // permutation, not a 16-byte reversal — confirm intended behavior
            // before depending on u128 bswap.
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // No SSE2 fallback provided; panics if ever called.
        unimplemented!()
    }
}
776
/// Swaps adjacent bit-groups of width `$i` within every byte/word, using mask
/// `$k` (the pattern selecting the high group of each pair): the masked high
/// groups are shifted down while the low groups are shifted up into the mask.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
/// Swaps the two 16-bit halves of every 32-bit word (SSE2-only path), via
/// low- and high-half 16-bit lane shuffles.
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
// Bit/byte-group swaps at every power-of-two granularity. With SSSE3, the
// byte-level swaps (8/16) are single `pshufb`s; dword/qword swaps use
// `pshufd`; sub-byte swaps use the mask-and-shift `swapi!`.
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        // Swap adjacent bits (mask 0xaa selects the odd bits).
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Swap adjacent bytes via byte permutation.
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        // Swap adjacent 16-bit words via byte permutation.
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // Swap adjacent dwords.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // Swap the two qwords.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
// SSE2-only version of the power-of-two swaps: identical to the SSSE3 impl
// except swap8/swap16 avoid `pshufb` (16-bit shifts / word shuffles instead).
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Byte swap within each 16-bit word via paired 8-bit shifts.
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
861
// Zero-sized tag types distinguishing otherwise-identical `x2` compositions
// (e.g. u64x2x2 vs u64x4 are both pairs of u64x2 but must be distinct types).
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

// Wider vectors built from pairs (`x2`) and quadruples (`x4`) of the 128-bit
// base types defined above.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
882
impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        // SAFETY: x4 of four 16-byte `__m128i` vectors is 64 bytes with the
        // same layout as [u32; 16] on this little-endian target; PhantomData
        // markers are zero-sized.
        unsafe {
            core::mem::transmute(self)
        }
    }
}
891
// Empty marker impls registering the x2-composed types with the generic SSE
// machine; all behavior is supplied by the where-clause supertraits.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}
926
// Same x2 marker impls for the AVX2 machine, restricted to the YesS3+YesS4
// instantiations (AVX2 implies SSSE3 and SSE4.1).
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
961
962impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
963where
964 u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
965{
966 #[inline(always)]
967 fn extract(self, i: u32) -> u64 {
968 match i {
969 0 => self.0[0].extract(0),
970 1 => self.0[0].extract(1),
971 2 => self.0[1].extract(0),
972 3 => self.0[1].extract(1),
973 _ => panic!(),
974 }
975 }
976 #[inline(always)]
977 fn insert(mut self, w: u64, i: u32) -> Self {
978 match i {
979 0 => self.0[0] = self.0[0].insert(w, 0),
980 1 => self.0[0] = self.0[0].insert(w, 1),
981 2 => self.0[1] = self.0[1].insert(w, 0),
982 3 => self.0[1] = self.0[1].insert(w, 1),
983 _ => panic!(),
984 };
985 self
986 }
987}
988
// Empty marker impls registering the x4-composed types with the SSE machine
// and (YesS3+YesS4 only) the AVX2 machine.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4Ext<<Avx2Machine<NI> as Machine>::u32x4>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vector<[u32; 16]>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1046
/// Lifts a base-type `From` reinterpretation element-wise to the x2 and x4
/// compositions (any group tags `Gf`/`Gt` for the pair case).
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
1072
1073use core::fmt::{Debug, Formatter, Result};
1076
1077impl<W: PartialEq, G> PartialEq for x2<W, G> {
1078 #[inline(always)]
1079 fn eq(&self, rhs: &Self) -> bool {
1080 self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1081 }
1082}
1083
/// Whole-register 128-bit equality using the SSE4.1 64-bit lane compare
/// (hence the `_s4` suffix).
///
/// `_mm_cmpeq_epi64` yields an all-ones/all-zeros mask per 64-bit lane.
/// The shuffle (control 0b1100_0110 selects dwords [2, 1, 0, 3]) places one
/// 32-bit word from each lane's mask into the low 64 bits, so a single
/// `_mm_cvtsi128_si64(..) == -1` check covers both lanes.
///
/// # Safety
/// Caller must ensure SSE4.1 is available (`_mm_cmpeq_epi64` is SSE4.1).
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}
1090
/// Whole-register 128-bit equality using only SSE2 instructions.
///
/// `_mm_cmpeq_epi32` produces an all-ones/all-zeros mask per 32-bit lane.
/// The high and low 64-bit halves of the mask are extracted and AND-ed;
/// the result is -1 exactly when every lane compared equal.
///
/// # Safety
/// Caller must ensure SSE2 is available (always true on x86_64).
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}
1098
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // SAFETY: eq128_s2 uses only SSE2, which x86_64 guarantees.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1105impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
1106where
1107 Self: Copy + MultiLane<[u32; 4]>,
1108{
1109 #[cold]
1110 fn fmt(&self, fmt: &mut Formatter) -> Result {
1111 fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1112 }
1113}
1114
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // Bitwise equality of two 64-bit lanes is equivalent to equality of
        // the four 32-bit lanes, so the SSE2 32-bit comparison is reused.
        // SAFETY: eq128_s2 uses only SSE2, which x86_64 guarantees.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1121impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
1122where
1123 Self: Copy + MultiLane<[u64; 2]>,
1124{
1125 #[cold]
1126 fn fmt(&self, fmt: &mut Formatter) -> Result {
1127 fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1128 }
1129}
1130
1131impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1132where
1133 u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1134{
1135 #[cold]
1136 fn fmt(&self, fmt: &mut Formatter) -> Result {
1137 let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1138 fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1139 }
1140}
1141
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    // NOTE(review): several assertions in this module previously compared a
    // value against a transmuted copy of *itself*, e.g.
    //     assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
    // which is vacuously true and verified nothing. Those assertions now
    // check the SSSE3 results against the expected vectors, mirroring the
    // corresponding SSE2 assertions.

    /// SSE2 and SSSE3 byte-swap of 32-bit lanes must agree and match the
    /// precomputed expectation.
    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        // Cross-check the two implementations (same layout, differing only
        // in phantom type parameters), then check the actual value.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    /// SSE2 and SSSE3 byte-swap of 64-bit lanes must both match the
    /// precomputed expectation.
    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        // Fixed: was a tautological self-comparison via transmute.
        assert_eq!(x_s3, s3.vec(ys));
    }

    /// 32-bit lane shuffles must agree between SSE2 and SSSE3 paths, and
    /// shuffle1230 must invert shuffle3012.
    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        // Fixed: was a tautological self-comparison via transmute.
        assert_eq!(x_s3, s3.vec(ys));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, s3.vec(zs));

        // shuffle1230 undoes shuffle3012, round-tripping back to xs.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, s3.vec(xs));
    }

    /// 64-bit lane shuffles (on the 4-lane composite) must agree between
    /// SSE2 and SSSE3 paths, and shuffle1230 must invert shuffle3012.
    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        // Fixed: was a tautological self-comparison via transmute.
        assert_eq!(x_s3, s3.vec(ys));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, s3.vec(zs));

        // shuffle1230 undoes shuffle3012, round-tripping back to xs.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, s3.vec(xs));
    }

    /// from_lanes/to_lanes round-trip for u32x4 on all three machine levels.
    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    /// from_lanes/to_lanes round-trip for u64x2 on all three machine levels.
    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    /// extract/insert on every u32 lane position (SSE2 fallback path).
    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    /// extract/insert on every u32 lane position (SSE4.1 pextrd/pinsrd path).
    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    /// extract/insert on both u64 lane positions (SSE2 fallback path).
    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    /// extract/insert on both u64 lane positions (SSE4.1 path).
    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
1387
1388pub mod avx2 {
1389 #![allow(non_camel_case_types)]
1390 use crate::soft::x4;
1391 use crate::types::*;
1392 use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
1393 use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1394 use core::arch::x86_64::*;
1395 use core::marker::PhantomData;
1396 use core::ops::*;
1397
1398 #[derive(Copy, Clone)]
1399 pub struct u32x4x4_avx2<NI> {
1400 x: [__m256i; 2],
1401 ni: PhantomData<NI>,
1402 }
1403
1404 impl<NI> u32x4x4_avx2<NI> {
1405 #[inline(always)]
1406 fn new(x: [__m256i; 2]) -> Self {
1407 Self { x, ni: PhantomData }
1408 }
1409 }
1410
1411 impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
1412 impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
1413 #[inline(always)]
1414 unsafe fn unpack(p: vec512_storage) -> Self {
1415 Self::new([p.avx[0].avx, p.avx[1].avx])
1416 }
1417 }
1418 impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
1419 #[inline(always)]
1420 fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
1421 unsafe {
1422 [
1423 u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
1424 u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
1425 u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
1426 u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
1427 ]
1428 }
1429 }
1430 #[inline(always)]
1431 fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
1432 Self::new(unsafe {
1433 [
1434 _mm256_setr_m128i(x[0].x, x[1].x),
1435 _mm256_setr_m128i(x[2].x, x[3].x),
1436 ]
1437 })
1438 }
1439 }
1440 impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1441 #[inline(always)]
1442 fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1443 unsafe {
1444 match i {
1445 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
1446 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
1447 2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
1448 3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
1449 _ => panic!(),
1450 }
1451 }
1452 }
1453 #[inline(always)]
1454 fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1455 Self::new(unsafe {
1456 match i {
1457 0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
1458 1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
1459 2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
1460 3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
1461 _ => panic!(),
1462 }
1463 })
1464 }
1465 }
1466 impl<NI> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1467 #[inline(always)]
1468 fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
1469 unsafe {
1481 let ab00 = _mm256_permute2x128_si256(a.x[0], b.x[0], 0x20);
1482 let ab01 = _mm256_permute2x128_si256(a.x[0], b.x[0], 0x31);
1483 let ab10 = _mm256_permute2x128_si256(a.x[1], b.x[1], 0x20);
1484 let ab11 = _mm256_permute2x128_si256(a.x[1], b.x[1], 0x31);
1485 let cd00 = _mm256_permute2x128_si256(c.x[0], d.x[0], 0x20);
1486 let cd01 = _mm256_permute2x128_si256(c.x[0], d.x[0], 0x31);
1487 let cd10 = _mm256_permute2x128_si256(c.x[1], d.x[1], 0x20);
1488 let cd11 = _mm256_permute2x128_si256(c.x[1], d.x[1], 0x31);
1489 (
1490 Self { x: [ab00, cd00], ni: a.ni },
1491 Self { x: [ab01, cd01], ni: a.ni },
1492 Self { x: [ab10, cd10], ni: a.ni },
1493 Self { x: [ab11, cd11], ni: a.ni },
1494 )
1495 }
1496 }
1497 }
1498 impl<NI> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
1499 #[inline(always)]
1500 fn to_scalars(self) -> [u32; 16] {
1501 unsafe {
1502 core::mem::transmute(self)
1503 }
1504 }
1505 }
1506 impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
1507 #[inline(always)]
1508 fn shuffle_lane_words1230(self) -> Self {
1509 Self::new(unsafe {
1510 [
1511 _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
1512 _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
1513 ]
1514 })
1515 }
1516 #[inline(always)]
1517 fn shuffle_lane_words2301(self) -> Self {
1518 Self::new(unsafe {
1519 [
1520 _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
1521 _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
1522 ]
1523 })
1524 }
1525 #[inline(always)]
1526 fn shuffle_lane_words3012(self) -> Self {
1527 Self::new(unsafe {
1528 [
1529 _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
1530 _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
1531 ]
1532 })
1533 }
1534 }
1535 impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
1536 impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
1537 macro_rules! shuf_lane_bytes {
1538 ($name:ident, $k0:expr, $k1:expr) => {
1539 #[inline(always)]
1540 fn $name(self) -> Self {
1541 Self::new(unsafe {
1542 [
1543 _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
1544 _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
1545 ]
1546 })
1547 }
1548 };
1549 }
1550 macro_rules! rotr_32 {
1551 ($name:ident, $i:expr) => {
1552 #[inline(always)]
1553 fn $name(self) -> Self {
1554 Self::new(unsafe {
1555 [
1556 _mm256_or_si256(
1557 _mm256_srli_epi32(self.x[0], $i as i32),
1558 _mm256_slli_epi32(self.x[0], 32 - $i as i32),
1559 ),
1560 _mm256_or_si256(
1561 _mm256_srli_epi32(self.x[1], $i as i32),
1562 _mm256_slli_epi32(self.x[1], 32 - $i as i32),
1563 ),
1564 ]
1565 })
1566 }
1567 };
1568 }
1569 impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
1570 rotr_32!(rotate_each_word_right7, 7);
1571 shuf_lane_bytes!(
1572 rotate_each_word_right8,
1573 0x0c0f0e0d_080b0a09,
1574 0x04070605_00030201
1575 );
1576 rotr_32!(rotate_each_word_right11, 11);
1577 rotr_32!(rotate_each_word_right12, 12);
1578 shuf_lane_bytes!(
1579 rotate_each_word_right16,
1580 0x0d0c0f0e_09080b0a,
1581 0x05040706_01000302
1582 );
1583 rotr_32!(rotate_each_word_right20, 20);
1584 shuf_lane_bytes!(
1585 rotate_each_word_right24,
1586 0x0e0d0c0f_0a09080b,
1587 0x06050407_02010003
1588 );
1589 rotr_32!(rotate_each_word_right25, 25);
1590 }
1591 impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
1592 impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
1593 #[inline(always)]
1594 fn from(x: u32x4x4_avx2<NI>) -> Self {
1595 Self {
1596 avx: [
1597 vec256_storage { avx: x.x[0] },
1598 vec256_storage { avx: x.x[1] },
1599 ],
1600 }
1601 }
1602 }
1603
1604 macro_rules! impl_assign {
1605 ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
1606 impl<NI> $Assign for $vec<NI>
1607 where
1608 NI: Copy,
1609 {
1610 #[inline(always)]
1611 fn $assign_fn(&mut self, rhs: Self) {
1612 *self = self.$bin_fn(rhs);
1613 }
1614 }
1615 };
1616 }
1617 impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
1618 impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
1619 impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
1620 impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
1621
1622 macro_rules! impl_bitop_x2 {
1623 ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
1624 impl<NI> $Op for $vec<NI> {
1625 type Output = Self;
1626 #[inline(always)]
1627 fn $op_fn(self, rhs: Self) -> Self::Output {
1628 Self::new(unsafe {
1629 [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
1630 })
1631 }
1632 }
1633 };
1634 }
1635 impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
1636 impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
1637 impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
1638 impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
1639 impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
1640
1641 impl<NI> Not for u32x4x4_avx2<NI> {
1642 type Output = Self;
1643 #[inline(always)]
1644 fn not(self) -> Self::Output {
1645 unsafe {
1646 let f = _mm256_set1_epi8(-0x7f);
1647 Self::new([f, f]) ^ self
1648 }
1649 }
1650 }
1651
1652 impl<NI> BSwap for u32x4x4_avx2<NI> {
1653 shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
1654 }
1655
1656 impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
1657 where
1658 NI: Copy,
1659 {
1660 #[inline(always)]
1661 fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
1662 Self::new(unsafe {
1663 [
1664 _mm256_setr_m128i(x.0[0].x, x.0[1].x),
1665 _mm256_setr_m128i(x.0[2].x, x.0[3].x),
1666 ]
1667 })
1668 }
1669 }
1670}