core/stdarch/crates/core_arch/src/x86/
avx2.rs

1//! Advanced Vector Extensions 2 (AVX2)
2//!
3//! AVX2 expands most AVX commands to 256-bit wide vector registers and
4//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5//!
6//! The references are:
7//!
8//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9//!   Instruction Set Reference, A-Z][intel64_ref].
10//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11//!   System Instructions][amd64_ref].
12//!
13//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14//! overview of the instructions available.
15//!
16//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21use crate::core_arch::{simd::*, x86::*};
22use crate::intrinsics::simd::*;
23
24#[cfg(test)]
25use stdarch_test::assert_instr;
26
/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        // Lane-wise: negate the lanes that compare less than zero, keep the rest.
        // NOTE(review): `i32::MIN` presumably maps to itself (wrapping negation),
        // matching `vpabsd` — confirm against the Intel docs.
        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
41
/// Computes the absolute values of packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        // Lane-wise: negate the negative lanes, keep the non-negative ones.
        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
56
/// Computes the absolute values of packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        // Lane-wise: negate the negative lanes, keep the non-negative ones.
        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
71
/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
    // Lane-wise add on four i64 lanes (vpaddq); overflow wraps per the ISA.
    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
}
82
/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Lane-wise add on eight i32 lanes (vpaddd); overflow wraps per the ISA.
    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
}
93
/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Lane-wise add on sixteen i16 lanes (vpaddw); overflow wraps per the ISA.
    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
}
104
/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Lane-wise add on thirty-two i8 lanes (vpaddb); overflow wraps per the ISA.
    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
}
115
/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Signed saturating add: results clamp to [i8::MIN, i8::MAX] (vpaddsb).
    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
}
126
/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Signed saturating add: results clamp to [i16::MIN, i16::MAX] (vpaddsw).
    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
}
137
/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
    // Unsigned saturating add: results clamp to u8::MAX (vpaddusb).
    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
}
148
/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
    // Unsigned saturating add: results clamp to u16::MAX (vpaddusw).
    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
}
159
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // `vpalignr` operates independently on each 128-bit lane; the three
    // special cases below reduce shifts >= 16 to an equivalent shuffle of
    // at most 15 bytes.

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    unsafe {
        // A shift of exactly 16 bytes drops all of `b` and returns `a` as-is.
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    // Maps destination byte index `i` (0..32) to a shuffle source index.
    // Sources 0..32 address `b`'s bytes, 32..64 address `a`'s bytes; the
    // `% 16` arithmetic keeps each 128-bit lane independent, as `vpalignr`
    // requires.
    const fn mask(shift: u32, i: u32) -> u32 {
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            i + shift
        } else {
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}
241
/// Computes the bitwise AND of 256 bits (representing integer data)
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    // Bitwise AND is lane-width-agnostic; i64x4 is just a convenient view.
    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
}
253
/// Computes the bitwise NOT of 256 bits (representing integer data)
/// in `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Computes `(!a) & b`: XOR against an all-ones vector is bitwise NOT.
        let all_ones = _mm256_set1_epi8(-1);
        transmute(simd_and(
            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
            b.as_i64x4(),
        ))
    }
}
271
/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Widen to u32 so the sum plus rounding bias cannot overflow, then
        // compute the rounding average `(a + b + 1) >> 1` and narrow back.
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
        transmute(simd_cast::<_, u16x16>(r))
    }
}
287
/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Widen to u16 so the sum plus rounding bias cannot overflow, then
        // compute the rounding average `(a + b + 1) >> 1` and narrow back.
        let a = simd_cast::<_, u16x32>(a.as_u8x32());
        let b = simd_cast::<_, u16x32>(b.as_u8x32());
        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
        transmute(simd_cast::<_, u8x32>(r))
    }
}
303
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        // Each shuffle index is picked from a 4-entry table indexed by a pair
        // of mask bits. Shuffle sources 0..4 are `a`'s lanes, 4..8 are `b`'s.
        // The tables are laid out so that even lanes react only to the lower
        // bit of the pair and odd lanes only to the upper bit, giving the
        // one-mask-bit-per-lane selection the intrinsic specifies.
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}
330
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        // Each shuffle index is picked from a 4-entry table indexed by a pair
        // of mask bits. Shuffle sources 0..8 are `a`'s lanes, 8..16 are `b`'s.
        // The tables are arranged so even lanes respond only to the lower bit
        // of their pair and odd lanes only to the upper bit — one mask bit
        // per lane, as the intrinsic specifies.
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}
361
/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        // Same table trick as `_mm256_blend_epi32`: each 4-entry table is
        // indexed by a pair of mask bits; even lanes use the pair's low bit,
        // odd lanes its high bit. `vpblendw` reuses the 8-bit mask for both
        // 128-bit halves, so lanes 8..16 repeat the same bit pattern as 0..8.
        // Shuffle sources 0..16 are `a`'s lanes, 16..32 are `b`'s.
        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}
401
/// Blends packed 8-bit integers from `a` and `b` using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
    unsafe {
        // `simd_lt(mask, 0)` turns each byte's sign bit into an all-ones /
        // all-zeroes lane, matching `vpblendvb`'s "top bit selects" semantics:
        // a set sign bit picks the byte from `b`, a clear one picks `a`.
        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
    }
}
415
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        // All indices are 0, so every output lane is `a`'s lane 0; the zero
        // vector is only a placeholder second shuffle operand.
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}
430
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        // 32 indices of 0 widen `a`'s lane 0 into all lanes of an i8x32; the
        // zero vector is only a placeholder second shuffle operand.
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}
445
// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        // All indices are 0, so every output lane is `a`'s lane 0; the zero
        // vector is only a placeholder second shuffle operand.
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}
462
// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        // 8 indices of 0 widen `a`'s lane 0 into all lanes of an i32x8; the
        // zero vector is only a placeholder second shuffle operand.
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}
479
/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        // Duplicate lane 0 of `a` into both i64 lanes.
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}
496
/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        // Duplicate lane 0 of `a` into all four i64 lanes of the wider vector.
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}
511
/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    // All indices are 0: lane 0 of `a` fills both lanes; the zero vector is
    // only a placeholder second shuffle operand.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}
523
/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    // All indices are 0: lane 0 of `a` fills all four lanes; the zero vector
    // is only a placeholder second shuffle operand.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}
535
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices [0, 1, 0, 1] repeat both i64 lanes of `a`, duplicating the
        // whole 128-bit value into both halves of the 256-bit result.
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}
549
// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices [0, 1, 0, 1] repeat both i64 lanes of `a`, duplicating the
        // whole 128-bit value into both halves of the 256-bit result.
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}
565
/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    // All indices are 0: lane 0 of `a` fills all four lanes; the zero vector
    // is only a placeholder second shuffle operand.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}
577
/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    // All indices are 0: lane 0 of `a` fills all eight lanes; the zero vector
    // is only a placeholder second shuffle operand.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}
589
/// Broadcasts the low packed 16-bit integer from a to all elements of
/// the 128-bit returned value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        // All indices are 0, so every output lane is `a`'s lane 0; the zero
        // vector is only a placeholder second shuffle operand.
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}
604
/// Broadcasts the low packed 16-bit integer from a to all elements of
/// the 256-bit returned value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        // 16 indices of 0 widen `a`'s lane 0 into all lanes of an i16x16; the
        // zero vector is only a placeholder second shuffle operand.
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}
619
/// Compares packed 64-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
    // Each lane becomes all-ones when equal, all-zeroes otherwise (vpcmpeqq).
    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
}
630
/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Each lane becomes all-ones when equal, all-zeroes otherwise (vpcmpeqd).
    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
}
641
/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Each lane becomes all-ones when equal, all-zeroes otherwise (vpcmpeqw).
    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
}
652
/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Each lane becomes all-ones when equal, all-zeroes otherwise (vpcmpeqb).
    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
}
663
/// Compares packed 64-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
    // Signed greater-than: all-ones lane where a > b, all-zeroes otherwise.
    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
}
674
/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Signed greater-than: all-ones lane where a > b, all-zeroes otherwise.
    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
}
685
/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Signed greater-than: all-ones lane where a > b, all-zeroes otherwise.
    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
}
696
/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Signed greater-than: all-ones lane where a > b, all-zeroes otherwise.
    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
}
707
/// Sign-extend 16-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
    // simd_cast between signed integer vectors sign-extends each lane.
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
}
718
/// Sign-extend 16-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i16x8();
        // Only the low four i16 lanes fit into four i64 lanes; extract them
        // first, then sign-extend with simd_cast.
        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}
733
/// Sign-extend 32-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
    // simd_cast between signed integer vectors sign-extends each lane.
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
}
744
/// Sign-extend 8-bit integers to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
    // simd_cast between signed integer vectors sign-extends each lane.
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
}
755
/// Sign-extend 8-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        // Only the low eight i8 lanes fit into eight i32 lanes; extract them
        // first, then sign-extend with simd_cast.
        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}
770
771/// Sign-extend 8-bit integers to 64-bit integers.
772///
773/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
774#[inline]
775#[target_feature(enable = "avx2")]
776#[cfg_attr(test, assert_instr(vpmovsxbq))]
777#[stable(feature = "simd_x86", since = "1.27.0")]
778pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
779    unsafe {
780        let a = a.as_i8x16();
781        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
782        transmute::<i64x4, _>(simd_cast(v32))
783    }
784}
785
786/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
787/// integers, and stores the results in `dst`.
788///
789/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
790#[inline]
791#[target_feature(enable = "avx2")]
792#[cfg_attr(test, assert_instr(vpmovzxwd))]
793#[stable(feature = "simd_x86", since = "1.27.0")]
794pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
795    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
796}
797
798/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
799/// integers. The upper four elements of `a` are unused.
800///
801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
802#[inline]
803#[target_feature(enable = "avx2")]
804#[cfg_attr(test, assert_instr(vpmovzxwq))]
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
807    unsafe {
808        let a = a.as_u16x8();
809        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
810        transmute::<i64x4, _>(simd_cast(v64))
811    }
812}
813
814/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
815///
816/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
817#[inline]
818#[target_feature(enable = "avx2")]
819#[cfg_attr(test, assert_instr(vpmovzxdq))]
820#[stable(feature = "simd_x86", since = "1.27.0")]
821pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
822    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
823}
824
825/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
826///
827/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
828#[inline]
829#[target_feature(enable = "avx2")]
830#[cfg_attr(test, assert_instr(vpmovzxbw))]
831#[stable(feature = "simd_x86", since = "1.27.0")]
832pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
833    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
834}
835
836/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
837/// integers. The upper eight elements of `a` are unused.
838///
839/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
840#[inline]
841#[target_feature(enable = "avx2")]
842#[cfg_attr(test, assert_instr(vpmovzxbd))]
843#[stable(feature = "simd_x86", since = "1.27.0")]
844pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
845    unsafe {
846        let a = a.as_u8x16();
847        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
848        transmute::<i32x8, _>(simd_cast(v64))
849    }
850}
851
852/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
853/// integers. The upper twelve elements of `a` are unused.
854///
855/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
856#[inline]
857#[target_feature(enable = "avx2")]
858#[cfg_attr(test, assert_instr(vpmovzxbq))]
859#[stable(feature = "simd_x86", since = "1.27.0")]
860pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
861    unsafe {
862        let a = a.as_u8x16();
863        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
864        transmute::<i64x4, _>(simd_cast(v32))
865    }
866}
867
/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`:
/// `IMM1 == 0` selects the low half, `IMM1 == 1` the high half.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
// NOTE(review): the asserted mnemonic is vextractf128 rather than
// vextracti128 — presumably because compilers may emit either encoding for
// this extraction; confirm against current codegen if this ever fails.
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    // Only one selector bit is meaningful.
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        // The shuffle indices below only reference `a` (indices 0..4), so the
        // second operand is never read; a zero vector keeps the call well-formed.
        let b = i64x4::ZERO;
        // [0, 1] picks the low two i64 lanes (low 128 bits), [2, 3] the high two.
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}
885
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // First element of every adjacent pair. Within each 128-bit half the
        // results are ordered pairs-from-`a` then pairs-from-`b` (shuffle
        // indices >= 16 select from `b`), matching vphaddw's output layout.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        // Second element of the same pairs, in identical order.
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        // Lane-wise (wrapping) sum produces every horizontal pair sum.
        simd_add(even, odd).as_m256i()
    }
}
910
/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        // First element of every adjacent pair; within each 128-bit half the
        // order is pairs-from-`a` then pairs-from-`b` (indices >= 8 select
        // from `b`), matching vphaddd's output layout.
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        // Second element of the same pairs, in identical order.
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        // Lane-wise (wrapping) sum produces every horizontal pair sum.
        simd_add(even, odd).as_m256i()
    }
}
927
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    // The saturating pairwise add is delegated to the raw `phaddsw` intrinsic.
    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
}
939
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // First element of every adjacent pair. Within each 128-bit half the
        // results are ordered pairs-from-`a` then pairs-from-`b` (shuffle
        // indices >= 16 select from `b`), matching vphsubw's output layout.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        // Second element of the same pairs, in identical order.
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        // first - second for each pair, lane-wise (wrapping).
        simd_sub(even, odd).as_m256i()
    }
}
964
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        // First element of every adjacent pair; within each 128-bit half the
        // order is pairs-from-`a` then pairs-from-`b` (indices >= 8 select
        // from `b`), matching vphsubd's output layout.
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        // Second element of the same pairs, in identical order.
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        // first - second for each pair, lane-wise (wrapping).
        simd_sub(even, odd).as_m256i()
    }
}
981
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // The saturating pairwise subtract is delegated to the raw `phsubsw` intrinsic.
    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
}
993
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-ones mask: every lane is loaded from memory.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1016
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Reinterpret the opaque wrapper types into the lane layouts expected by
    // the raw gather intrinsic.
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1042
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-ones mask: every lane is loaded from memory.
    let zero = i32x8::ZERO;
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
    let offsets = offsets.as_i32x8();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1065
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
    src: __m256i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Reinterpret the opaque wrapper types into the lane layouts expected by
    // the raw gather intrinsic.
    let src = src.as_i32x8();
    let mask = mask.as_i32x8();
    let offsets = offsets.as_i32x8();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1091
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-ones bit pattern (-1.0 has every mask bit
    // of interest set): every lane is loaded from memory.
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1110
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // `src` and `mask` already have the float layout the intrinsic expects;
    // only the offsets need reinterpreting.
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    pgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1133
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-set mask pattern: every lane is loaded.
    let zero = _mm256_setzero_ps();
    let neg_one = _mm256_set1_ps(-1.0);
    let offsets = offsets.as_i32x8();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1152
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
    src: __m256,
    slice: *const f32,
    offsets: __m256i,
    mask: __m256,
) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    // `src` and `mask` already have the float layout the intrinsic expects;
    // only the offsets need reinterpreting.
    let offsets = offsets.as_i32x8();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1175
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-ones mask: every lane is loaded from memory.
    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    // Only the low two 32-bit offsets are consumed for the two 64-bit lanes.
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1198
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Reinterpret the opaque wrapper types into the lane layouts expected by
    // the raw gather intrinsic.
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1224
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-ones mask: every lane is loaded from memory.
    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1247
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Reinterpret the opaque wrapper types into the lane layouts expected by
    // the raw gather intrinsic.
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1273
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-set mask pattern: every lane is loaded.
    let zero = _mm_setzero_pd();
    let neg_one = _mm_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1292
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // `src` and `mask` already have the double layout the intrinsic expects;
    // only the offsets need reinterpreting.
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1315
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m128i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-set mask pattern: every lane is loaded.
    let zero = _mm256_setzero_pd();
    let neg_one = _mm256_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1337
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // `src` and `mask` already have the double layout the intrinsic expects;
    // only the offsets need reinterpreting.
    let offsets = offsets.as_i32x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1360
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-ones bit pattern (set via 64-bit lanes,
    // reinterpreted as four i32 lanes): every lane is loaded from memory.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x2();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1383
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Reinterpret the opaque wrapper types into the lane layouts expected by
    // the raw gather intrinsic.
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i64x2();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1409
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Four 64-bit offsets yield four 32-bit results, so the destination is
    // 128-bit. All-zero source plus an all-ones mask loads every lane.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1432
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Reinterpret the opaque wrapper types into the lane layouts expected by
    // the raw gather intrinsic.
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i64x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1458
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-set mask pattern: every lane is loaded.
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i64x2();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
}
1477
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // `src` and `mask` already have the float layout the intrinsic expects;
    // only the offsets need reinterpreting.
    let offsets = offsets.as_i64x2();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    pgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1500
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // Four 64-bit offsets yield four f32 results, so the destination is
    // 128-bit. All-zero source plus an all-set mask pattern loads every lane.
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i64x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
}
1519
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes in which the highest bit of
/// `mask` is set are gathered from memory; the remaining lanes are copied
/// from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m256i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // `src` and `mask` already have the float layout the intrinsic expects;
    // only the offsets need reinterpreting.
    let offsets = offsets.as_i64x4();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1542
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // All-zero source plus an all-ones mask: every lane is loaded from memory.
    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    // The raw gather intrinsic takes a byte pointer.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1565
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i64x2();
    // Byte pointer: the gather applies `offsets * SCALE` in byte units.
    // A lane is loaded from memory when the highest bit of its mask lane is
    // set, and copied from `src` otherwise.
    let slice = slice as *const i8;
    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1591
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let zero = i64x4::ZERO;
    // A mask of all ones (sign bit set in every lane) gathers all elements
    // unconditionally; `zero` is the (never-used) fallback source.
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    // Byte pointer: the gather applies `offsets * SCALE` in byte units.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1614
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i64x4();
    // Byte pointer: the gather applies `offsets * SCALE` in byte units.
    // A lane is loaded from memory when the highest bit of its mask lane is
    // set, and copied from `src` otherwise.
    let slice = slice as *const i8;
    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1640
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_pd();
    // A mask with the sign bit set in every lane gathers all elements
    // unconditionally; `zero` is the (never-used) fallback source.
    let neg_one = _mm_set1_pd(-1.0);
    // Byte pointer: the gather applies `offsets * SCALE` in byte units.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1659
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // Byte pointer: the gather applies `offsets * SCALE` in byte units.
    // A lane is loaded from memory when the highest bit of its mask lane is
    // set, and copied from `src` otherwise.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1682
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m256i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_pd();
    // A mask with the sign bit set in every lane gathers all elements
    // unconditionally; `zero` is the (never-used) fallback source.
    let neg_one = _mm256_set1_pd(-1.0);
    // Byte pointer: the gather applies `offsets * SCALE` in byte units.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1704
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m256i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // Byte pointer: the gather applies `offsets * SCALE` in byte units.
    // A lane is loaded from memory when the highest bit of its mask lane is
    // set, and copied from `src` otherwise.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1727
/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
/// location specified by `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        // Widen `b` to 256 bits; only its low two i64 lanes are meaningful,
        // and only those (shuffle indices 4 and 5) are selected below.
        let b = _mm256_castsi128_si256(b).as_i64x4();
        // IMM1 == 0 replaces the low 128 bits of `a`, IMM1 == 1 the high
        // 128 bits; indices 0-3 pick from `a`, 4-7 from `b`.
        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
        transmute(dst)
    }
}
1746
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
/// of intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Widen both inputs to i32 lanes first: every 16x16-bit product then
        // fits exactly in 32 bits.
        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
        // Horizontally add adjacent product pairs by separating the even- and
        // odd-indexed lanes and summing lane-wise.
        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
        simd_add(even, odd).as_m256i()
    }
}
1764
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate
/// signed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Thin wrapper: the mixed-signedness multiply-add (with saturation of the
    // pair sums) is delegated to the `vpmaddubsw` intrinsic.
    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
}
1778
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
    // Arithmetic shift by 31 broadcasts each lane's sign bit, turning the
    // highest-bit convention into the all-ones/all-zeros lane mask that the
    // masked load expects. Masked-off lanes take the `i32x4::ZERO` fallback.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
}
1792
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
    // Arithmetic shift by 31 broadcasts each lane's sign bit, turning the
    // highest-bit convention into the all-ones/all-zeros lane mask that the
    // masked load expects. Masked-off lanes take the `i32x8::ZERO` fallback.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
}
1806
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
    // Arithmetic shift by 63 broadcasts each lane's sign bit, turning the
    // highest-bit convention into the all-ones/all-zeros lane mask that the
    // masked load expects. Masked-off lanes take the `i64x2::ZERO` fallback.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
}
1820
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
    // Arithmetic shift by 63 broadcasts each lane's sign bit, turning the
    // highest-bit convention into the all-ones/all-zeros lane mask that the
    // masked load expects. Masked-off lanes take the `i64x4::ZERO` fallback.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
}
1834
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
    // Arithmetic shift by 31 broadcasts each lane's sign bit into a full-lane
    // mask; lanes whose mask is clear leave memory untouched.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
}
1848
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
    // Arithmetic shift by 31 broadcasts each lane's sign bit into a full-lane
    // mask; lanes whose mask is clear leave memory untouched.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
}
1862
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
    // Arithmetic shift by 63 broadcasts each lane's sign bit into a full-lane
    // mask; lanes whose mask is clear leave memory untouched.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
}
1876
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
    // Arithmetic shift by 63 broadcasts each lane's sign bit into a full-lane
    // mask; lanes whose mask is clear leave memory untouched.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
}
1890
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();
        // Compare-and-select idiom: lowered to a single `vpmaxsw`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
    }
}
1906
/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        // Compare-and-select idiom: lowered to a single `vpmaxsd`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
    }
}
1922
/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        let b = b.as_i8x32();
        // Compare-and-select idiom: lowered to a single `vpmaxsb`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
    }
}
1938
/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
/// the packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned views give an unsigned comparison; the select's mask type
        // (`i16x16`) only names the lane shape.
        let a = a.as_u16x16();
        let b = b.as_u16x16();
        // Compare-and-select idiom: lowered to a single `vpmaxuw`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
    }
}
1954
/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
/// the packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned views give an unsigned comparison.
        let a = a.as_u32x8();
        let b = b.as_u32x8();
        // Compare-and-select idiom: lowered to a single `vpmaxud`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
    }
}
1970
/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
/// the packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned views give an unsigned comparison.
        let a = a.as_u8x32();
        let b = b.as_u8x32();
        // Compare-and-select idiom: lowered to a single `vpmaxub`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
    }
}
1986
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();
        // Compare-and-select idiom: lowered to a single `vpminsw`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
    }
}
2002
/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        // Compare-and-select idiom: lowered to a single `vpminsd`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
    }
}
2018
/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        let b = b.as_i8x32();
        // Compare-and-select idiom: lowered to a single `vpminsb`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
    }
}
2034
/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
/// the packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned views give an unsigned comparison.
        let a = a.as_u16x16();
        let b = b.as_u16x16();
        // Compare-and-select idiom: lowered to a single `vpminuw`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
    }
}
2050
/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
/// the packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned views give an unsigned comparison.
        let a = a.as_u32x8();
        let b = b.as_u32x8();
        // Compare-and-select idiom: lowered to a single `vpminud`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
    }
}
2066
/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
/// the packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned views give an unsigned comparison.
        let a = a.as_u8x32();
        let b = b.as_u8x32();
        // Compare-and-select idiom: lowered to a single `vpminub`
        // (checked by `assert_instr` above).
        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
    }
}
2082
/// Creates mask from the most significant bit of each 8-bit element in `a`,
/// return the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovmskb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
    unsafe {
        let z = i8x32::ZERO;
        // A signed less-than-zero compare is true exactly when the lane's
        // most significant bit is set, giving a per-byte boolean mask.
        let m: i8x32 = simd_lt(a.as_i8x32(), z);
        // Pack one bit per lane into a 32-bit scalar (lane 0 -> bit 0).
        simd_bitmask::<_, u32>(m) as i32
    }
}
2098
/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
/// results in dst. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at on the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    // IMM8 encodes the quadruplet offsets (see doc above); validate it fits
    // in 8 bits, then delegate to the `vmpsadbw` intrinsic.
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
}
2117
/// Multiplies the low 32-bit integers from each packed 64-bit element in
/// `a` and `b`
///
/// Returns the 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Truncate each 64-bit lane to its low 32 bits, then sign-extend
        // back, so only the low halves contribute to the full-width product.
        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
        // A 32x32-bit signed product always fits in 64 bits, so this cannot
        // overflow.
        transmute(simd_mul(a, b))
    }
}
2135
/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
/// element in `a` and `b`
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_u64x4();
        let b = b.as_u64x4();
        // Zero the high 32 bits of each lane; the full 64-bit product of the
        // two low halves then cannot overflow.
        let mask = u64x4::splat(u32::MAX.into());
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
2154
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Sign-extend to 32 bits so the full product is exact.
        let a = simd_cast::<_, i32x16>(a.as_i16x16());
        let b = simd_cast::<_, i32x16>(b.as_i16x16());
        // Arithmetic shift right by 16 leaves the high half of each product;
        // the narrowing cast then truncates back to 16-bit lanes.
        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
        transmute(simd_cast::<i32x16, i16x16>(r))
    }
}
2172
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Zero-extend to 32 bits so the full unsigned product is exact.
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        // Logical shift right by 16 leaves the high half of each product;
        // the narrowing cast then truncates back to 16-bit lanes.
        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
        transmute(simd_cast::<u32x16, u16x16>(r))
    }
}
2190
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers, and returns the low 16 bits of the
/// intermediate integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
    // A wrapping 16-bit multiply is exactly the low 16 bits of the 32-bit
    // product, so no widening is needed.
    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
}
2203
/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of the
/// intermediate integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
    // A wrapping 32-bit multiply is exactly the low 32 bits of the 64-bit
    // product, so no widening is needed.
    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
}
2216
/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncate each intermediate
/// integer to the 18 most significant bits, round by adding 1, and
/// return bits `[16:1]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhrsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Thin wrapper: the rounding high-multiply is delegated to the
    // `vpmulhrsw` intrinsic.
    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
}
2230
/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
/// and `b`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
    // The lane width is irrelevant for a bitwise OR; `i32x8` is just a
    // convenient view of the 256 bits.
    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
}
2242
/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpacksswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Thin wrapper: signed saturating narrowing is delegated to the
    // `vpacksswb` intrinsic.
    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
}
2254
/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Thin wrapper: signed saturating narrowing is delegated to the
    // `vpackssdw` intrinsic.
    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
}
2266
/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Thin wrapper: unsigned saturating narrowing (of signed inputs) is
    // delegated to the `vpackuswb` intrinsic.
    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
}
2278
/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Defer to the `vpackusdw` LLVM intrinsic; inputs are signed 32-bit,
    // output saturates to the unsigned 16-bit range.
    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
}
2290
/// Permutes packed 32-bit integers from `a` according to the content of `b`.
///
/// The last 3 bits of each integer of `b` are used as addresses into the 8
/// integers of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Defer to the `permd` LLVM intrinsic; the dword indices in `b` may
    // select from either 128-bit half of `a` (a full cross-lane permute).
    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
}
2304
/// Permutes 64-bit integers from `a` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // `simd_shuffle!` needs a second operand; `zero` is never selected
        // because every index below is masked to 0..=3 (within `a`).
        let zero = i64x4::ZERO;
        // Each 2-bit field of IMM8 picks the source qword for one output lane.
        let r: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            zero,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(r)
    }
}
2330
/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Same operation as the AVX float-domain variant; reuse it directly.
    _mm256_permute2f128_si256::<IMM8>(a, b)
}
2343
/// Shuffles 64-bit floating-point elements in `a` across lanes using the
/// control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // The undefined second operand is never selected: every index is
        // masked to 0..=3, i.e. within `a`. Each 2-bit field of IMM8 picks
        // the source element for one output lane.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
2368
/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
/// the corresponding 32-bit integer index in `idx`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
    // Defer to the `permps` LLVM intrinsic (cross-lane float permute).
    unsafe { permps(a, idx.as_i32x8()) }
}
2380
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to
/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
/// integers in the low 16 bits of the four 64-bit elements of the return
/// value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
    // Defer to the `psadbw` LLVM intrinsic (sum of absolute differences).
    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
}
2394
/// Shuffles bytes from `a` according to the content of `b`.
///
/// For each of the 128-bit low and high halves of the vectors, the last
/// 4 bits of each byte of `b` are used as addresses into the respective
/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
///
/// In addition, if the highest significant bit of a byte of `b` is set, the
/// respective destination byte is set to 0.
///
/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
/// equivalent to:
///
/// ```
/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
///     let mut r = [0; 32];
///     for i in 0..16 {
///         // if the most significant bit of b is set,
///         // then the destination byte is set to 0.
///         if b[i] & 0x80 == 0u8 {
///             r[i] = a[(b[i] % 16) as usize];
///         }
///         if b[i + 16] & 0x80 == 0u8 {
///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
///         }
///     }
///     r
/// }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Defer to the `pshufb` LLVM intrinsic; see the doc comment above for
    // the exact per-half shuffle semantics.
    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
}
2432
/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
/// `imm8`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
///
/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
///
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // The same 2-bit selectors are applied to both 128-bit lanes: the
        // first four indices address the low lane (0..=3), the last four are
        // the identical selectors offset by 4 for the high lane.
        let r: i32x8 = simd_shuffle!(
            a.as_i32x8(),
            a.as_i32x8(),
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                (MASK as u32 >> 4) & 0b11,
                (MASK as u32 >> 6) & 0b11,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(r)
    }
}
2488
/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
/// to the output.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        // Indices 0..=3 and 8..=11 pass the low quadword of each lane
        // through unchanged; the `4 + …` / `12 + …` entries apply the 2-bit
        // IMM8 selectors within each lane's high quadword.
        let r: i16x16 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                4 + (IMM8 as u32 & 0b11),
                4 + ((IMM8 as u32 >> 2) & 0b11),
                4 + ((IMM8 as u32 >> 4) & 0b11),
                4 + ((IMM8 as u32 >> 6) & 0b11),
                8,
                9,
                10,
                11,
                12 + (IMM8 as u32 & 0b11),
                12 + ((IMM8 as u32 >> 2) & 0b11),
                12 + ((IMM8 as u32 >> 4) & 0b11),
                12 + ((IMM8 as u32 >> 6) & 0b11),
            ],
        );
        transmute(r)
    }
}
2528
/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
/// to the output.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        // The `0 + …` / `8 + …` entries apply the 2-bit IMM8 selectors within
        // each lane's low quadword; indices 4..=7 and 12..=15 pass the high
        // quadword of each lane through unchanged.
        let r: i16x16 = simd_shuffle!(
            a,
            a,
            [
                0 + (IMM8 as u32 & 0b11),
                0 + ((IMM8 as u32 >> 2) & 0b11),
                0 + ((IMM8 as u32 >> 4) & 0b11),
                0 + ((IMM8 as u32 >> 6) & 0b11),
                4,
                5,
                6,
                7,
                8 + (IMM8 as u32 & 0b11),
                8 + ((IMM8 as u32 >> 2) & 0b11),
                8 + ((IMM8 as u32 >> 4) & 0b11),
                8 + ((IMM8 as u32 >> 6) & 0b11),
                12,
                13,
                14,
                15,
            ],
        );
        transmute(r)
    }
}
2568
/// Negates packed 16-bit integers in `a` when the corresponding signed
/// 16-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Defer to the `psignw` LLVM intrinsic.
    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
}
2581
/// Negates packed 32-bit integers in `a` when the corresponding signed
/// 32-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Defer to the `psignd` LLVM intrinsic.
    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
}
2594
/// Negates packed 8-bit integers in `a` when the corresponding signed
/// 8-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Defer to the `psignb` LLVM intrinsic.
    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
}
2607
/// Shifts packed 16-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `psllw` LLVM intrinsic; all lanes shift by the same
    // amount taken from `count`.
    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
}
2619
/// Shifts packed 32-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `pslld` LLVM intrinsic; all lanes shift by the same
    // amount taken from `count`.
    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
}
2631
/// Shifts packed 64-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `psllq` LLVM intrinsic; all lanes shift by the same
    // amount taken from `count`.
    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
}
2643
/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, return the results;
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Shift counts >= the lane width would be UB for `simd_shl`, so the
        // hardware behavior (result of all zeros) is handled explicitly.
        if IMM8 >= 16 {
            _mm256_setzero_si256()
        } else {
            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
        }
    }
}
2663
2664/// Shifts packed 32-bit integers in `a` left by `IMM8` while
2665/// shifting in zeros, return the results;
2666///
2667/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2668#[inline]
2669#[target_feature(enable = "avx2")]
2670#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2671#[rustc_legacy_const_generics(1)]
2672#[stable(feature = "simd_x86", since = "1.27.0")]
2673pub fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2674    unsafe {
2675        static_assert_uimm_bits!(IMM8, 8);
2676        if IMM8 >= 32 {
2677            _mm256_setzero_si256()
2678        } else {
2679            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2680        }
2681    }
2682}
2683
2684/// Shifts packed 64-bit integers in `a` left by `IMM8` while
2685/// shifting in zeros, return the results;
2686///
2687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2688#[inline]
2689#[target_feature(enable = "avx2")]
2690#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2691#[rustc_legacy_const_generics(1)]
2692#[stable(feature = "simd_x86", since = "1.27.0")]
2693pub fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2694    unsafe {
2695        static_assert_uimm_bits!(IMM8, 8);
2696        if IMM8 >= 64 {
2697            _mm256_setzero_si256()
2698        } else {
2699            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2700        }
2701    }
2702}
2703
/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Alias of `_mm256_bslli_epi128` (both map to `vpslldq`).
    _mm256_bslli_epi128::<IMM8>(a)
}
2716
/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the shuffle index for output byte `i`. In the `simd_shuffle!`
    // below the first operand (all zeros) occupies indices 0..=31 and `a`
    // occupies 32..=63, so returning 0 selects a zero byte and `32 + n`
    // selects byte `n` of `a`.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        // Shifts of 16 bytes or more clear the whole lane; within a lane,
        // the low `shift` bytes are zero-filled.
        if shift > 15 || i % 16 < shift {
            0
        } else {
            32 + (i - shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
2778
/// Shifts packed 32-bit integers in `a` left by the amount
/// specified by the corresponding element in `count` while
/// shifting in zeros, and returns the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u32x4();
        // Per-lane shifts >= 32 would be UB for `simd_shl`; the selects below
        // replace such counts with 0 and then force those result lanes to
        // zero, matching `vpsllvd` hardware behavior.
        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
        let count = simd_select(no_overflow, count, u32x4::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
    }
}
2796
/// Shifts packed 32-bit integers in `a` left by the amount
/// specified by the corresponding element in `count` while
/// shifting in zeros, and returns the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u32x8();
        // Per-lane shifts >= 32 would be UB for `simd_shl`; the selects below
        // replace such counts with 0 and then force those result lanes to
        // zero, matching `vpsllvd` hardware behavior.
        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
        let count = simd_select(no_overflow, count, u32x8::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
    }
}
2814
/// Shifts packed 64-bit integers in `a` left by the amount
/// specified by the corresponding element in `count` while
/// shifting in zeros, and returns the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u64x2();
        // Per-lane shifts >= 64 would be UB for `simd_shl`; the selects below
        // replace such counts with 0 and then force those result lanes to
        // zero, matching `vpsllvq` hardware behavior.
        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
        let count = simd_select(no_overflow, count, u64x2::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
    }
}
2832
/// Shifts packed 64-bit integers in `a` left by the amount
/// specified by the corresponding element in `count` while
/// shifting in zeros, and returns the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u64x4();
        // Per-lane shifts >= 64 would be UB for `simd_shl`; the selects below
        // replace such counts with 0 and then force those result lanes to
        // zero, matching `vpsllvq` hardware behavior.
        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
        let count = simd_select(no_overflow, count, u64x4::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
    }
}
2850
/// Shifts packed 16-bit integers in `a` right by `count` while
/// shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `psraw` LLVM intrinsic (arithmetic shift, all lanes use
    // the same count).
    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
}
2862
/// Shifts packed 32-bit integers in `a` right by `count` while
/// shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `psrad` LLVM intrinsic (arithmetic shift, all lanes use
    // the same count).
    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
}
2874
/// Shifts packed 16-bit integers in `a` right by `IMM8` while
/// shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Clamp the count to 15: an arithmetic shift by >= 15 already fills every
    // lane with its sign bit, and `simd_shr` by >= the lane width is UB.
    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
}
2888
/// Shifts packed 32-bit integers in `a` right by `IMM8` while
/// shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Clamp the count to 31: an arithmetic shift by >= 31 already fills every
    // lane with its sign bit, and `simd_shr` by >= the lane width is UB.
    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
}
2902
/// Shifts packed 32-bit integers in `a` right by the amount specified by the
/// corresponding element in `count` while shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsravd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u32x4();
        // Lanes with counts >= 32 are clamped to 31: arithmetic shift by 31
        // fills the lane with the sign bit, matching `vpsravd` hardware
        // behavior, while `simd_shr` by >= the lane width would be UB.
        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
        let count = simd_select(no_overflow, transmute(count), i32x4::splat(31));
        simd_shr(a.as_i32x4(), count).as_m128i()
    }
}
2919
/// Shifts packed 32-bit integers in `a` right by the amount specified by the
/// corresponding element in `count` while shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsravd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u32x8();
        // Lanes with counts >= 32 are clamped to 31: arithmetic shift by 31
        // fills the lane with the sign bit, matching `vpsravd` hardware
        // behavior, while `simd_shr` by >= the lane width would be UB.
        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
        let count = simd_select(no_overflow, transmute(count), i32x8::splat(31));
        simd_shr(a.as_i32x8(), count).as_m256i()
    }
}
2936
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Alias of `_mm256_bsrli_epi128` (both map to `vpsrldq`).
    _mm256_bsrli_epi128::<IMM8>(a)
}
2949
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the shuffle index for output byte `i`. In the `simd_shuffle!`
    // below the first operand (all zeros) occupies indices 0..=31 and `a`
    // occupies 32..=63, so returning 0 selects a zero byte and `32 + n`
    // selects byte `n` of `a`.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        // Shifts of 16 bytes or more clear the whole lane; within a lane,
        // the high `shift` bytes are zero-filled.
        if shift > 15 || (15 - (i % 16)) < shift {
            0
        } else {
            32 + (i + shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
3011
/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `psrlw` LLVM intrinsic; all lanes shift by the same
    // amount taken from `count`.
    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
}
3023
/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `psrld` LLVM intrinsic; all lanes shift by the same
    // amount taken from `count`.
    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
}
3035
/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
    // Defer to the `psrlq` LLVM intrinsic; all lanes shift by the same
    // amount taken from `count`.
    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
}
3047
3048/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3049/// zeros
3050///
3051/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3052#[inline]
3053#[target_feature(enable = "avx2")]
3054#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3055#[rustc_legacy_const_generics(1)]
3056#[stable(feature = "simd_x86", since = "1.27.0")]
3057pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3058    static_assert_uimm_bits!(IMM8, 8);
3059    unsafe {
3060        if IMM8 >= 16 {
3061            _mm256_setzero_si256()
3062        } else {
3063            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3064        }
3065    }
3066}
3067
3068/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3069/// zeros
3070///
3071/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3072#[inline]
3073#[target_feature(enable = "avx2")]
3074#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3075#[rustc_legacy_const_generics(1)]
3076#[stable(feature = "simd_x86", since = "1.27.0")]
3077pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3078    static_assert_uimm_bits!(IMM8, 8);
3079    unsafe {
3080        if IMM8 >= 32 {
3081            _mm256_setzero_si256()
3082        } else {
3083            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3084        }
3085    }
3086}
3087
3088/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3089/// zeros
3090///
3091/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3092#[inline]
3093#[target_feature(enable = "avx2")]
3094#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3095#[rustc_legacy_const_generics(1)]
3096#[stable(feature = "simd_x86", since = "1.27.0")]
3097pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3098    static_assert_uimm_bits!(IMM8, 8);
3099    unsafe {
3100        if IMM8 >= 64 {
3101            _mm256_setzero_si256()
3102        } else {
3103            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3104        }
3105    }
3106}
3107
3108/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3110///
3111/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3112#[inline]
3113#[target_feature(enable = "avx2")]
3114#[cfg_attr(test, assert_instr(vpsrlvd))]
3115#[stable(feature = "simd_x86", since = "1.27.0")]
3116pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3117    unsafe {
3118        let count = count.as_u32x4();
3119        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3120        let count = simd_select(no_overflow, count, u32x4::ZERO);
3121        simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
3122    }
3123}
3124
3125/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3127///
3128/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3129#[inline]
3130#[target_feature(enable = "avx2")]
3131#[cfg_attr(test, assert_instr(vpsrlvd))]
3132#[stable(feature = "simd_x86", since = "1.27.0")]
3133pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3134    unsafe {
3135        let count = count.as_u32x8();
3136        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3137        let count = simd_select(no_overflow, count, u32x8::ZERO);
3138        simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
3139    }
3140}
3141
3142/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3144///
3145/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3146#[inline]
3147#[target_feature(enable = "avx2")]
3148#[cfg_attr(test, assert_instr(vpsrlvq))]
3149#[stable(feature = "simd_x86", since = "1.27.0")]
3150pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3151    unsafe {
3152        let count = count.as_u64x2();
3153        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
3154        let count = simd_select(no_overflow, count, u64x2::ZERO);
3155        simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
3156    }
3157}
3158
3159/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3161///
3162/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3163#[inline]
3164#[target_feature(enable = "avx2")]
3165#[cfg_attr(test, assert_instr(vpsrlvq))]
3166#[stable(feature = "simd_x86", since = "1.27.0")]
3167pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3168    unsafe {
3169        let count = count.as_u64x4();
3170        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
3171        let count = simd_select(no_overflow, count, u64x4::ZERO);
3172        simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
3173    }
3174}
3175
3176/// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
3177/// must be aligned on a 32-byte boundary or a general-protection exception may be generated. To
3178/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
3179///
3180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
    let dst: __m256i;
    // Emitted as inline assembly so the non-temporal load cannot be replaced
    // by an ordinary load during optimization. NOTE(review): `vpl!` is defined
    // elsewhere in this crate — presumably it appends the `{p}` memory operand
    // in the active assembly syntax; confirm against its definition.
    // `pure, readonly` tells the compiler this only reads memory and has no
    // other side effects; `preserves_flags` because `vmovntdqa` leaves RFLAGS
    // untouched.
    crate::arch::asm!(
        vpl!("vmovntdqa {a}"),
        a = out(ymm_reg) dst,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}
3195
3196/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
3197///
3198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3199#[inline]
3200#[target_feature(enable = "avx2")]
3201#[cfg_attr(test, assert_instr(vpsubw))]
3202#[stable(feature = "simd_x86", since = "1.27.0")]
3203pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3204    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3205}
3206
3207/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`
3208///
3209/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3210#[inline]
3211#[target_feature(enable = "avx2")]
3212#[cfg_attr(test, assert_instr(vpsubd))]
3213#[stable(feature = "simd_x86", since = "1.27.0")]
3214pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3215    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3216}
3217
3218/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`
3219///
3220/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3221#[inline]
3222#[target_feature(enable = "avx2")]
3223#[cfg_attr(test, assert_instr(vpsubq))]
3224#[stable(feature = "simd_x86", since = "1.27.0")]
3225pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3226    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3227}
3228
3229/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
3230///
3231/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3232#[inline]
3233#[target_feature(enable = "avx2")]
3234#[cfg_attr(test, assert_instr(vpsubb))]
3235#[stable(feature = "simd_x86", since = "1.27.0")]
3236pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3237    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3238}
3239
3240/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in
3241/// `a` using saturation.
3242///
3243/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3244#[inline]
3245#[target_feature(enable = "avx2")]
3246#[cfg_attr(test, assert_instr(vpsubsw))]
3247#[stable(feature = "simd_x86", since = "1.27.0")]
3248pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3249    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3250}
3251
3252/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
3253/// `a` using saturation.
3254///
3255/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3256#[inline]
3257#[target_feature(enable = "avx2")]
3258#[cfg_attr(test, assert_instr(vpsubsb))]
3259#[stable(feature = "simd_x86", since = "1.27.0")]
3260pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3261    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3262}
3263
3264/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
3265/// integers in `a` using saturation.
3266///
3267/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3268#[inline]
3269#[target_feature(enable = "avx2")]
3270#[cfg_attr(test, assert_instr(vpsubusw))]
3271#[stable(feature = "simd_x86", since = "1.27.0")]
3272pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3273    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3274}
3275
3276/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
3277/// integers in `a` using saturation.
3278///
3279/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3280#[inline]
3281#[target_feature(enable = "avx2")]
3282#[cfg_attr(test, assert_instr(vpsubusb))]
3283#[stable(feature = "simd_x86", since = "1.27.0")]
3284pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3285    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3286}
3287
/// Unpacks and interleaves 8-bit integers from the high half of each
3289/// 128-bit lane in `a` and `b`.
3290///
3291/// ```rust
3292/// #[cfg(target_arch = "x86")]
3293/// use std::arch::x86::*;
3294/// #[cfg(target_arch = "x86_64")]
3295/// use std::arch::x86_64::*;
3296///
3297/// # fn main() {
3298/// #     if is_x86_feature_detected!("avx2") {
3299/// #         #[target_feature(enable = "avx2")]
3300/// #         unsafe fn worker() {
3301/// let a = _mm256_setr_epi8(
3302///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3303///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3304/// );
3305/// let b = _mm256_setr_epi8(
3306///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3307///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3308///     -30, -31,
3309/// );
3310///
3311/// let c = _mm256_unpackhi_epi8(a, b);
3312///
3313/// let expected = _mm256_setr_epi8(
3314///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3315///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3316///     -31,
3317/// );
3318/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3319///
3320/// #         }
3321/// #         unsafe { worker(); }
3322/// #     }
3323/// # }
3324/// ```
3325///
3326/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3327#[inline]
3328#[target_feature(enable = "avx2")]
3329#[cfg_attr(test, assert_instr(vpunpckhbw))]
3330#[stable(feature = "simd_x86", since = "1.27.0")]
3331pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3332    unsafe {
3333        #[rustfmt::skip]
3334        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3335                8, 40, 9, 41, 10, 42, 11, 43,
3336                12, 44, 13, 45, 14, 46, 15, 47,
3337                24, 56, 25, 57, 26, 58, 27, 59,
3338                28, 60, 29, 61, 30, 62, 31, 63,
3339        ]);
3340        transmute(r)
3341    }
3342}
3343
/// Unpacks and interleaves 8-bit integers from the low half of each
3345/// 128-bit lane of `a` and `b`.
3346///
3347/// ```rust
3348/// #[cfg(target_arch = "x86")]
3349/// use std::arch::x86::*;
3350/// #[cfg(target_arch = "x86_64")]
3351/// use std::arch::x86_64::*;
3352///
3353/// # fn main() {
3354/// #     if is_x86_feature_detected!("avx2") {
3355/// #         #[target_feature(enable = "avx2")]
3356/// #         unsafe fn worker() {
3357/// let a = _mm256_setr_epi8(
3358///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3359///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3360/// );
3361/// let b = _mm256_setr_epi8(
3362///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3363///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3364///     -30, -31,
3365/// );
3366///
3367/// let c = _mm256_unpacklo_epi8(a, b);
3368///
3369/// let expected = _mm256_setr_epi8(
3370///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3371///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3372/// );
3373/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3374///
3375/// #         }
3376/// #         unsafe { worker(); }
3377/// #     }
3378/// # }
3379/// ```
3380///
3381/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3382#[inline]
3383#[target_feature(enable = "avx2")]
3384#[cfg_attr(test, assert_instr(vpunpcklbw))]
3385#[stable(feature = "simd_x86", since = "1.27.0")]
3386pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3387    unsafe {
3388        #[rustfmt::skip]
3389        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3390            0, 32, 1, 33, 2, 34, 3, 35,
3391            4, 36, 5, 37, 6, 38, 7, 39,
3392            16, 48, 17, 49, 18, 50, 19, 51,
3393            20, 52, 21, 53, 22, 54, 23, 55,
3394        ]);
3395        transmute(r)
3396    }
3397}
3398
/// Unpacks and interleaves 16-bit integers from the high half of each
3400/// 128-bit lane of `a` and `b`.
3401///
3402/// ```rust
3403/// #[cfg(target_arch = "x86")]
3404/// use std::arch::x86::*;
3405/// #[cfg(target_arch = "x86_64")]
3406/// use std::arch::x86_64::*;
3407///
3408/// # fn main() {
3409/// #     if is_x86_feature_detected!("avx2") {
3410/// #         #[target_feature(enable = "avx2")]
3411/// #         unsafe fn worker() {
3412/// let a = _mm256_setr_epi16(
3413///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3414/// );
3415/// let b = _mm256_setr_epi16(
3416///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3417/// );
3418///
3419/// let c = _mm256_unpackhi_epi16(a, b);
3420///
3421/// let expected = _mm256_setr_epi16(
3422///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3423/// );
3424/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3425///
3426/// #         }
3427/// #         unsafe { worker(); }
3428/// #     }
3429/// # }
3430/// ```
3431///
3432/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3433#[inline]
3434#[target_feature(enable = "avx2")]
3435#[cfg_attr(test, assert_instr(vpunpckhwd))]
3436#[stable(feature = "simd_x86", since = "1.27.0")]
3437pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3438    unsafe {
3439        let r: i16x16 = simd_shuffle!(
3440            a.as_i16x16(),
3441            b.as_i16x16(),
3442            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3443        );
3444        transmute(r)
3445    }
3446}
3447
/// Unpacks and interleaves 16-bit integers from the low half of each
3449/// 128-bit lane of `a` and `b`.
3450///
3451/// ```rust
3452/// #[cfg(target_arch = "x86")]
3453/// use std::arch::x86::*;
3454/// #[cfg(target_arch = "x86_64")]
3455/// use std::arch::x86_64::*;
3456///
3457/// # fn main() {
3458/// #     if is_x86_feature_detected!("avx2") {
3459/// #         #[target_feature(enable = "avx2")]
3460/// #         unsafe fn worker() {
3461///
3462/// let a = _mm256_setr_epi16(
3463///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3464/// );
3465/// let b = _mm256_setr_epi16(
3466///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3467/// );
3468///
3469/// let c = _mm256_unpacklo_epi16(a, b);
3470///
3471/// let expected = _mm256_setr_epi16(
3472///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3473/// );
3474/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3475///
3476/// #         }
3477/// #         unsafe { worker(); }
3478/// #     }
3479/// # }
3480/// ```
3481///
3482/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3483#[inline]
3484#[target_feature(enable = "avx2")]
3485#[cfg_attr(test, assert_instr(vpunpcklwd))]
3486#[stable(feature = "simd_x86", since = "1.27.0")]
3487pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3488    unsafe {
3489        let r: i16x16 = simd_shuffle!(
3490            a.as_i16x16(),
3491            b.as_i16x16(),
3492            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3493        );
3494        transmute(r)
3495    }
3496}
3497
/// Unpacks and interleaves 32-bit integers from the high half of each
3499/// 128-bit lane of `a` and `b`.
3500///
3501/// ```rust
3502/// #[cfg(target_arch = "x86")]
3503/// use std::arch::x86::*;
3504/// #[cfg(target_arch = "x86_64")]
3505/// use std::arch::x86_64::*;
3506///
3507/// # fn main() {
3508/// #     if is_x86_feature_detected!("avx2") {
3509/// #         #[target_feature(enable = "avx2")]
3510/// #         unsafe fn worker() {
3511/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3512/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3513///
3514/// let c = _mm256_unpackhi_epi32(a, b);
3515///
3516/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3517/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3518///
3519/// #         }
3520/// #         unsafe { worker(); }
3521/// #     }
3522/// # }
3523/// ```
3524///
3525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3526#[inline]
3527#[target_feature(enable = "avx2")]
3528#[cfg_attr(test, assert_instr(vunpckhps))]
3529#[stable(feature = "simd_x86", since = "1.27.0")]
3530pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3531    unsafe {
3532        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3533        transmute(r)
3534    }
3535}
3536
/// Unpacks and interleaves 32-bit integers from the low half of each
3538/// 128-bit lane of `a` and `b`.
3539///
3540/// ```rust
3541/// #[cfg(target_arch = "x86")]
3542/// use std::arch::x86::*;
3543/// #[cfg(target_arch = "x86_64")]
3544/// use std::arch::x86_64::*;
3545///
3546/// # fn main() {
3547/// #     if is_x86_feature_detected!("avx2") {
3548/// #         #[target_feature(enable = "avx2")]
3549/// #         unsafe fn worker() {
3550/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3551/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3552///
3553/// let c = _mm256_unpacklo_epi32(a, b);
3554///
3555/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3556/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3557///
3558/// #         }
3559/// #         unsafe { worker(); }
3560/// #     }
3561/// # }
3562/// ```
3563///
3564/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3565#[inline]
3566#[target_feature(enable = "avx2")]
3567#[cfg_attr(test, assert_instr(vunpcklps))]
3568#[stable(feature = "simd_x86", since = "1.27.0")]
3569pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3570    unsafe {
3571        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3572        transmute(r)
3573    }
3574}
3575
/// Unpacks and interleaves 64-bit integers from the high half of each
3577/// 128-bit lane of `a` and `b`.
3578///
3579/// ```rust
3580/// #[cfg(target_arch = "x86")]
3581/// use std::arch::x86::*;
3582/// #[cfg(target_arch = "x86_64")]
3583/// use std::arch::x86_64::*;
3584///
3585/// # fn main() {
3586/// #     if is_x86_feature_detected!("avx2") {
3587/// #         #[target_feature(enable = "avx2")]
3588/// #         unsafe fn worker() {
3589/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3590/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3591///
3592/// let c = _mm256_unpackhi_epi64(a, b);
3593///
3594/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3595/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3596///
3597/// #         }
3598/// #         unsafe { worker(); }
3599/// #     }
3600/// # }
3601/// ```
3602///
3603/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3604#[inline]
3605#[target_feature(enable = "avx2")]
3606#[cfg_attr(test, assert_instr(vunpckhpd))]
3607#[stable(feature = "simd_x86", since = "1.27.0")]
3608pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3609    unsafe {
3610        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3611        transmute(r)
3612    }
3613}
3614
/// Unpacks and interleaves 64-bit integers from the low half of each
3616/// 128-bit lane of `a` and `b`.
3617///
3618/// ```rust
3619/// #[cfg(target_arch = "x86")]
3620/// use std::arch::x86::*;
3621/// #[cfg(target_arch = "x86_64")]
3622/// use std::arch::x86_64::*;
3623///
3624/// # fn main() {
3625/// #     if is_x86_feature_detected!("avx2") {
3626/// #         #[target_feature(enable = "avx2")]
3627/// #         unsafe fn worker() {
3628/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3629/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3630///
3631/// let c = _mm256_unpacklo_epi64(a, b);
3632///
3633/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3634/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3635///
3636/// #         }
3637/// #         unsafe { worker(); }
3638/// #     }
3639/// # }
3640/// ```
3641///
3642/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3643#[inline]
3644#[target_feature(enable = "avx2")]
3645#[cfg_attr(test, assert_instr(vunpcklpd))]
3646#[stable(feature = "simd_x86", since = "1.27.0")]
3647pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3648    unsafe {
3649        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3650        transmute(r)
3651    }
3652}
3653
3654/// Computes the bitwise XOR of 256 bits (representing integer data)
3655/// in `a` and `b`
3656///
3657/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3658#[inline]
3659#[target_feature(enable = "avx2")]
3660#[cfg_attr(test, assert_instr(vxorps))]
3661#[stable(feature = "simd_x86", since = "1.27.0")]
3662pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3663    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3664}
3665
3666/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3667/// integer containing the zero-extended integer data.
3668///
3669/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3670///
3671/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
#[inline]
#[target_feature(enable = "avx2")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
    // INDEX must fit in 5 bits: one of the 32 byte lanes.
    static_assert_uimm_bits!(INDEX, 5);
    // Extract as `u8` so widening to `i32` zero-extends, not sign-extends.
    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
}
3681
3682/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3683/// integer containing the zero-extended integer data.
3684///
3685/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3686///
3687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
#[inline]
#[target_feature(enable = "avx2")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
    // INDEX must fit in 4 bits: one of the 16 word lanes.
    static_assert_uimm_bits!(INDEX, 4);
    // Extract as `u16` so widening to `i32` zero-extends, not sign-extends.
    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
}
3697
// Declarations of the LLVM intrinsics backing the AVX2 operations above.
// The `link_name` strings are LLVM intrinsic names and must not be altered.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Horizontal saturating add/subtract of adjacent 16-bit pairs.
    #[link_name = "llvm.x86.avx2.phadd.sw"]
    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.phsub.sw"]
    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
    #[link_name = "llvm.x86.avx2.mpsadbw"]
    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
    // Saturating pack (narrowing) operations.
    #[link_name = "llvm.x86.avx2.packsswb"]
    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
    #[link_name = "llvm.x86.avx2.packssdw"]
    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.packuswb"]
    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
    #[link_name = "llvm.x86.avx2.packusdw"]
    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
    #[link_name = "llvm.x86.avx2.psad.bw"]
    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
    // Sign-copy operations.
    #[link_name = "llvm.x86.avx2.psign.b"]
    fn psignb(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.avx2.psign.w"]
    fn psignw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.psign.d"]
    fn psignd(a: i32x8, b: i32x8) -> i32x8;
    // Whole-vector shifts with the count taken from an XMM register.
    #[link_name = "llvm.x86.avx2.psll.w"]
    fn psllw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psll.d"]
    fn pslld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psll.q"]
    fn psllq(a: i64x4, count: i64x2) -> i64x4;
    #[link_name = "llvm.x86.avx2.psra.w"]
    fn psraw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psra.d"]
    fn psrad(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.w"]
    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psrl.d"]
    fn psrld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.q"]
    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
    // Byte/element permutes.
    #[link_name = "llvm.x86.avx2.pshuf.b"]
    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
    #[link_name = "llvm.x86.avx2.permd"]
    fn permd(a: u32x8, b: u32x8) -> u32x8;
    #[link_name = "llvm.x86.avx2.permps"]
    fn permps(a: __m256, b: i32x8) -> __m256;
    // Masked gathers. `src` supplies values for masked-off lanes; `scale`
    // multiplies the offsets.
    #[link_name = "llvm.x86.avx2.gather.d.d"]
    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
    #[link_name = "llvm.x86.avx2.gather.d.q"]
    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.q.d"]
    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.q"]
    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.d.pd"]
    fn pgatherdpd(
        src: __m128d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
    fn vpgatherdpd(
        src: __m256d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.q.pd"]
    fn pgatherqpd(
        src: __m128d,
        slice: *const i8,
        offsets: i64x2,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
    fn vpgatherqpd(
        src: __m256d,
        slice: *const i8,
        offsets: i64x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.d.ps"]
    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
    fn vpgatherdps(
        src: __m256,
        slice: *const i8,
        offsets: i32x8,
        mask: __m256,
        scale: i8,
    ) -> __m256;
    #[link_name = "llvm.x86.avx2.gather.q.ps"]
    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
    fn vpgatherqps(
        src: __m128,
        slice: *const i8,
        offsets: i64x4,
        mask: __m128,
        scale: i8,
    ) -> __m128;
}
3819
3820#[cfg(test)]
3821mod tests {
3822
3823    use stdarch_test::simd_test;
3824
3825    use crate::core_arch::x86::*;
3826
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_abs_epi32() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let r = _mm256_abs_epi32(a);
        // `abs(i32::MIN)` wraps back to `i32::MIN`, spelled here as
        // `i32::MAX.wrapping_add(1)`.
        #[rustfmt::skip]
        let e = _mm256_setr_epi32(
            0, 1, 1, i32::MAX,
            i32::MAX.wrapping_add(1), 100, 100, 32,
        );
        assert_eq_m256i(r, e);
    }
3842
3843    #[simd_test(enable = "avx2")]
3844    unsafe fn test_mm256_abs_epi16() {
3845        #[rustfmt::skip]
3846        let a = _mm256_setr_epi16(
3847            0,  1, -1, 2, -2, 3, -3, 4,
3848            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3849        );
3850        let r = _mm256_abs_epi16(a);
3851        #[rustfmt::skip]
3852        let e = _mm256_setr_epi16(
3853            0, 1, 1, 2, 2, 3, 3, 4,
3854            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3855        );
3856        assert_eq_m256i(r, e);
3857    }
3858
3859    #[simd_test(enable = "avx2")]
3860    unsafe fn test_mm256_abs_epi8() {
3861        #[rustfmt::skip]
3862        let a = _mm256_setr_epi8(
3863            0, 1, -1, 2, -2, 3, -3, 4,
3864            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3865            0, 1, -1, 2, -2, 3, -3, 4,
3866            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3867        );
3868        let r = _mm256_abs_epi8(a);
3869        #[rustfmt::skip]
3870        let e = _mm256_setr_epi8(
3871            0, 1, 1, 2, 2, 3, 3, 4,
3872            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3873            0, 1, 1, 2, 2, 3, 3, 4,
3874            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3875        );
3876        assert_eq_m256i(r, e);
3877    }
3878
    // _mm256_add_epi64: lane-wise wrapping add of four i64 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_add_epi64() {
        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
        let r = _mm256_add_epi64(a, b);
        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
        assert_eq_m256i(r, e);
    }

    // _mm256_add_epi32: lane-wise add of eight i32 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_add_epi32() {
        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_add_epi32(a, b);
        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
        assert_eq_m256i(r, e);
    }

    // _mm256_add_epi16: lane-wise add of sixteen i16 lanes (a + a doubles each lane).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_add_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_add_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30,
        );
        assert_eq_m256i(r, e);
    }

    // _mm256_add_epi8: lane-wise add of thirty-two i8 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_add_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm256_add_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30,
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );
        assert_eq_m256i(r, e);
    }
3944
    // _mm256_adds_epi8: signed saturating add, non-saturating inputs behave
    // like a plain add.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
        );
        let r = _mm256_adds_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78,
            80, 82, 84, 86, 88, 90, 92, 94,
        );
        assert_eq_m256i(r, e);
    }

    // i8::MAX + 1 saturates to i8::MAX (result equals `a`).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi8_saturate_positive() {
        let a = _mm256_set1_epi8(0x7F);
        let b = _mm256_set1_epi8(1);
        let r = _mm256_adds_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    // i8::MIN + (-1) saturates to i8::MIN (result equals `a`).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi8_saturate_negative() {
        let a = _mm256_set1_epi8(-0x80);
        let b = _mm256_set1_epi8(-1);
        let r = _mm256_adds_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    // _mm256_adds_epi16: signed saturating add, non-saturating inputs.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
        );
        let r = _mm256_adds_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );

        assert_eq_m256i(r, e);
    }

    // i16::MAX + 1 saturates to i16::MAX.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi16_saturate_positive() {
        let a = _mm256_set1_epi16(0x7FFF);
        let b = _mm256_set1_epi16(1);
        let r = _mm256_adds_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    // i16::MIN + (-1) saturates to i16::MIN.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi16_saturate_negative() {
        let a = _mm256_set1_epi16(-0x8000);
        let b = _mm256_set1_epi16(-1);
        let r = _mm256_adds_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    // _mm256_adds_epu8: unsigned saturating add, non-saturating inputs.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epu8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
        );
        let r = _mm256_adds_epu8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78,
            80, 82, 84, 86, 88, 90, 92, 94,
        );
        assert_eq_m256i(r, e);
    }

    // u8::MAX (!0) + 1 saturates to u8::MAX.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epu8_saturate() {
        let a = _mm256_set1_epi8(!0);
        let b = _mm256_set1_epi8(1);
        let r = _mm256_adds_epu8(a, b);
        assert_eq_m256i(r, a);
    }

    // _mm256_adds_epu16: unsigned saturating add, non-saturating inputs.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epu16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
        );
        let r = _mm256_adds_epu16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );

        assert_eq_m256i(r, e);
    }

    // u16::MAX (!0) + 1 saturates to u16::MAX.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epu16_saturate() {
        let a = _mm256_set1_epi16(!0);
        let b = _mm256_set1_epi16(1);
        let r = _mm256_adds_epu16(a, b);
        assert_eq_m256i(r, a);
    }
4090
4091    #[simd_test(enable = "avx2")]
4092    unsafe fn test_mm256_and_si256() {
4093        let a = _mm256_set1_epi8(5);
4094        let b = _mm256_set1_epi8(3);
4095        let got = _mm256_and_si256(a, b);
4096        assert_eq_m256i(got, _mm256_set1_epi8(1));
4097    }
4098
4099    #[simd_test(enable = "avx2")]
4100    unsafe fn test_mm256_andnot_si256() {
4101        let a = _mm256_set1_epi8(5);
4102        let b = _mm256_set1_epi8(3);
4103        let got = _mm256_andnot_si256(a, b);
4104        assert_eq_m256i(got, _mm256_set1_epi8(2));
4105    }
4106
    // _mm256_avg_epu8: rounded average per u8 lane; (3 + 9 + 1) >> 1 == 6.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_avg_epu8() {
        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
        let r = _mm256_avg_epu8(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(6));
    }

    // _mm256_avg_epu16: same rounded average per u16 lane.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_avg_epu16() {
        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
        let r = _mm256_avg_epu16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(6));
    }
4120
    // _mm_blend_epi32: IMM bit i selects lane i from `b`; 0x01 and its
    // complement 0x0E with swapped operands must produce the same result.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_blend_epi32() {
        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
        let e = _mm_setr_epi32(9, 3, 3, 3);
        let r = _mm_blend_epi32::<0x01>(a, b);
        assert_eq_m128i(r, e);

        let r = _mm_blend_epi32::<0x0E>(b, a);
        assert_eq_m128i(r, e);
    }

    // _mm256_blend_epi32: 8-bit IMM selects each of the eight i32 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_blend_epi32() {
        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
        let r = _mm256_blend_epi32::<0x01>(a, b);
        assert_eq_m256i(r, e);

        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
        let r = _mm256_blend_epi32::<0x82>(a, b);
        assert_eq_m256i(r, e);

        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
        let r = _mm256_blend_epi32::<0x7C>(a, b);
        assert_eq_m256i(r, e);
    }

    // _mm256_blend_epi16: 8-bit IMM is applied to each 128-bit half, so bit 0
    // selects lanes 0 and 8; 0xFE with swapped operands is equivalent.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_blend_epi16() {
        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
        let r = _mm256_blend_epi16::<0x01>(a, b);
        assert_eq_m256i(r, e);

        let r = _mm256_blend_epi16::<0xFE>(b, a);
        assert_eq_m256i(r, e);
    }

    // _mm256_blendv_epi8: byte-wise variable blend; only the byte whose mask
    // lane is -1 (all bits set) is taken from `b`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_blendv_epi8() {
        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
        let r = _mm256_blendv_epi8(a, b, mask);
        assert_eq_m256i(r, e);
    }
4167
    // _mm_broadcastb_epi8: lowest byte of `a` replicated to all 16 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_broadcastb_epi8() {
        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
        let res = _mm_broadcastb_epi8(a);
        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
    }

    // _mm256_broadcastb_epi8: lowest byte of `a` replicated to all 32 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_broadcastb_epi8() {
        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
        let res = _mm256_broadcastb_epi8(a);
        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
    }

    // _mm_broadcastd_epi32: lowest i32 lane replicated; the other input lanes
    // must be ignored.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_broadcastd_epi32() {
        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
        let res = _mm_broadcastd_epi32(a);
        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
    }

    // _mm256_broadcastd_epi32: lowest i32 lane replicated to all 8 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_broadcastd_epi32() {
        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
        let res = _mm256_broadcastd_epi32(a);
        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
    }

    // _mm_broadcastq_epi64: lowest i64 lane (value wider than 32 bits to catch
    // truncation) replicated to both lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_broadcastq_epi64() {
        let a = _mm_setr_epi64x(0x1ffffffff, 0);
        let res = _mm_broadcastq_epi64(a);
        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
    }

    // _mm256_broadcastq_epi64: lowest i64 lane replicated to all 4 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_broadcastq_epi64() {
        let a = _mm_setr_epi64x(0x1ffffffff, 0);
        let res = _mm256_broadcastq_epi64(a);
        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
    }

    // _mm_broadcastsd_pd: lowest f64 lane replicated.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_broadcastsd_pd() {
        let a = _mm_setr_pd(6.88, 3.44);
        let res = _mm_broadcastsd_pd(a);
        assert_eq_m128d(res, _mm_set1_pd(6.88));
    }

    // _mm256_broadcastsd_pd: lowest f64 lane replicated to all 4 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_broadcastsd_pd() {
        let a = _mm_setr_pd(6.88, 3.44);
        let res = _mm256_broadcastsd_pd(a);
        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
    }

    // _mm_broadcastsi128_si256: whole 128-bit input duplicated into both
    // halves of the 256-bit result.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_broadcastsi128_si256() {
        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
        let res = _mm_broadcastsi128_si256(a);
        let retval = _mm256_setr_epi64x(
            0x0987654321012334,
            0x5678909876543210,
            0x0987654321012334,
            0x5678909876543210,
        );
        assert_eq_m256i(res, retval);
    }

    // _mm256_broadcastsi128_si256: same duplication via the 256-bit-named form.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_broadcastsi128_si256() {
        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
        let res = _mm256_broadcastsi128_si256(a);
        let retval = _mm256_setr_epi64x(
            0x0987654321012334,
            0x5678909876543210,
            0x0987654321012334,
            0x5678909876543210,
        );
        assert_eq_m256i(res, retval);
    }

    // _mm_broadcastss_ps: lowest f32 lane replicated.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_broadcastss_ps() {
        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
        let res = _mm_broadcastss_ps(a);
        assert_eq_m128(res, _mm_set1_ps(6.88));
    }

    // _mm256_broadcastss_ps: lowest f32 lane replicated to all 8 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_broadcastss_ps() {
        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
        let res = _mm256_broadcastss_ps(a);
        assert_eq_m256(res, _mm256_set1_ps(6.88));
    }

    // _mm_broadcastw_epi16: lowest i16 lane replicated.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_broadcastw_epi16() {
        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
        let res = _mm_broadcastw_epi16(a);
        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
    }

    // _mm256_broadcastw_epi16: lowest i16 lane replicated to all 16 lanes.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_broadcastw_epi16() {
        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
        let res = _mm256_broadcastw_epi16(a);
        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
    }
4277
    // _mm256_cmpeq_epi8: only lane 2 matches (both inputs hold 2 there), so
    // the result mask is all-zero except !0 in lane 2.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpeq_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            31, 30, 2, 28, 27, 26, 25, 24,
            23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi8(a, b);
        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
    }

    // _mm256_cmpeq_epi16: single matching lane (index 2).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpeq_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            15, 14, 2, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi16(a, b);
        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
    }

    // _mm256_cmpeq_epi32: single matching lane (index 2).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpeq_epi32() {
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm256_cmpeq_epi32(a, b);
        let e = _mm256_set1_epi32(0);
        let e = _mm256_insert_epi32::<2>(e, !0);
        assert_eq_m256i(r, e);
    }

    // _mm256_cmpeq_epi64: single matching lane (index 2).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpeq_epi64() {
        let a = _mm256_setr_epi64x(0, 1, 2, 3);
        let b = _mm256_setr_epi64x(3, 2, 2, 0);
        let r = _mm256_cmpeq_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
    }

    // _mm256_cmpgt_epi8: signed greater-than; only lane 0 (5 > 0) is set.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpgt_epi8() {
        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
        let b = _mm256_set1_epi8(0);
        let r = _mm256_cmpgt_epi8(a, b);
        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
    }

    // _mm256_cmpgt_epi16: only lane 0 is set.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpgt_epi16() {
        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
        let b = _mm256_set1_epi16(0);
        let r = _mm256_cmpgt_epi16(a, b);
        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
    }

    // _mm256_cmpgt_epi32: only lane 0 is set.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpgt_epi32() {
        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
        let b = _mm256_set1_epi32(0);
        let r = _mm256_cmpgt_epi32(a, b);
        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
    }

    // _mm256_cmpgt_epi64: only lane 0 is set.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cmpgt_epi64() {
        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_cmpgt_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
    }
4363
    // _mm256_cvtepi8_epi16: sign-extend all 16 i8 lanes to i16 (negative
    // values stay negative).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepi8_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
    }

    // _mm256_cvtepi8_epi32: sign-extend the low 8 i8 lanes to i32.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepi8_epi32() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
    }

    // _mm256_cvtepi8_epi64: sign-extend the low 4 i8 lanes to i64.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepi8_epi64() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
    }

    // _mm256_cvtepi16_epi32: sign-extend all 8 i16 lanes to i32.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepi16_epi32() {
        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
    }

    // _mm256_cvtepi16_epi64: sign-extend the low 4 i16 lanes to i64.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepi16_epi64() {
        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
    }

    // _mm256_cvtepi32_epi64: sign-extend all 4 i32 lanes to i64.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepi32_epi64() {
        let a = _mm_setr_epi32(0, 0, -1, 1);
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
    }

    // _mm256_cvtepu16_epi32: zero-extend all 8 u16 lanes to i32.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepu16_epi32() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
    }

    // _mm256_cvtepu16_epi64: zero-extend the low 4 u16 lanes to i64.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepu16_epi64() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
    }

    // _mm256_cvtepu32_epi64: zero-extend all 4 u32 lanes to i64.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepu32_epi64() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
    }

    // _mm256_cvtepu8_epi16: zero-extend all 16 u8 lanes to i16.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepu8_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
    }

    // _mm256_cvtepu8_epi32: zero-extend the low 8 u8 lanes to i32.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepu8_epi32() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
    }

    // _mm256_cvtepu8_epi64: zero-extend the low 4 u8 lanes to i64.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_cvtepu8_epi64() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
    }
4479
4480    #[simd_test(enable = "avx2")]
4481    unsafe fn test_mm256_extracti128_si256() {
4482        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4483        let r = _mm256_extracti128_si256::<1>(a);
4484        let e = _mm_setr_epi64x(3, 4);
4485        assert_eq_m128i(r, e);
4486    }
4487
    // _mm256_hadd_epi16: horizontal pair sums; per 128-bit half the low 4
    // lanes come from `a` (2+2=4) and the high 4 from `b` (4+4=8).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hadd_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hadd_epi16(a, b);
        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
        assert_eq_m256i(r, e);
    }

    // _mm256_hadd_epi32: same interleaving with i32 pair sums.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hadd_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_hadd_epi32(a, b);
        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
        assert_eq_m256i(r, e);
    }

    // _mm256_hadds_epi16: saturating horizontal add; lane 0 is
    // 0x7fff + 1, which saturates to 0x7FFF instead of wrapping.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hadds_epi16() {
        let a = _mm256_set1_epi16(2);
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, 1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hadds_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
            4, 4, 4, 4, 8, 8, 8, 8,
        );
        assert_eq_m256i(r, e);
    }

    // _mm256_hsub_epi16: horizontal pair differences; equal pairs give 0.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hsub_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hsub_epi16(a, b);
        let e = _mm256_set1_epi16(0);
        assert_eq_m256i(r, e);
    }

    // _mm256_hsub_epi32: same with i32 pairs.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hsub_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_hsub_epi32(a, b);
        let e = _mm256_set1_epi32(0);
        assert_eq_m256i(r, e);
    }

    // _mm256_hsubs_epi16: saturating horizontal subtract; lane 0 is
    // 0x7fff - (-1), which saturates to 0x7FFF.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hsubs_epi16() {
        let a = _mm256_set1_epi16(2);
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, -1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hsubs_epi16(a, b);
        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
        assert_eq_m256i(r, e);
    }
4549
    // _mm256_madd_epi16: multiply adjacent i16 pairs and sum into i32 lanes;
    // 2*4 + 2*4 == 16 per lane.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_madd_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_madd_epi16(a, b);
        let e = _mm256_set1_epi32(16);
        assert_eq_m256i(r, e);
    }

    // _mm256_inserti128_si256: replace half 1 of `a` with the 128-bit `b`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_inserti128_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm_setr_epi64x(7, 8);
        let r = _mm256_inserti128_si256::<1>(a, b);
        let e = _mm256_setr_epi64x(1, 2, 7, 8);
        assert_eq_m256i(r, e);
    }

    // _mm256_maddubs_epi16: multiply u8 lanes of `a` by i8 lanes of `b`,
    // summing adjacent products into i16 lanes; 2*4 + 2*4 == 16.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_maddubs_epi16() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_maddubs_epi16(a, b);
        let e = _mm256_set1_epi16(16);
        assert_eq_m256i(r, e);
    }
4576
    // _mm_maskload_epi32: lanes with a set mask (-1) load from memory, the
    // rest are zeroed.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_maskload_epi32() {
        let nums = [1, 2, 3, 4];
        let a = &nums as *const i32;
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        let r = _mm_maskload_epi32(a, mask);
        let e = _mm_setr_epi32(1, 0, 0, 4);
        assert_eq_m128i(r, e);
    }

    // _mm256_maskload_epi32: 8-lane masked load, unset lanes zeroed.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_maskload_epi32() {
        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
        let a = &nums as *const i32;
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        let r = _mm256_maskload_epi32(a, mask);
        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
        assert_eq_m256i(r, e);
    }

    // _mm_maskload_epi64: 2-lane masked load, unset lanes zeroed.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_maskload_epi64() {
        let nums = [1_i64, 2_i64];
        let a = &nums as *const i64;
        let mask = _mm_setr_epi64x(0, -1);
        let r = _mm_maskload_epi64(a, mask);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    // _mm256_maskload_epi64: 4-lane masked load, unset lanes zeroed.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_maskload_epi64() {
        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
        let a = &nums as *const i64;
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        let r = _mm256_maskload_epi64(a, mask);
        let e = _mm256_setr_epi64x(0, 2, 3, 0);
        assert_eq_m256i(r, e);
    }

    // _mm_maskstore_epi32: only lanes with a set mask are written; the -1
    // sentinel values in `arr` must survive in unmasked slots.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_maskstore_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut arr = [-1, -1, -1, -1];
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        let e = [1, -1, -1, 4];
        assert_eq!(arr, e);
    }

    // _mm256_maskstore_epi32: 8-lane masked store; pre-existing values remain
    // in unmasked slots.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_maskstore_epi32() {
        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        let e = [1, -1, -1, 42, -1, 6, 7, -1];
        assert_eq!(arr, e);
    }

    // _mm_maskstore_epi64: 2-lane masked store.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_maskstore_epi64() {
        let a = _mm_setr_epi64x(1_i64, 2_i64);
        let mut arr = [-1_i64, -1_i64];
        let mask = _mm_setr_epi64x(0, -1);
        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        let e = [-1, 2];
        assert_eq!(arr, e);
    }

    // _mm256_maskstore_epi64: 4-lane masked store.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_maskstore_epi64() {
        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        let e = [-1, 2, 3, -1];
        assert_eq!(arr, e);
    }
4656
    // The max tests pick the larger constant (b == 4); the min tests pick the
    // smaller one (a == 2), for each lane width and signedness.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_max_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_max_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_max_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_max_epi32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_max_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_max_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_max_epu16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_max_epu16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_max_epu32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_max_epu32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_max_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_max_epu8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_min_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_min_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_min_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_min_epi32(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_min_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_min_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_min_epu16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_min_epu16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_min_epu32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_min_epu32(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_min_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_min_epu8(a, b);
        assert_eq_m256i(r, a);
    }
4752
4753    #[simd_test(enable = "avx2")]
4754    unsafe fn test_mm256_movemask_epi8() {
4755        let a = _mm256_set1_epi8(-1);
4756        let r = _mm256_movemask_epi8(a);
4757        let e = -1;
4758        assert_eq!(r, e);
4759    }
4760
4761    #[simd_test(enable = "avx2")]
4762    unsafe fn test_mm256_mpsadbw_epu8() {
4763        let a = _mm256_set1_epi8(2);
4764        let b = _mm256_set1_epi8(4);
4765        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4766        let e = _mm256_set1_epi16(8);
4767        assert_eq_m256i(r, e);
4768    }
4769
4770    #[simd_test(enable = "avx2")]
4771    unsafe fn test_mm256_mul_epi32() {
4772        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4773        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4774        let r = _mm256_mul_epi32(a, b);
4775        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4776        assert_eq_m256i(r, e);
4777    }
4778
4779    #[simd_test(enable = "avx2")]
4780    unsafe fn test_mm256_mul_epu32() {
4781        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4782        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4783        let r = _mm256_mul_epu32(a, b);
4784        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4785        assert_eq_m256i(r, e);
4786    }
4787
4788    #[simd_test(enable = "avx2")]
4789    unsafe fn test_mm256_mulhi_epi16() {
4790        let a = _mm256_set1_epi16(6535);
4791        let b = _mm256_set1_epi16(6535);
4792        let r = _mm256_mulhi_epi16(a, b);
4793        let e = _mm256_set1_epi16(651);
4794        assert_eq_m256i(r, e);
4795    }
4796
4797    #[simd_test(enable = "avx2")]
4798    unsafe fn test_mm256_mulhi_epu16() {
4799        let a = _mm256_set1_epi16(6535);
4800        let b = _mm256_set1_epi16(6535);
4801        let r = _mm256_mulhi_epu16(a, b);
4802        let e = _mm256_set1_epi16(651);
4803        assert_eq_m256i(r, e);
4804    }
4805
4806    #[simd_test(enable = "avx2")]
4807    unsafe fn test_mm256_mullo_epi16() {
4808        let a = _mm256_set1_epi16(2);
4809        let b = _mm256_set1_epi16(4);
4810        let r = _mm256_mullo_epi16(a, b);
4811        let e = _mm256_set1_epi16(8);
4812        assert_eq_m256i(r, e);
4813    }
4814
4815    #[simd_test(enable = "avx2")]
4816    unsafe fn test_mm256_mullo_epi32() {
4817        let a = _mm256_set1_epi32(2);
4818        let b = _mm256_set1_epi32(4);
4819        let r = _mm256_mullo_epi32(a, b);
4820        let e = _mm256_set1_epi32(8);
4821        assert_eq_m256i(r, e);
4822    }
4823
4824    #[simd_test(enable = "avx2")]
4825    unsafe fn test_mm256_mulhrs_epi16() {
4826        let a = _mm256_set1_epi16(2);
4827        let b = _mm256_set1_epi16(4);
4828        let r = _mm256_mullo_epi16(a, b);
4829        let e = _mm256_set1_epi16(8);
4830        assert_eq_m256i(r, e);
4831    }
4832
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_or_si256() {
        let a = _mm256_set1_epi8(-1);
        let b = _mm256_set1_epi8(0);
        let r = _mm256_or_si256(a, b);
        assert_eq_m256i(r, a);
    }

    // The pack intrinsics interleave per 128-bit lane: 8 saturated values
    // from `a`, then 8 from `b`, repeated for the high lane.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packs_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packs_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packs_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_packs_epi32(a, b);
        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packus_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packus_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packus_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_packus_epi32(a, b);
        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);

        assert_eq_m256i(r, e);
    }

    // Each 64-bit result accumulates 8 absolute byte differences:
    // 8 * |2 - 4| = 16.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sad_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_sad_epu8(a, b);
        let e = _mm256_set1_epi64x(16);
        assert_eq_m256i(r, e);
    }

    // Imm 0b00_01_01_11 picks high-quadword words (3, 1, 1, 0) of each
    // 128-bit lane; the low quadwords pass through unchanged.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_shufflehi_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 11, 22, 33, 44,
            4, 5, 6, 7, 55, 66, 77, 88,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 44, 22, 22, 11,
            4, 5, 6, 7, 88, 66, 66, 55,
        );
        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }

    // Same selector as above, applied to the low quadword of each lane.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_shufflelo_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            11, 22, 33, 44, 0, 1, 2, 3,
            55, 66, 77, 88, 4, 5, 6, 7,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            44, 22, 22, 11, 0, 1, 2, 3,
            88, 66, 66, 55, 4, 5, 6, 7,
        );
        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }

    // `sign`: a negative lane in `b` negates the corresponding lane of `a`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sign_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(-1);
        let r = _mm256_sign_epi16(a, b);
        let e = _mm256_set1_epi16(-2);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sign_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(-1);
        let r = _mm256_sign_epi32(a, b);
        let e = _mm256_set1_epi32(-2);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sign_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(-1);
        let r = _mm256_sign_epi8(a, b);
        let e = _mm256_set1_epi8(-2);
        assert_eq_m256i(r, e);
    }

    // `sll`/`srl`/`sra` take the shift count from the low 64 bits of `b`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sll_epi16() {
        let a = _mm256_set1_epi16(0xFF);
        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
        let r = _mm256_sll_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sll_epi32() {
        let a = _mm256_set1_epi32(0xFFFF);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
        let r = _mm256_sll_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sll_epi64() {
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
        let r = _mm256_sll_epi64(a, b);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_slli_epi16() {
        assert_eq_m256i(
            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
            _mm256_set1_epi16(0xFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_slli_epi32() {
        assert_eq_m256i(
            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
            _mm256_set1_epi32(0xFFFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_slli_epi64() {
        assert_eq_m256i(
            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
            _mm256_set1_epi64x(0xFFFFFFFF0),
        );
    }

    // Byte-wise shift left by 3 within each 128-bit lane, zero-filling.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_slli_si256() {
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let r = _mm256_slli_si256::<3>(a);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
    }

    // Variable shifts: each lane of `a` is shifted by the matching lane of
    // the count vector.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_sllv_epi32() {
        let a = _mm_set1_epi32(2);
        let b = _mm_set1_epi32(1);
        let r = _mm_sllv_epi32(a, b);
        let e = _mm_set1_epi32(4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sllv_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(1);
        let r = _mm256_sllv_epi32(a, b);
        let e = _mm256_set1_epi32(4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_sllv_epi64() {
        let a = _mm_set1_epi64x(2);
        let b = _mm_set1_epi64x(1);
        let r = _mm_sllv_epi64(a, b);
        let e = _mm_set1_epi64x(4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sllv_epi64() {
        let a = _mm256_set1_epi64x(2);
        let b = _mm256_set1_epi64x(1);
        let r = _mm256_sllv_epi64(a, b);
        let e = _mm256_set1_epi64x(4);
        assert_eq_m256i(r, e);
    }

    // Arithmetic right shift replicates the sign bit: -1 >> 1 stays -1.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sra_epi16() {
        let a = _mm256_set1_epi16(-1);
        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm256_sra_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(-1));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sra_epi32() {
        let a = _mm256_set1_epi32(-1);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
        let r = _mm256_sra_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(-1));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srai_epi16() {
        assert_eq_m256i(
            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
            _mm256_set1_epi16(-1),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srai_epi32() {
        assert_eq_m256i(
            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
            _mm256_set1_epi32(-1),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_srav_epi32() {
        let a = _mm_set1_epi32(4);
        let count = _mm_set1_epi32(1);
        let r = _mm_srav_epi32(a, count);
        let e = _mm_set1_epi32(2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srav_epi32() {
        let a = _mm256_set1_epi32(4);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srav_epi32(a, count);
        let e = _mm256_set1_epi32(2);
        assert_eq_m256i(r, e);
    }

    // Byte-wise shift right by 3 within each 128-bit lane, zero-filling
    // the top bytes of each lane.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srli_si256() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_srli_si256::<3>(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            4, 5, 6, 7, 8, 9, 10, 11,
            12, 13, 14, 15, 16, 0, 0, 0,
            20, 21, 22, 23, 24, 25, 26, 27,
            28, 29, 30, 31, 32, 0, 0, 0,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srl_epi16() {
        let a = _mm256_set1_epi16(0xFF);
        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
        let r = _mm256_srl_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srl_epi32() {
        let a = _mm256_set1_epi32(0xFFFF);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
        let r = _mm256_srl_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srl_epi64() {
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let b = _mm_setr_epi64x(4, 0);
        let r = _mm256_srl_epi64(a, b);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srli_epi16() {
        assert_eq_m256i(
            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
            _mm256_set1_epi16(0xF),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srli_epi32() {
        assert_eq_m256i(
            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
            _mm256_set1_epi32(0xFFF),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srli_epi64() {
        assert_eq_m256i(
            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
            _mm256_set1_epi64x(0xFFFFFFF),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_srlv_epi32() {
        let a = _mm_set1_epi32(2);
        let count = _mm_set1_epi32(1);
        let r = _mm_srlv_epi32(a, count);
        let e = _mm_set1_epi32(1);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srlv_epi32() {
        let a = _mm256_set1_epi32(2);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srlv_epi32(a, count);
        let e = _mm256_set1_epi32(1);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_srlv_epi64() {
        let a = _mm_set1_epi64x(2);
        let count = _mm_set1_epi64x(1);
        let r = _mm_srlv_epi64(a, count);
        let e = _mm_set1_epi64x(1);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srlv_epi64() {
        let a = _mm256_set1_epi64x(2);
        let count = _mm256_set1_epi64x(1);
        let r = _mm256_srlv_epi64(a, count);
        let e = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, e);
    }

    // Non-temporal load must still observe the stored value.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_stream_load_si256() {
        let a = _mm256_set_epi64x(5, 6, 7, 8);
        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
        assert_eq_m256i(a, r);
    }

    // Subtraction family: 4 - 2 = 2, which happens to equal `b`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sub_epi16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_sub_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sub_epi32() {
        let a = _mm256_set1_epi32(4);
        let b = _mm256_set1_epi32(2);
        let r = _mm256_sub_epi32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sub_epi64() {
        let a = _mm256_set1_epi64x(4);
        let b = _mm256_set1_epi64x(2);
        let r = _mm256_sub_epi64(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sub_epi8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_sub_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_subs_epi16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_subs_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_subs_epi8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_subs_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_subs_epu16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_subs_epu16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_subs_epu8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_subs_epu8(a, b);
        assert_eq_m256i(r, b);
    }

    // 5 ^ 3 = 0b101 ^ 0b011 = 0b110 = 6.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_xor_si256() {
        let a = _mm256_set1_epi8(5);
        let b = _mm256_set1_epi8(3);
        let r = _mm256_xor_si256(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(6));
    }

    // `alignr` concatenates b:a per 128-bit lane and shifts right by IMM
    // bytes. Counts > 32 yield zero; 16 selects `a`; 0 selects `b`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_alignr_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            -1, -2, -3, -4, -5, -6, -7, -8,
            -9, -10, -11, -12, -13, -14, -15, -16,
            -17, -18, -19, -20, -21, -22, -23, -24,
            -25, -26, -27, -28, -29, -30, -31, -32,
        );
        let r = _mm256_alignr_epi8::<33>(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(0));

        let r = _mm256_alignr_epi8::<17>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9,
            10, 11, 12, 13, 14, 15, 16, 0,
            18, 19, 20, 21, 22, 23, 24, 25,
            26, 27, 28, 29, 30, 31, 32, 0,
        );
        assert_eq_m256i(r, expected);

        let r = _mm256_alignr_epi8::<4>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -5, -6, -7, -8, -9, -10, -11, -12,
            -13, -14, -15, -16, 1, 2, 3, 4,
            -21, -22, -23, -24, -25, -26, -27, -28,
            -29, -30, -31, -32, 17, 18, 19, 20,
        );
        assert_eq_m256i(r, expected);

        let r = _mm256_alignr_epi8::<15>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -16, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            -32, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        assert_eq_m256i(r, expected);

        let r = _mm256_alignr_epi8::<0>(a, b);
        assert_eq_m256i(r, b);

        let r = _mm256_alignr_epi8::<16>(a, b);
        assert_eq_m256i(r, a);
    }
5340
    // `shuffle_epi8` selects bytes within each 128-bit lane; a selector byte
    // with its high bit set (e.g. 128) zeroes the output byte.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_shuffle_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
        );
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            5, 0, 5, 4, 9, 13, 7, 4,
            13, 6, 6, 11, 5, 2, 9, 1,
            21, 0, 21, 20, 25, 29, 23, 20,
            29, 22, 22, 27, 21, 18, 25, 17,
        );
        let r = _mm256_shuffle_epi8(a, b);
        assert_eq_m256i(r, expected);
    }

    // `permutevar8x32` crosses 128-bit lanes: each output dword comes from
    // the index in the matching lane of `b`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_permutevar8x32_epi32() {
        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
        let r = _mm256_permutevar8x32_epi32(a, b);
        assert_eq_m256i(r, expected);
    }

    // Imm 0b00010011 selects source quadwords (3, 0, 1, 0).
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_permute4x64_epi64() {
        let a = _mm256_setr_epi64x(100, 200, 300, 400);
        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
        assert_eq_m256i(r, expected);
    }

    // Low nibble 0b0011 selects `b`'s high lane; next nibble selects `a`'s
    // high lane.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_permute2x128_si256() {
        let a = _mm256_setr_epi64x(100, 200, 500, 600);
        let b = _mm256_setr_epi64x(300, 400, 700, 800);
        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
        let e = _mm256_setr_epi64x(700, 800, 500, 600);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_permute4x64_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
        let e = _mm256_setr_pd(4., 1., 2., 1.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_permutevar8x32_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
        let r = _mm256_permutevar8x32_ps(a, b);
        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
        assert_eq_m256(r, e);
    }

    // Gather tests: the array holds arr[i] == i, so gathering with
    // element-sized SCALE must return the indices themselves. Masked
    // variants keep the `src` value (256) where the mask's top bit is 0.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm_mask_i32gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm_setr_epi32(-1, -1, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r =
            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_mask_i32gather_epi32::<4>(
            _mm256_set1_epi32(256),
            arr.as_ptr(),
            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_mask_i32gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r =
            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm256_mask_i32gather_ps::<4>(
            _mm256_set1_ps(256.0),
            arr.as_ptr(),
            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
        );
        assert_eq_m256(
            r,
            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_mask_i32gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi32(16, 16, 16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_mask_i32gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_mask_i32gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(16, 16, 16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_mask_i32gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    // i64-indexed gathers of 32-bit data produce fewer results than input
    // lanes; the unused upper output lanes are zeroed.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_epi32(-1, 0, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_epi32(-1, -1, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
    }
5633
5634    #[simd_test(enable = "avx2")]
5635    unsafe fn test_mm_mask_i64gather_ps() {
5636        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5637        // A multiplier of 4 is word-addressing for f32s
5638        let r = _mm_mask_i64gather_ps::<4>(
5639            _mm_set1_ps(256.0),
5640            arr.as_ptr(),
5641            _mm_setr_epi64x(0, 16),
5642            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5643        );
5644        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5645    }
5646
5647    #[simd_test(enable = "avx2")]
5648    unsafe fn test_mm256_i64gather_ps() {
5649        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5650        // A multiplier of 4 is word-addressing for f32s
5651        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5652        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5653    }
5654
5655    #[simd_test(enable = "avx2")]
5656    unsafe fn test_mm256_mask_i64gather_ps() {
5657        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5658        // A multiplier of 4 is word-addressing for f32s
5659        let r = _mm256_mask_i64gather_ps::<4>(
5660            _mm_set1_ps(256.0),
5661            arr.as_ptr(),
5662            _mm256_setr_epi64x(0, 16, 64, 96),
5663            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5664        );
5665        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5666    }
5667
5668    #[simd_test(enable = "avx2")]
5669    unsafe fn test_mm_i64gather_epi64() {
5670        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5671        // A multiplier of 8 is word-addressing for i64s
5672        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5673        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5674    }
5675
5676    #[simd_test(enable = "avx2")]
5677    unsafe fn test_mm_mask_i64gather_epi64() {
5678        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5679        // A multiplier of 8 is word-addressing for i64s
5680        let r = _mm_mask_i64gather_epi64::<8>(
5681            _mm_set1_epi64x(256),
5682            arr.as_ptr(),
5683            _mm_setr_epi64x(16, 16),
5684            _mm_setr_epi64x(-1, 0),
5685        );
5686        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5687    }
5688
5689    #[simd_test(enable = "avx2")]
5690    unsafe fn test_mm256_i64gather_epi64() {
5691        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5692        // A multiplier of 8 is word-addressing for i64s
5693        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5694        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5695    }
5696
5697    #[simd_test(enable = "avx2")]
5698    unsafe fn test_mm256_mask_i64gather_epi64() {
5699        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5700        // A multiplier of 8 is word-addressing for i64s
5701        let r = _mm256_mask_i64gather_epi64::<8>(
5702            _mm256_set1_epi64x(256),
5703            arr.as_ptr(),
5704            _mm256_setr_epi64x(0, 16, 64, 96),
5705            _mm256_setr_epi64x(-1, -1, -1, 0),
5706        );
5707        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5708    }
5709
5710    #[simd_test(enable = "avx2")]
5711    unsafe fn test_mm_i64gather_pd() {
5712        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5713        // A multiplier of 8 is word-addressing for f64s
5714        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5715        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5716    }
5717
5718    #[simd_test(enable = "avx2")]
5719    unsafe fn test_mm_mask_i64gather_pd() {
5720        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5721        // A multiplier of 8 is word-addressing for f64s
5722        let r = _mm_mask_i64gather_pd::<8>(
5723            _mm_set1_pd(256.0),
5724            arr.as_ptr(),
5725            _mm_setr_epi64x(16, 16),
5726            _mm_setr_pd(-1.0, 0.0),
5727        );
5728        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5729    }
5730
5731    #[simd_test(enable = "avx2")]
5732    unsafe fn test_mm256_i64gather_pd() {
5733        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5734        // A multiplier of 8 is word-addressing for f64s
5735        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5736        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5737    }
5738
5739    #[simd_test(enable = "avx2")]
5740    unsafe fn test_mm256_mask_i64gather_pd() {
5741        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5742        // A multiplier of 8 is word-addressing for f64s
5743        let r = _mm256_mask_i64gather_pd::<8>(
5744            _mm256_set1_pd(256.0),
5745            arr.as_ptr(),
5746            _mm256_setr_epi64x(0, 16, 64, 96),
5747            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5748        );
5749        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5750    }
5751
5752    #[simd_test(enable = "avx2")]
5753    unsafe fn test_mm256_extract_epi8() {
5754        #[rustfmt::skip]
5755        let a = _mm256_setr_epi8(
5756            -1, 1, 2, 3, 4, 5, 6, 7,
5757            8, 9, 10, 11, 12, 13, 14, 15,
5758            16, 17, 18, 19, 20, 21, 22, 23,
5759            24, 25, 26, 27, 28, 29, 30, 31
5760        );
5761        let r1 = _mm256_extract_epi8::<0>(a);
5762        let r2 = _mm256_extract_epi8::<3>(a);
5763        assert_eq!(r1, 0xFF);
5764        assert_eq!(r2, 3);
5765    }
5766
5767    #[simd_test(enable = "avx2")]
5768    unsafe fn test_mm256_extract_epi16() {
5769        #[rustfmt::skip]
5770        let a = _mm256_setr_epi16(
5771            -1, 1, 2, 3, 4, 5, 6, 7,
5772            8, 9, 10, 11, 12, 13, 14, 15,
5773        );
5774        let r1 = _mm256_extract_epi16::<0>(a);
5775        let r2 = _mm256_extract_epi16::<3>(a);
5776        assert_eq!(r1, 0xFFFF);
5777        assert_eq!(r2, 3);
5778    }
5779}