// core/stdarch/crates/core_arch/src/x86/avx.rs
//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture
//!   Programmer's Manual, Volume 3: General-Purpose and System
//!   Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE-754 addition; lowers to a single `vaddpd`.
    unsafe { simd_add(a, b) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE-754 addition; lowers to a single `vaddps`.
    unsafe { simd_add(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    // There is no portable float AND, so reinterpret the lanes as integer
    // bits, AND those, and reinterpret the result back as doubles.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    // AND on the raw bit patterns; the float values are never interpreted.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    // OR on the raw bit patterns; the float values are never interpreted.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    // OR on the raw bit patterns; the float values are never interpreted.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    // In the shuffle index space, 0..=3 address `a` and 4..=7 address `b`.
    // Per VSHUFPD, each of the four mask bits picks the low or high element
    // of a 128-bit half: dst[0] from a's low half, dst[1] from b's low half,
    // dst[2] from a's high half, dst[3] from b's high half.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    // In the shuffle index space, 0..=7 address `a` and 8..=15 address `b`.
    // Per VSHUFPS, the same four 2-bit selectors are applied to both 128-bit
    // halves: the low two results of each half come from `a`, the high two
    // from `b` (hence the `+ 8`); `+ 4` shifts the selection into the upper
    // 128-bit lane.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    // `(!a) & b` on the raw bits; NOT is expressed as XOR with all-ones so
    // the whole expression stays within the portable simd intrinsics.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    // `(!a) & b` on the raw bits; NOT is expressed as XOR with all-ones so
    // the whole expression stays within the portable simd intrinsics.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to the hardware intrinsic (not `simd_fmax`) so that the
    // exact VMAXPD semantics for NaN and signed-zero operands are preserved.
    unsafe { vmaxpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to the hardware intrinsic (not `simd_fmax`) so that the
    // exact VMAXPS semantics for NaN and signed-zero operands are preserved.
    unsafe { vmaxps(a, b) }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to the hardware intrinsic (not `simd_fmin`) so that the
    // exact VMINPD semantics for NaN and signed-zero operands are preserved.
    unsafe { vminpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to the hardware intrinsic (not `simd_fmin`) so that the
    // exact VMINPS semantics for NaN and signed-zero operands are preserved.
    unsafe { vminps(a, b) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE-754 multiplication; lowers to a single `vmulpd`.
    unsafe { simd_mul(a, b) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE-754 multiplication; lowers to a single `vmulps`.
    unsafe { simd_mul(a, b) }
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Compute both a+b and a-b, then interleave: even lanes (shuffle indices
    // 4, 6 address `sub`) take the difference, odd lanes (indices 1, 3
    // address `add`) take the sum — matching VADDSUBPD.
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    // Compute both a+b and a-b, then interleave: even lanes (shuffle indices
    // >= 8 address `sub`) take the difference, odd lanes take the sum —
    // matching VADDSUBPS.
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE-754 subtraction; lowers to a single `vsubpd`.
    unsafe { simd_sub(a, b) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE-754 subtraction; lowers to a single `vsubps`.
    unsafe { simd_sub(a, b) }
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE-754 division; lowers to a single `vdivps`.
    unsafe { simd_div(a, b) }
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE-754 division; lowers to a single `vdivpd`.
    unsafe { simd_div(a, b) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    // VROUNDPD takes a 4-bit immediate; reject anything wider at compile time.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    // Lowers to `vroundpd` with the round-up immediate.
    unsafe { simd_ceil(a) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_pd(a: __m256d) -> __m256d {
    // Lowers to `vroundpd` with the round-down immediate.
    unsafe { simd_floor(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    // VROUNDPS takes a 4-bit immediate; reject anything wider at compile time.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_ps(a: __m256) -> __m256 {
    // Lowers to `vroundps` with the round-up immediate.
    unsafe { simd_ceil(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_ps(a: __m256) -> __m256 {
    // Lowers to `vroundps` with the round-down immediate.
    unsafe { simd_floor(a) }
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    // Lane-wise square root; lowers to a single `vsqrtps`.
    unsafe { simd_fsqrt(a) }
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    // Lane-wise square root; lowers to a single `vsqrtpd`.
    unsafe { simd_fsqrt(a) }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    // In the shuffle index space, 0..=3 address `a` and 4..=7 address `b`.
    // For each lane i, `bit * 4 + i` yields index i (from `a`) when the
    // control bit is clear, and 4 + i (from `b`) when it is set.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // In the shuffle index space, 0..=7 address `a` and 8..=15 address `b`.
    // For each lane i, `bit * 8 + i` yields index i (from `a`) when the
    // control bit is clear, and 8 + i (from `b`) when it is set.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    // VBLENDVPD selects on the sign bit of each mask lane: a signed `< 0`
    // compare turns that top bit into an all-ones/all-zeros lane mask, then
    // `simd_select` takes `b` where the mask is set and `a` elsewhere.
    unsafe {
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    // VBLENDVPS selects on the sign bit of each mask lane: a signed `< 0`
    // compare turns that top bit into an all-ones/all-zeros lane mask, then
    // `simd_select` takes `b` where the mask is set and `a` elsewhere.
    unsafe {
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sum the four products, and conditionally return the sum
///  using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // No portable equivalent for the masked dot product; delegate to the
    // VDPPS hardware intrinsic with the immediate narrowed to its i8 form.
    unsafe { vdpps(a, b, IMM8 as i8) }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    // Gather the even-indexed and odd-indexed members of each adjacent pair
    // (indices 4..=7 address `b`), then one vertical add produces every
    // horizontal pair sum at once.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_add(even, odd)
    }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    // Gather the even-indexed and odd-indexed members of each adjacent pair
    // (indices 8..=15 address `b`), then one vertical add produces every
    // horizontal pair sum at once.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd)
    }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Gather the even-indexed and odd-indexed members of each adjacent pair
    // (indices 4..=7 address `b`), then one vertical subtract produces every
    // horizontal pair difference (even - odd) at once.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_sub(even, odd)
    }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; while differences of elements from `b` are
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    // Gather the even-indexed and odd-indexed members of each adjacent pair
    // (indices 8..=15 address `b`), then one vertical subtract produces every
    // horizontal pair difference (even - odd) at once.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd)
    }
}

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    // XOR on the raw bit patterns; the float values are never interpreted.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    // XOR on the raw bit patterns; the float values are never interpreted.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

// Comparison predicate values for the `imm8` operand of the AVX `_mm_cmp_*` /
// `_mm256_cmp_*` intrinsics (VCMPPS/VCMPPD). "Ordered" predicates are false
// when either operand is NaN; "unordered" predicates are true. "Signaling"
// variants raise an invalid-operation exception on quiet NaN inputs.
/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
795/// Not-greater-than (unordered, non-signaling)
796#[stable(feature = "simd_x86", since = "1.27.0")]
797pub const _CMP_NGT_UQ: i32 = 0x1a;
798/// False (ordered, signaling)
799#[stable(feature = "simd_x86", since = "1.27.0")]
800pub const _CMP_FALSE_OS: i32 = 0x1b;
801/// Not-equal (ordered, signaling)
802#[stable(feature = "simd_x86", since = "1.27.0")]
803pub const _CMP_NEQ_OS: i32 = 0x1c;
804/// Greater-than-or-equal (ordered, non-signaling)
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub const _CMP_GE_OQ: i32 = 0x1d;
807/// Greater-than (ordered, non-signaling)
808#[stable(feature = "simd_x86", since = "1.27.0")]
809pub const _CMP_GT_OQ: i32 = 0x1e;
810/// True (unordered, signaling)
811#[stable(feature = "simd_x86", since = "1.27.0")]
812pub const _CMP_TRUE_US: i32 = 0x1f;
813
/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is a 5-bit immediate selecting one of the `_CMP_*`
/// predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    // The inline-const block guarantees the immediate is a compile-time
    // constant when handed to the LLVM intrinsic.
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}
828
829/// Compares packed double-precision (64-bit) floating-point
830/// elements in `a` and `b` based on the comparison operand
831/// specified by `IMM5`.
832///
833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
834#[inline]
835#[target_feature(enable = "avx")]
836#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
837#[rustc_legacy_const_generics(2)]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
840    static_assert_uimm_bits!(IMM5, 5);
841    unsafe { vcmppd256(a, b, IMM5 as u8) }
842}
843
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is a 5-bit immediate selecting one of the `_CMP_*`
/// predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    // The inline-const block guarantees the immediate is a compile-time
    // constant when handed to the LLVM intrinsic.
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}
858
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is a 5-bit immediate selecting one of the `_CMP_*`
/// predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    // The inline-const block guarantees the immediate is a compile-time
    // constant when handed to the LLVM intrinsic.
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}
873
874/// Compares the lower double-precision (64-bit) floating-point element in
875/// `a` and `b` based on the comparison operand specified by `IMM5`,
876/// store the result in the lower element of returned vector,
877/// and copies the upper element from `a` to the upper element of returned
878/// vector.
879///
880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
881#[inline]
882#[target_feature(enable = "avx")]
883#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
884#[rustc_legacy_const_generics(2)]
885#[stable(feature = "simd_x86", since = "1.27.0")]
886pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
887    static_assert_uimm_bits!(IMM5, 5);
888    unsafe { vcmpsd(a, b, IMM5 as i8) }
889}
890
891/// Compares the lower single-precision (32-bit) floating-point element in
892/// `a` and `b` based on the comparison operand specified by `IMM5`,
893/// store the result in the lower element of returned vector,
894/// and copies the upper 3 packed elements from `a` to the upper elements of
895/// returned vector.
896///
897/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
898#[inline]
899#[target_feature(enable = "avx")]
900#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
901#[rustc_legacy_const_generics(2)]
902#[stable(feature = "simd_x86", since = "1.27.0")]
903pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
904    static_assert_uimm_bits!(IMM5, 5);
905    unsafe { vcmpss(a, b, IMM5 as i8) }
906}
907
/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    // Lane-wise widening cast: 4 x i32 -> 4 x f64 (every i32 value is
    // exactly representable as an f64, so this conversion is lossless).
    unsafe { simd_cast(a.as_i32x4()) }
}
920
/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    // Lane-wise numeric cast: 8 x i32 -> 8 x f32.
    unsafe { simd_cast(a.as_i32x8()) }
}
933
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    // Lane-wise narrowing cast: 4 x f64 -> 4 x f32; the result occupies
    // only a 128-bit vector.
    unsafe { simd_cast(a) }
}
946
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    // Delegates to the vcvtps2dq intrinsic (rounding conversion); see
    // Intel's documentation for the exact rounding behavior.
    unsafe { transmute(vcvtps2dq(a)) }
}
958
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    // Lane-wise widening cast: 4 x f32 -> 4 x f64 (lossless, since every
    // f32 is exactly representable as an f64).
    unsafe { simd_cast(a) }
}
971
/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    // Extract lane 0 (the lowest 64 bits) of the vector.
    unsafe { simd_extract!(a, 0) }
}
983
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    // Truncating ("t") variant: delegates to vcvttpd2dq, producing a
    // 128-bit result of 4 x i32.
    unsafe { transmute(vcvttpd2dq(a)) }
}
995
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    // Rounding variant (contrast `_mm256_cvttpd_epi32`, which truncates);
    // delegates to vcvtpd2dq, producing a 128-bit result of 4 x i32.
    unsafe { transmute(vcvtpd2dq(a)) }
}
1007
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    // Truncating ("t") variant: delegates to vcvttps2dq.
    unsafe { transmute(vcvttps2dq(a)) }
}
1019
/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// `IMM1` is a 1-bit immediate: 0 selects the low 128 bits, 1 the high.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Both index sets select lanes 0..8, i.e. only from `a`; the
        // undefined second operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
1040
/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// `IMM1` is a 1-bit immediate: 0 selects the low 128 bits, 1 the high.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    // Both index sets select lanes 0..4, i.e. only from `a`; the undefined
    // second operand is never read.
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}
1055
/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// `IMM1` is a 1-bit immediate: 0 selects the low 128 bits, 1 the high.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Shuffle on i64x4 lanes; both index sets pick only from `a`
        // (lanes 0..4), so the zero second operand is never selected.
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}
1072
/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// `INDEX` is a 3-bit immediate (0..=7) choosing one of the eight lanes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}
1086
/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    // Extract lane 0 (the lowest 32 bits) of the vector, viewed as i32x8.
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}
1097
/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    // Thin wrapper over the vzeroall intrinsic; no Rust-visible result.
    unsafe { vzeroall() }
}
1108
/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    // Thin wrapper over the vzeroupper intrinsic; no Rust-visible result.
    unsafe { vzeroupper() }
}
1120
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    // Variable-control variant of vpermilps: the per-lane selectors come
    // from vector `b` rather than an immediate.
    unsafe { vpermilps256(a, b.as_i32x8()) }
}
1132
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    // 128-bit variant: per-lane selectors come from vector `b`.
    unsafe { vpermilps(a, b.as_i32x4()) }
}
1144
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // `IMM8` holds four 2-bit selectors. The same selectors are
        // applied independently to the low 128-bit half (lanes 0..4)
        // and, offset by +4, to the high half (lanes 4..8). All indices
        // stay within `a`, so the undefined operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
1174
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // `IMM8` holds four 2-bit selectors, one per output lane. All
        // indices stay within `a`; the undefined operand is never read.
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
1200
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    // NOTE(review): summary previously said "within 256-bit lanes", but
    // vpermilpd selects within each 128-bit lane (matching the sibling
    // `_mm256_permute_pd` docs) — confirm against Intel's documentation.
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}
1212
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    // 128-bit variant: per-lane selectors come from vector `b`.
    unsafe { vpermilpd(a, b.as_i64x2()) }
}
1224
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // One selector bit per output lane: bits 0-1 pick within the low
        // 128-bit half (lanes 0..2), bits 2-3 within the high half
        // (offset +2). All indices stay within `a`.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}
1250
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        // One selector bit per output lane; both indices stay within `a`.
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}
1271
/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // Bit-cast to integer vectors and reuse the si256 implementation;
    // the permutation only moves whole 128-bit halves, so the element
    // type is irrelevant.
    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castps_si256(a),
        _mm256_castps_si256(b),
    ))
}
1289
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    // Bit-cast to integer vectors and reuse the si256 implementation;
    // the permutation only moves whole 128-bit halves, so the element
    // type is irrelevant.
    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castpd_si256(a),
        _mm256_castpd_si256(b),
    ))
}
1307
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Maps (imm8, output i64 position 0..4) to a source i64 index for the
    // first shuffle. The low nibble of imm8 controls the low output half
    // (positions 0-1), the high nibble the high half. Bits 0-1 of each
    // nibble select one of the four 128-bit halves of a:b; each half
    // spans two i64 lanes, hence the `2 * part + (pos & 1)` arithmetic.
    const fn idx(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        2 * (part as u32 & 0b11) + (pos & 1)
    }
    // Second-stage index: if the nibble's zeroing bit (bit 3) is set, the
    // output lane is replaced with zero by selecting index 4 (the first
    // lane of the i64x4::ZERO operand); otherwise the lane is kept.
    const fn idx0(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        if part & 0b1000 != 0 { 4 } else { pos }
    }
    unsafe {
        // Stage 1: gather the selected 128-bit halves from a:b.
        let r = simd_shuffle!(
            a.as_i64x4(),
            b.as_i64x4(),
            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
        );
        // Stage 2: apply the per-half zeroing bits.
        let r: i64x4 = simd_shuffle!(
            r,
            i64x4::ZERO,
            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
        );
        r.as_m256i()
    }
}
1350
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    // Takes `&f32` to mirror Intel's pointer-based signature; simply
    // splats the dereferenced value into all 8 lanes.
    _mm256_set1_ps(*f)
}
1364
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcast_ss(f: &f32) -> __m128 {
    // Takes `&f32` to mirror Intel's pointer-based signature; simply
    // splats the dereferenced value into all 4 lanes.
    _mm_set1_ps(*f)
}
1378
/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    // Takes `&f64` to mirror Intel's pointer-based signature; simply
    // splats the dereferenced value into all 4 lanes.
    _mm256_set1_pd(*f)
}
1392
/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    // Duplicate the 4 lanes of `*a` into both 128-bit halves; indices
    // only reference the first operand, so the zero vector is never read.
    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
}
1405
/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    // Duplicate the 2 lanes of `*a` into both 128-bit halves; indices
    // only reference the first operand, so the zero vector is never read.
    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
}
1418
/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// `IMM1` is a 1-bit immediate: 0 replaces the low 128 bits, 1 the high.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Indices 0..8 select from `a`; 8..12 select the low half of the
        // widened `b`. The chosen table overwrites either the low or high
        // four lanes with `b`.
        simd_shuffle!(
            a,
            _mm256_castps128_ps256(b),
            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
        )
    }
}
1440
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// `IMM1` is a 1-bit immediate: 0 replaces the low 128 bits, 1 the high.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Indices 0..4 select from `a`; 4..6 select the low half of the
        // widened `b`. The chosen table overwrites either the low or high
        // two lanes with `b`.
        simd_shuffle!(
            a,
            _mm256_castpd128_pd256(b),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        )
    }
}
1462
/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `imm8`.
///
/// `IMM1` is a 1-bit immediate: 0 replaces the low 128 bits, 1 the high.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Shuffle on i64x4 lanes: indices 0..4 select from `a`, 4..6 the
        // low half of the widened `b`, overwriting either the low or high
        // 128 bits with `b`.
        let dst: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            _mm256_castsi128_si256(b).as_i64x4(),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        );
        transmute(dst)
    }
}
1484
/// Copies `a` to result, and inserts the 8-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
    // 5 bits of index: a 256-bit vector holds 32 8-bit elements (0..=31).
    static_assert_uimm_bits!(INDEX, 5);
    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
}
1499
/// Copies `a` to result, and inserts the 16-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
    // 4 bits of index: a 256-bit vector holds 16 16-bit elements (0..=15).
    static_assert_uimm_bits!(INDEX, 4);
    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
}
1514
/// Copies `a` to result, and inserts the 32-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
    // 3 bits of index: a 256-bit vector holds 8 32-bit elements (0..=7).
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
}
1529
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
#[inline]
#[target_feature(enable = "avx")]
// `vmovap` (no suffix) matches both the `vmovaps` and `vmovapd` encodings
// the backend may choose for an aligned 256-bit FP load.
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
    // Plain dereference of a `*const __m256d`: requires the 32-byte
    // alignment documented above (caller's obligation).
    *(mem_addr as *const __m256d)
}
1548
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
#[inline]
#[target_feature(enable = "avx")]
// `vmovap` (no suffix) matches both the `vmovaps` and `vmovapd` encodings
// the backend may choose for an aligned 256-bit FP store.
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
    // Plain aligned store through a `*mut __m256d`; 32-byte alignment is
    // the caller's obligation.
    *(mem_addr as *mut __m256d) = a;
}
1567
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
    // Plain dereference of a `*const __m256`: requires the 32-byte
    // alignment documented above (caller's obligation).
    *(mem_addr as *const __m256)
}
1586
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
    // Plain aligned store through a `*mut __m256`; 32-byte alignment is
    // the caller's obligation.
    *(mem_addr as *mut __m256) = a;
}
1605
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
#[inline]
#[target_feature(enable = "avx")]
// `vmovup` (no suffix) matches both `vmovups` and `vmovupd` encodings.
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
    // Express the unaligned load as a raw byte copy into a local vector:
    // `copy_nonoverlapping` on `u8` carries no alignment requirement.
    let mut dst = _mm256_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256d>(),
    );
    dst
}
1625
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
#[inline]
#[target_feature(enable = "avx")]
// `vmovup` (no suffix) matches both `vmovups` and `vmovupd` encodings.
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
    // `write_unaligned` is defined for any pointer alignment.
    mem_addr.cast::<__m256d>().write_unaligned(a);
}
1639
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
    // Express the unaligned load as a raw byte copy into a local vector:
    // `copy_nonoverlapping` on `u8` carries no alignment requirement.
    let mut dst = _mm256_undefined_ps();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256>(),
    );
    dst
}
1659
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
    // `write_unaligned` is defined for any pointer alignment.
    mem_addr.cast::<__m256>().write_unaligned(a);
}
1673
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
    // Plain aligned dereference; the pointer is already typed as `__m256i`,
    // so no cast is needed. 32-byte alignment is the caller's obligation.
    *mem_addr
}
1690
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
    // Plain aligned store; 32-byte alignment is the caller's obligation.
    *mem_addr = a;
}
1707
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
    // Express the unaligned load as a raw byte copy into a local vector:
    // `copy_nonoverlapping` on `u8` carries no alignment requirement.
    let mut dst = _mm256_undefined_si256();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256i>(),
    );
    dst
}
1726
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
    // `write_unaligned` is defined for any pointer alignment.
    mem_addr.write_unaligned(a);
}
1739
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
    // Arithmetic right shift by 63 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (load) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Masked-out lanes take the fallback value: zero.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
}
1754
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
    // Arithmetic right shift by 63 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (store) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1768
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
    // Arithmetic right shift by 63 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (load) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    // Masked-out lanes take the fallback value: zero.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
}
1783
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
    // Arithmetic right shift by 63 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (store) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1797
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
    // Arithmetic right shift by 31 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (load) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    // Masked-out lanes take the fallback value: zero.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
}
1812
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
    // Arithmetic right shift by 31 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (store) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1826
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
    // Arithmetic right shift by 31 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (load) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    // Masked-out lanes take the fallback value: zero.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
}
1841
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
    // Arithmetic right shift by 31 broadcasts each element's sign bit over
    // the whole element, yielding all-ones (store) or all-zeros (skip) lanes.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1855
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movehdup_ps(a: __m256) -> __m256 {
    // Each odd-indexed element is written to both itself and the preceding
    // even slot: result = [a1, a1, a3, a3, a5, a5, a7, a7].
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
}
1868
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_moveldup_ps(a: __m256) -> __m256 {
    // Each even-indexed element is written to both itself and the following
    // odd slot: result = [a0, a0, a2, a2, a4, a4, a6, a6].
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
}
1881
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movedup_pd(a: __m256d) -> __m256d {
    // result = [a0, a0, a2, a2]: the even element of each 128-bit lane is
    // duplicated into the odd slot.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
1894
/// Loads 256-bits of integer data from unaligned memory into result.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the
/// data crosses a cache line boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vlddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
    // Delegates to the `vlddqu` compiler intrinsic, which takes a byte
    // pointer; the raw vector result is transmuted back to `__m256i`.
    transmute(vlddqu(mem_addr as *const i8))
}
1907
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // NOTE(review): `vps!` appears to build the instruction string around the
    // pointer operand `{p}` — confirm its expansion in this crate's macro defs.
    crate::arch::asm!(
        vps!("vmovntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        // No stack use; `vmovntdq` is a store and the asm does not touch flags.
        options(nostack, preserves_flags),
    );
}
1935
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
/// to a 32-byte aligned memory location. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // NOTE(review): `vps!` appears to build the instruction string around the
    // pointer operand `{p}` — confirm its expansion in this crate's macro defs.
    crate::arch::asm!(
        vps!("vmovntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        // No stack use; `vmovntpd` is a store and the asm does not touch flags.
        options(nostack, preserves_flags),
    );
}
1964
/// Moves single-precision floating point values from a 256-bit vector
/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // NOTE(review): `vps!` appears to build the instruction string around the
    // pointer operand `{p}` — confirm its expansion in this crate's macro defs.
    crate::arch::asm!(
        vps!("vmovntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        // No stack use; `vmovntps` is a store and the asm does not touch flags.
        options(nostack, preserves_flags),
    );
}
1994
/// Computes the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`, and returns the results. The maximum
/// relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
    // Thin wrapper over the `vrcpps` compiler intrinsic; not `const` because
    // the result is a hardware approximation.
    unsafe { vrcpps(a) }
}
2007
/// Computes the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`, and returns the results.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
    // Thin wrapper over the `vrsqrtps` compiler intrinsic; not `const` because
    // the result is a hardware approximation.
    unsafe { vrsqrtps(a) }
}
2020
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices 0..=3 pick from `a`, 4..=7 from `b`:
    // result = [a1, b1, a3, b3] — the high element of each 128-bit lane.
    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
}
2033
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
    // Indices 0..=7 pick from `a`, 8..=15 from `b`:
    // result = [a2, b2, a3, b3, a6, b6, a7, b7] — the upper two elements of
    // each 128-bit lane, interleaved.
    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
}
2046
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices 0..=3 pick from `a`, 4..=7 from `b`:
    // result = [a0, b0, a2, b2] — the low element of each 128-bit lane.
    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
}
2059
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
    // Indices 0..=7 pick from `a`, 8..=15 from `b`:
    // result = [a0, b0, a1, b1, a4, b4, a5, b5] — the lower two elements of
    // each 128-bit lane, interleaved.
    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
}
2072
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // ZF is set iff `a & b` is all-zero; OR-reducing the AND across all
        // four 64-bit elements and comparing with 0 computes exactly that.
        let r = simd_and(a.as_i64x4(), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2090
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // CF is set iff `!a & b` is all-zero. XOR with all-ones computes the
        // bitwise NOT of `a`; the result is then AND-ed with `b` and
        // OR-reduced across elements to test for zero.
        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2108
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
    // Delegates to the `ptestnzc256` compiler intrinsic, which evaluates
    // both flag conditions in one `vptest`.
    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
}
2123
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestzpd256` compiler intrinsic (sign-bit test only).
    unsafe { vtestzpd256(a, b) }
}
2140
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the raw `vtestcpd256` intrinsic, which returns the CF
    // condition of `vtestpd`.
    unsafe { vtestcpd256(a, b) }
}
2157
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the raw `vtestnzcpd256` intrinsic, which yields the
    // combined "not-zero-and-not-carry" condition of `vtestpd`.
    unsafe { vtestnzcpd256(a, b) }
}
2175
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // Reinterpreting `a & b` as i64 lanes, `simd_lt(x, 0)` marks exactly
        // the lanes whose sign bit is set. ZF is 1 iff no lane has its sign
        // bit set, i.e. the OR-reduction of those marks is zero.
        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2196
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // Same sign-bit reduction as `_mm_testz_pd`, but over `!a & b`
        // (`_mm_andnot_pd`), which is the CF condition.
        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2217
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
    // Forwards to the raw 128-bit `vtestnzcpd` intrinsic.
    unsafe { vtestnzcpd(a, b) }
}
2235
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the raw `vtestzps256` intrinsic, which returns the ZF
    // condition of `vtestps`.
    unsafe { vtestzps256(a, b) }
}
2252
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the raw `vtestcps256` intrinsic, which returns the CF
    // condition of `vtestps`.
    unsafe { vtestcps256(a, b) }
}
2269
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the raw `vtestnzcps256` intrinsic, which yields the
    // combined "not-zero-and-not-carry" condition of `vtestps`.
    unsafe { vtestnzcps256(a, b) }
}
2287
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // Reinterpreting `a & b` as i32 lanes, `simd_lt(x, 0)` marks exactly
        // the lanes whose sign bit is set. ZF is 1 iff the OR-reduction of
        // those marks is zero.
        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2308
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // Same sign-bit reduction as `_mm_testz_ps`, but over `!a & b`
        // (`_mm_andnot_ps`), which is the CF condition.
        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2329
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw 128-bit `vtestnzcps` intrinsic.
    unsafe { vtestnzcps(a, b) }
}
2347
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed double-precision (64-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_pd(a: __m256d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // `simd_lt` against zero produces an all-ones lane exactly where the
        // sign bit is set; bit i of the result is then the sign bit of lane i.
        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
        simd_bitmask::<i64x4, u8>(mask) as i32
    }
}
2366
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed single-precision (32-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_ps(a: __m256) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // `simd_lt` against zero produces an all-ones lane exactly where the
        // sign bit is set; bit i of the result is then the sign bit of lane i.
        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
        simd_bitmask::<i32x8, u8>(mask) as i32
    }
}
2385
/// Returns vector of type __m256d with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_pd() -> __m256d {
    // All-zero bits are a valid `__m256d`; the `const` block makes the zero
    // value a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
2397
/// Returns vector of type __m256 with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_ps() -> __m256 {
    // All-zero bits are a valid `__m256`; the `const` block makes the zero
    // value a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
2409
/// Returns vector of type __m256i with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxor))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_si256() -> __m256i {
    // All-zero bits are a valid `__m256i`; the `const` block makes the zero
    // value a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
2421
/// Sets packed double-precision (64-bit) floating-point elements in returned
/// vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
    // `set` takes arguments highest-lane first; delegate to `setr` with the
    // arguments reversed.
    _mm256_setr_pd(d, c, b, a)
}
2435
/// Sets packed single-precision (32-bit) floating-point elements in returned
/// vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_ps(
    a: f32,
    b: f32,
    c: f32,
    d: f32,
    e: f32,
    f: f32,
    g: f32,
    h: f32,
) -> __m256 {
    // `set` takes arguments highest-lane first; delegate to `setr` with the
    // arguments reversed.
    _mm256_setr_ps(h, g, f, e, d, c, b, a)
}
2457
/// Sets packed 8-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    // `set` takes arguments highest-lane first; delegate to `setr` with the
    // arguments reversed.
    #[rustfmt::skip]
    _mm256_setr_epi8(
        e31, e30, e29, e28, e27, e26, e25, e24,
        e23, e22, e21, e20, e19, e18, e17, e16,
        e15, e14, e13, e12, e11, e10, e09, e08,
        e07, e06, e05, e04, e03, e02, e01, e00,
    )
}
2508
/// Sets packed 16-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi16(
    e00: i16,
    e01: i16,
    e02: i16,
    e03: i16,
    e04: i16,
    e05: i16,
    e06: i16,
    e07: i16,
    e08: i16,
    e09: i16,
    e10: i16,
    e11: i16,
    e12: i16,
    e13: i16,
    e14: i16,
    e15: i16,
) -> __m256i {
    // `set` takes arguments highest-lane first; delegate to `setr` with the
    // arguments reversed.
    #[rustfmt::skip]
    _mm256_setr_epi16(
        e15, e14, e13, e12,
        e11, e10, e09, e08,
        e07, e06, e05, e04,
        e03, e02, e01, e00,
    )
}
2543
2544/// Sets packed 32-bit integers in returned vector with the supplied values.
2545///
2546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2547#[inline]
2548#[target_feature(enable = "avx")]
2549// This intrinsic has no corresponding instruction.
2550#[stable(feature = "simd_x86", since = "1.27.0")]
2551#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2552pub const fn _mm256_set_epi32(
2553    e0: i32,
2554    e1: i32,
2555    e2: i32,
2556    e3: i32,
2557    e4: i32,
2558    e5: i32,
2559    e6: i32,
2560    e7: i32,
2561) -> __m256i {
2562    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2563}
2564
2565/// Sets packed 64-bit integers in returned vector with the supplied values.
2566///
2567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2568#[inline]
2569#[target_feature(enable = "avx")]
2570// This intrinsic has no corresponding instruction.
2571#[stable(feature = "simd_x86", since = "1.27.0")]
2572#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2573pub const fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2574    _mm256_setr_epi64x(d, c, b, a)
2575}
2576
2577/// Sets packed double-precision (64-bit) floating-point elements in returned
2578/// vector with the supplied values in reverse order.
2579///
2580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2581#[inline]
2582#[target_feature(enable = "avx")]
2583// This intrinsic has no corresponding instruction.
2584#[stable(feature = "simd_x86", since = "1.27.0")]
2585#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2586pub const fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2587    __m256d([a, b, c, d])
2588}
2589
2590/// Sets packed single-precision (32-bit) floating-point elements in returned
2591/// vector with the supplied values in reverse order.
2592///
2593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2594#[inline]
2595#[target_feature(enable = "avx")]
2596// This intrinsic has no corresponding instruction.
2597#[stable(feature = "simd_x86", since = "1.27.0")]
2598#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2599pub const fn _mm256_setr_ps(
2600    a: f32,
2601    b: f32,
2602    c: f32,
2603    d: f32,
2604    e: f32,
2605    f: f32,
2606    g: f32,
2607    h: f32,
2608) -> __m256 {
2609    __m256([a, b, c, d, e, f, g, h])
2610}
2611
/// Sets packed 8-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    unsafe {
        // `setr` arguments are already in memory (lowest-lane-first) order,
        // so they map straight onto the lanes of `i8x32`.
        #[rustfmt::skip]
        transmute(i8x32::new(
            e00, e01, e02, e03, e04, e05, e06, e07,
            e08, e09, e10, e11, e12, e13, e14, e15,
            e16, e17, e18, e19, e20, e21, e22, e23,
            e24, e25, e26, e27, e28, e29, e30, e31,
        ))
    }
}
2665
2666/// Sets packed 16-bit integers in returned vector with the supplied values in
2667/// reverse order.
2668///
2669/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2670#[inline]
2671#[target_feature(enable = "avx")]
2672// This intrinsic has no corresponding instruction.
2673#[stable(feature = "simd_x86", since = "1.27.0")]
2674#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2675pub const fn _mm256_setr_epi16(
2676    e00: i16,
2677    e01: i16,
2678    e02: i16,
2679    e03: i16,
2680    e04: i16,
2681    e05: i16,
2682    e06: i16,
2683    e07: i16,
2684    e08: i16,
2685    e09: i16,
2686    e10: i16,
2687    e11: i16,
2688    e12: i16,
2689    e13: i16,
2690    e14: i16,
2691    e15: i16,
2692) -> __m256i {
2693    unsafe {
2694        #[rustfmt::skip]
2695        transmute(i16x16::new(
2696            e00, e01, e02, e03,
2697            e04, e05, e06, e07,
2698            e08, e09, e10, e11,
2699            e12, e13, e14, e15,
2700        ))
2701    }
2702}
2703
2704/// Sets packed 32-bit integers in returned vector with the supplied values in
2705/// reverse order.
2706///
2707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2708#[inline]
2709#[target_feature(enable = "avx")]
2710// This intrinsic has no corresponding instruction.
2711#[stable(feature = "simd_x86", since = "1.27.0")]
2712#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2713pub const fn _mm256_setr_epi32(
2714    e0: i32,
2715    e1: i32,
2716    e2: i32,
2717    e3: i32,
2718    e4: i32,
2719    e5: i32,
2720    e6: i32,
2721    e7: i32,
2722) -> __m256i {
2723    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2724}
2725
2726/// Sets packed 64-bit integers in returned vector with the supplied values in
2727/// reverse order.
2728///
2729/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2730#[inline]
2731#[target_feature(enable = "avx")]
2732// This intrinsic has no corresponding instruction.
2733#[stable(feature = "simd_x86", since = "1.27.0")]
2734#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2735pub const fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2736    unsafe { transmute(i64x4::new(a, b, c, d)) }
2737}
2738
2739/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2740/// elements of returned vector.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2743#[inline]
2744#[target_feature(enable = "avx")]
2745// This intrinsic has no corresponding instruction.
2746#[stable(feature = "simd_x86", since = "1.27.0")]
2747#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2748pub const fn _mm256_set1_pd(a: f64) -> __m256d {
2749    f64x4::splat(a).as_m256d()
2750}
2751
2752/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2753/// elements of returned vector.
2754///
2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2756#[inline]
2757#[target_feature(enable = "avx")]
2758// This intrinsic has no corresponding instruction.
2759#[stable(feature = "simd_x86", since = "1.27.0")]
2760#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2761pub const fn _mm256_set1_ps(a: f32) -> __m256 {
2762    f32x8::splat(a).as_m256()
2763}
2764
/// Broadcasts 8-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastb`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi8(a: i8) -> __m256i {
    // Splat `a` across all 32 lanes, then reinterpret as `__m256i`.
    i8x32::splat(a).as_m256i()
}
2777
2778/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2779/// This intrinsic may generate the `vpbroadcastw`.
2780///
2781/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2782#[inline]
2783#[target_feature(enable = "avx")]
2784//#[cfg_attr(test, assert_instr(vpshufb))]
2785#[cfg_attr(test, assert_instr(vinsertf128))]
2786// This intrinsic has no corresponding instruction.
2787#[stable(feature = "simd_x86", since = "1.27.0")]
2788#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2789pub const fn _mm256_set1_epi16(a: i16) -> __m256i {
2790    i16x16::splat(a).as_m256i()
2791}
2792
/// Broadcasts 32-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastd`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi32(a: i32) -> __m256i {
    // Splat `a` across all 8 lanes, then reinterpret as `__m256i`.
    i32x8::splat(a).as_m256i()
}
2805
/// Broadcasts 64-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastq`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
#[inline]
#[target_feature(enable = "avx")]
// Codegen differs between the two targets, so the asserted instruction is
// per-architecture.
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi64x(a: i64) -> __m256i {
    // Splat `a` across all 4 lanes, then reinterpret as `__m256i`.
    i64x4::splat(a).as_m256i()
}
2820
/// Cast vector of type __m256d to type __m256.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_ps(a: __m256d) -> __m256 {
    // Pure bit reinterpretation between two 256-bit vector types.
    unsafe { transmute(a) }
}
2833
/// Cast vector of type __m256 to type __m256d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps_pd(a: __m256) -> __m256d {
    // Pure bit reinterpretation between two 256-bit vector types.
    unsafe { transmute(a) }
}
2846
/// Casts vector of type __m256 to type __m256i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps_si256(a: __m256) -> __m256i {
    // Pure bit reinterpretation between two 256-bit vector types.
    unsafe { transmute(a) }
}
2859
/// Casts vector of type __m256i to type __m256.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
    // Pure bit reinterpretation between two 256-bit vector types.
    unsafe { transmute(a) }
}
2872
/// Casts vector of type __m256d to type __m256i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_si256(a: __m256d) -> __m256i {
    // Pure bit reinterpretation between two 256-bit vector types.
    unsafe { transmute(a) }
}
2885
/// Casts vector of type __m256i to type __m256d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
    // SAFETY: both types are 256-bit wide vectors; this reinterprets the bit
    // pattern without modifying it.
    unsafe { transmute(a) }
}
2898
/// Casts vector of type __m256 to type __m128.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps256_ps128(a: __m256) -> __m128 {
    // Shuffle indices 0..=3 select the four low lanes of `a`, i.e. its low
    // 128 bits; the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
}
2911
/// Casts vector of type __m256d to type __m128d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
    // Shuffle indices 0..=1 select the two low lanes of `a`, i.e. its low
    // 128 bits; the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1]) }
}
2924
/// Casts vector of type __m256i to type __m128i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
    unsafe {
        // View the input as four i64 lanes, keep the low two (the low
        // 128 bits), then reinterpret the half-width result as __m128i.
        let a = a.as_i64x4();
        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(dst)
    }
}
2941
/// Casts vector of type __m128 to type __m256;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps128_ps256(a: __m128) -> __m256 {
    // Indices 0..=3 take all four lanes of `a` for the low half; index 4
    // (lane 0 of the second operand) fills the upper half from an
    // indeterminate vector.
    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
}
2959
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0..=1 take both lanes of `a` for the low half; index 2
    // (lane 0 of the second operand) fills the upper half from an
    // indeterminate vector.
    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
}
2977
/// Casts vector of type __m128i to type __m256i;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i64x2();
        // An all-zero vector stands in for the "undefined" upper half; any
        // valid value is permitted and zero is the simplest choice.
        let undefined = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
        transmute(dst)
    }
}
3000
/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
    // Indices 0..=3 select `a`; indices 4..=7 select lanes of the zero
    // vector, guaranteeing a zeroed upper half (unlike the `cast` variant).
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3015
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices 0..=1 select `a`; indices 2..=3 select lanes of the zero
        // vector, guaranteeing a zeroed upper half.
        let b = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
        transmute(dst)
    }
}
3034
/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0..=1 select `a`; indices 2..=3 select lanes of the zero
    // vector, guaranteeing a zeroed upper half.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
}
3050
/// Returns vector of type `__m256` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_ps() -> __m256 {
    // SAFETY: an all-zero bit pattern is a valid __m256. Evaluated in a
    // `const` block so the zeroing happens at compile time.
    const { unsafe { mem::zeroed() } }
}
3065
/// Returns vector of type `__m256d` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_pd() -> __m256d {
    // SAFETY: an all-zero bit pattern is a valid __m256d. Evaluated in a
    // `const` block so the zeroing happens at compile time.
    const { unsafe { mem::zeroed() } }
}
3080
/// Returns vector of type __m256i with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_si256() -> __m256i {
    // SAFETY: an all-zero bit pattern is a valid __m256i. Evaluated in a
    // `const` block so the zeroing happens at compile time.
    const { unsafe { mem::zeroed() } }
}
3095
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Indices 0..=3 select `lo` (low 128 bits); indices 4..=7 select `hi`
    // (high 128 bits).
    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3107
/// Sets packed __m256d returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
    unsafe {
        // Bit-reinterpret both halves as __m128 and reuse the float version;
        // the concatenation is purely positional, so the lane type is
        // irrelevant.
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
    }
}
3123
/// Sets packed __m256i returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
    unsafe {
        // Bit-reinterpret both halves as __m128 and reuse the float version;
        // the concatenation is purely positional, so the lane type is
        // irrelevant.
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
    }
}
3139
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
    // The `setr` ("reversed") variant is `_mm256_set_m128` with the
    // argument order swapped.
    _mm256_set_m128(hi, lo)
}
3151
/// Sets packed __m256d returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
    // The `setr` ("reversed") variant is `_mm256_set_m128d` with the
    // argument order swapped.
    _mm256_set_m128d(hi, lo)
}
3163
/// Sets packed __m256i returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // The `setr` ("reversed") variant is `_mm256_set_m128i` with the
    // argument order swapped.
    _mm256_set_m128i(hi, lo)
}
3175
/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 16-byte read.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
    // Load the low half, widen to 256 bits, then insert the high half into
    // the upper 128 bits (lane index 1).
    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
}
3191
/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 16-byte read.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
    // Load the low half, widen to 256 bits, then insert the high half into
    // the upper 128 bits (lane index 1).
    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
}
3207
/// Loads two 128-bit values (composed of integer data) from memory, and combine
/// them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 16-byte read.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
    // Load the low half, widen to 256 bits, then insert the high half into
    // the upper 128 bits (lane index 1).
    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
}
3222
/// Stores the high and low 128-bit halves (each composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `a` into memory two
/// different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 16-byte write.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
    // Store the low 128 bits, then extract lane 1 (upper 128 bits) and
    // store it separately.
    let lo = _mm256_castps256_ps128(a);
    _mm_storeu_ps(loaddr, lo);
    let hi = _mm256_extractf128_ps::<1>(a);
    _mm_storeu_ps(hiaddr, hi);
}
3240
/// Stores the high and low 128-bit halves (each composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `a` into memory two
/// different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 16-byte write.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
    // Store the low 128 bits, then extract lane 1 (upper 128 bits) and
    // store it separately.
    let lo = _mm256_castpd256_pd128(a);
    _mm_storeu_pd(loaddr, lo);
    let hi = _mm256_extractf128_pd::<1>(a);
    _mm_storeu_pd(hiaddr, hi);
}
3258
/// Stores the high and low 128-bit halves (each composed of integer data) from
/// `a` into memory two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 16-byte write.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
    // Store the low 128 bits, then extract lane 1 (upper 128 bits) and
    // store it separately.
    let lo = _mm256_castsi256_si128(a);
    _mm_storeu_si128(loaddr, lo);
    let hi = _mm256_extractf128_si256::<1>(a);
    _mm_storeu_si128(hiaddr, hi);
}
3275
/// Returns the first element of the input vector of `[8 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movss))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtss_f32(a: __m256) -> f32 {
    // Extract lane 0, i.e. the lowest 32 bits of the vector.
    unsafe { simd_extract!(a, 0) }
}
3287
// LLVM intrinsics used in the above functions.
//
// Each `link_name` is the mangled name of an LLVM x86 intrinsic; the Rust
// declaration must match the signature LLVM expects for that intrinsic.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Rounding and dot product.
    #[link_name = "llvm.x86.avx.round.pd.256"]
    fn roundpd256(a: __m256d, b: i32) -> __m256d;
    #[link_name = "llvm.x86.avx.round.ps.256"]
    fn roundps256(a: __m256, b: i32) -> __m256;
    #[link_name = "llvm.x86.avx.dp.ps.256"]
    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
    // Comparisons (128-bit SSE forms and 256-bit AVX forms).
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.avx.cmp.pd.256"]
    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cmp.ps.256"]
    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
    // Conversions (with and without truncation).
    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
    fn vcvtps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
    fn vcvttpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
    fn vcvtpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
    fn vcvttps2dq(a: __m256) -> i32x8;
    // Register-state management.
    #[link_name = "llvm.x86.avx.vzeroall"]
    fn vzeroall();
    #[link_name = "llvm.x86.avx.vzeroupper"]
    fn vzeroupper();
    // Variable permutes.
    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    fn vpermilps(a: __m128, b: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
    // Unaligned load and approximation instructions.
    #[link_name = "llvm.x86.avx.ldu.dq.256"]
    fn vlddqu(mem_addr: *const i8) -> i8x32;
    #[link_name = "llvm.x86.avx.rcp.ps.256"]
    fn vrcpps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
    fn vrsqrtps(a: __m256) -> __m256;
    // Test instructions (ZF/CF-style predicates).
    #[link_name = "llvm.x86.avx.ptestnzc.256"]
    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
    fn vtestzps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
    fn vtestcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
    fn vtestnzcps(a: __m128, b: __m128) -> i32;
    // Min/max (IEEE-asymmetric: the second operand wins on NaN/±0 ties).
    #[link_name = "llvm.x86.avx.min.ps.256"]
    fn vminps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.max.ps.256"]
    fn vmaxps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.min.pd.256"]
    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.max.pd.256"]
    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
}
3362
3363#[cfg(test)]
3364mod tests {
3365    use crate::core_arch::assert_eq_const as assert_eq;
3366    use crate::core_arch::simd::*;
3367    use crate::hint::black_box;
3368    use crate::ptr;
3369    use stdarch_test::simd_test;
3370
3371    use crate::core_arch::x86::*;
3372
3373    #[simd_test(enable = "avx")]
3374    const fn test_mm256_add_pd() {
3375        let a = _mm256_setr_pd(1., 2., 3., 4.);
3376        let b = _mm256_setr_pd(5., 6., 7., 8.);
3377        let r = _mm256_add_pd(a, b);
3378        let e = _mm256_setr_pd(6., 8., 10., 12.);
3379        assert_eq_m256d(r, e);
3380    }
3381
3382    #[simd_test(enable = "avx")]
3383    const fn test_mm256_add_ps() {
3384        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3385        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3386        let r = _mm256_add_ps(a, b);
3387        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3388        assert_eq_m256(r, e);
3389    }
3390
3391    #[simd_test(enable = "avx")]
3392    const fn test_mm256_and_pd() {
3393        let a = _mm256_set1_pd(1.);
3394        let b = _mm256_set1_pd(0.6);
3395        let r = _mm256_and_pd(a, b);
3396        let e = _mm256_set1_pd(0.5);
3397        assert_eq_m256d(r, e);
3398    }
3399
3400    #[simd_test(enable = "avx")]
3401    const fn test_mm256_and_ps() {
3402        let a = _mm256_set1_ps(1.);
3403        let b = _mm256_set1_ps(0.6);
3404        let r = _mm256_and_ps(a, b);
3405        let e = _mm256_set1_ps(0.5);
3406        assert_eq_m256(r, e);
3407    }
3408
3409    #[simd_test(enable = "avx")]
3410    const fn test_mm256_or_pd() {
3411        let a = _mm256_set1_pd(1.);
3412        let b = _mm256_set1_pd(0.6);
3413        let r = _mm256_or_pd(a, b);
3414        let e = _mm256_set1_pd(1.2);
3415        assert_eq_m256d(r, e);
3416    }
3417
3418    #[simd_test(enable = "avx")]
3419    const fn test_mm256_or_ps() {
3420        let a = _mm256_set1_ps(1.);
3421        let b = _mm256_set1_ps(0.6);
3422        let r = _mm256_or_ps(a, b);
3423        let e = _mm256_set1_ps(1.2);
3424        assert_eq_m256(r, e);
3425    }
3426
3427    #[simd_test(enable = "avx")]
3428    const fn test_mm256_shuffle_pd() {
3429        let a = _mm256_setr_pd(1., 4., 5., 8.);
3430        let b = _mm256_setr_pd(2., 3., 6., 7.);
3431        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3432        let e = _mm256_setr_pd(4., 3., 8., 7.);
3433        assert_eq_m256d(r, e);
3434    }
3435
3436    #[simd_test(enable = "avx")]
3437    const fn test_mm256_shuffle_ps() {
3438        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3439        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3440        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3441        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3442        assert_eq_m256(r, e);
3443    }
3444
3445    #[simd_test(enable = "avx")]
3446    const fn test_mm256_andnot_pd() {
3447        let a = _mm256_set1_pd(0.);
3448        let b = _mm256_set1_pd(0.6);
3449        let r = _mm256_andnot_pd(a, b);
3450        assert_eq_m256d(r, b);
3451    }
3452
3453    #[simd_test(enable = "avx")]
3454    const fn test_mm256_andnot_ps() {
3455        let a = _mm256_set1_ps(0.);
3456        let b = _mm256_set1_ps(0.6);
3457        let r = _mm256_andnot_ps(a, b);
3458        assert_eq_m256(r, b);
3459    }
3460
    #[simd_test(enable = "avx")]
    fn test_mm256_max_pd() {
        // Basic element-wise maximum.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_max_pd(a, b);
        let e = _mm256_setr_pd(2., 4., 6., 8.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu = _mm256_castpd_si256(w).as_u64x4();
        let xu = _mm256_castpd_si256(x).as_u64x4();
        // Compare bit patterns, since -0.0 == 0.0 under `==` would hide
        // which operand was returned.
        assert_eq!(wu, u64x4::splat(0x8000_0000_0000_0000u64));
        assert_eq!(xu, u64x4::splat(0u64));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        assert_eq_m256d(y, _mm256_set1_pd(0.0));
        let zf = *z.as_f64x4().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3485
    #[simd_test(enable = "avx")]
    fn test_mm256_max_ps() {
        // Basic element-wise maximum.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_max_ps(a, b);
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu = _mm256_castps_si256(w).as_u32x8();
        let xu = _mm256_castps_si256(x).as_u32x8();
        // Compare bit patterns, since -0.0 == 0.0 under `==` would hide
        // which operand was returned.
        assert_eq!(wu, u32x8::splat(0x8000_0000u32));
        assert_eq!(xu, u32x8::splat(0u32));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        assert_eq_m256(y, _mm256_set1_ps(0.0));
        let zf = *z.as_f32x8().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3510
    #[simd_test(enable = "avx")]
    fn test_mm256_min_pd() {
        // Basic element-wise minimum.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_min_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 5., 7.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu = _mm256_castpd_si256(w).as_u64x4();
        let xu = _mm256_castpd_si256(x).as_u64x4();
        // Compare bit patterns, since -0.0 == 0.0 under `==` would hide
        // which operand was returned.
        assert_eq!(wu, u64x4::splat(0x8000_0000_0000_0000u64));
        assert_eq!(xu, u64x4::splat(0u64));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        assert_eq_m256d(y, _mm256_set1_pd(0.0));
        let zf = *z.as_f64x4().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3535
    #[simd_test(enable = "avx")]
    fn test_mm256_min_ps() {
        // Basic element-wise minimum.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_min_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu = _mm256_castps_si256(w).as_u32x8();
        let xu = _mm256_castps_si256(x).as_u32x8();
        // Compare bit patterns, since -0.0 == 0.0 under `==` would hide
        // which operand was returned.
        assert_eq!(wu, u32x8::splat(0x8000_0000u32));
        assert_eq!(xu, u32x8::splat(0u32));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        assert_eq_m256(y, _mm256_set1_ps(0.0));
        let zf = *z.as_f32x8().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3560
3561    #[simd_test(enable = "avx")]
3562    const fn test_mm256_mul_pd() {
3563        let a = _mm256_setr_pd(1., 2., 3., 4.);
3564        let b = _mm256_setr_pd(5., 6., 7., 8.);
3565        let r = _mm256_mul_pd(a, b);
3566        let e = _mm256_setr_pd(5., 12., 21., 32.);
3567        assert_eq_m256d(r, e);
3568    }
3569
3570    #[simd_test(enable = "avx")]
3571    const fn test_mm256_mul_ps() {
3572        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3573        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3574        let r = _mm256_mul_ps(a, b);
3575        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3576        assert_eq_m256(r, e);
3577    }
3578
3579    #[simd_test(enable = "avx")]
3580    const fn test_mm256_addsub_pd() {
3581        let a = _mm256_setr_pd(1., 2., 3., 4.);
3582        let b = _mm256_setr_pd(5., 6., 7., 8.);
3583        let r = _mm256_addsub_pd(a, b);
3584        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3585        assert_eq_m256d(r, e);
3586    }
3587
3588    #[simd_test(enable = "avx")]
3589    const fn test_mm256_addsub_ps() {
3590        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3591        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3592        let r = _mm256_addsub_ps(a, b);
3593        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3594        assert_eq_m256(r, e);
3595    }
3596
3597    #[simd_test(enable = "avx")]
3598    const fn test_mm256_sub_pd() {
3599        let a = _mm256_setr_pd(1., 2., 3., 4.);
3600        let b = _mm256_setr_pd(5., 6., 7., 8.);
3601        let r = _mm256_sub_pd(a, b);
3602        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3603        assert_eq_m256d(r, e);
3604    }
3605
3606    #[simd_test(enable = "avx")]
3607    const fn test_mm256_sub_ps() {
3608        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3609        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3610        let r = _mm256_sub_ps(a, b);
3611        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3612        assert_eq_m256(r, e);
3613    }
3614
3615    #[simd_test(enable = "avx")]
3616    fn test_mm256_round_pd() {
3617        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3618        let result_closest = _mm256_round_pd::<0b0000>(a);
3619        let result_down = _mm256_round_pd::<0b0001>(a);
3620        let result_up = _mm256_round_pd::<0b0010>(a);
3621        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3622        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3623        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3624        assert_eq_m256d(result_closest, expected_closest);
3625        assert_eq_m256d(result_down, expected_down);
3626        assert_eq_m256d(result_up, expected_up);
3627    }
3628
3629    #[simd_test(enable = "avx")]
3630    const fn test_mm256_floor_pd() {
3631        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3632        let result_down = _mm256_floor_pd(a);
3633        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3634        assert_eq_m256d(result_down, expected_down);
3635    }
3636
3637    #[simd_test(enable = "avx")]
3638    const fn test_mm256_ceil_pd() {
3639        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3640        let result_up = _mm256_ceil_pd(a);
3641        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3642        assert_eq_m256d(result_up, expected_up);
3643    }
3644
3645    #[simd_test(enable = "avx")]
3646    fn test_mm256_round_ps() {
3647        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3648        let result_closest = _mm256_round_ps::<0b0000>(a);
3649        let result_down = _mm256_round_ps::<0b0001>(a);
3650        let result_up = _mm256_round_ps::<0b0010>(a);
3651        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3652        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3653        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3654        assert_eq_m256(result_closest, expected_closest);
3655        assert_eq_m256(result_down, expected_down);
3656        assert_eq_m256(result_up, expected_up);
3657    }
3658
3659    #[simd_test(enable = "avx")]
3660    const fn test_mm256_floor_ps() {
3661        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3662        let result_down = _mm256_floor_ps(a);
3663        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3664        assert_eq_m256(result_down, expected_down);
3665    }
3666
3667    #[simd_test(enable = "avx")]
3668    const fn test_mm256_ceil_ps() {
3669        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3670        let result_up = _mm256_ceil_ps(a);
3671        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3672        assert_eq_m256(result_up, expected_up);
3673    }
3674
3675    #[simd_test(enable = "avx")]
3676    fn test_mm256_sqrt_pd() {
3677        let a = _mm256_setr_pd(4., 9., 16., 25.);
3678        let r = _mm256_sqrt_pd(a);
3679        let e = _mm256_setr_pd(2., 3., 4., 5.);
3680        assert_eq_m256d(r, e);
3681    }
3682
3683    #[simd_test(enable = "avx")]
3684    fn test_mm256_sqrt_ps() {
3685        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3686        let r = _mm256_sqrt_ps(a);
3687        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3688        assert_eq_m256(r, e);
3689    }
3690
3691    #[simd_test(enable = "avx")]
3692    const fn test_mm256_div_ps() {
3693        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3694        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3695        let r = _mm256_div_ps(a, b);
3696        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3697        assert_eq_m256(r, e);
3698    }
3699
3700    #[simd_test(enable = "avx")]
3701    const fn test_mm256_div_pd() {
3702        let a = _mm256_setr_pd(4., 9., 16., 25.);
3703        let b = _mm256_setr_pd(4., 3., 2., 5.);
3704        let r = _mm256_div_pd(a, b);
3705        let e = _mm256_setr_pd(1., 3., 8., 5.);
3706        assert_eq_m256d(r, e);
3707    }
3708
3709    #[simd_test(enable = "avx")]
3710    const fn test_mm256_blend_pd() {
3711        let a = _mm256_setr_pd(4., 9., 16., 25.);
3712        let b = _mm256_setr_pd(4., 3., 2., 5.);
3713        let r = _mm256_blend_pd::<0x0>(a, b);
3714        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3715        let r = _mm256_blend_pd::<0x3>(a, b);
3716        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3717        let r = _mm256_blend_pd::<0xF>(a, b);
3718        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3719    }
3720
3721    #[simd_test(enable = "avx")]
3722    const fn test_mm256_blend_ps() {
3723        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3724        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3725        let r = _mm256_blend_ps::<0x0>(a, b);
3726        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3727        let r = _mm256_blend_ps::<0x3>(a, b);
3728        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3729        let r = _mm256_blend_ps::<0xF>(a, b);
3730        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3731    }
3732
3733    #[simd_test(enable = "avx")]
3734    const fn test_mm256_blendv_pd() {
3735        let a = _mm256_setr_pd(4., 9., 16., 25.);
3736        let b = _mm256_setr_pd(4., 3., 2., 5.);
3737        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3738        let r = _mm256_blendv_pd(a, b, c);
3739        let e = _mm256_setr_pd(4., 9., 2., 5.);
3740        assert_eq_m256d(r, e);
3741    }
3742
3743    #[simd_test(enable = "avx")]
3744    const fn test_mm256_blendv_ps() {
3745        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3746        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3747        #[rustfmt::skip]
3748        let c = _mm256_setr_ps(
3749            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3750        );
3751        let r = _mm256_blendv_ps(a, b, c);
3752        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3753        assert_eq_m256(r, e);
3754    }
3755
3756    #[simd_test(enable = "avx")]
3757    fn test_mm256_dp_ps() {
3758        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3759        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3760        let r = _mm256_dp_ps::<0xFF>(a, b);
3761        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3762        assert_eq_m256(r, e);
3763    }
3764
3765    #[simd_test(enable = "avx")]
3766    const fn test_mm256_hadd_pd() {
3767        let a = _mm256_setr_pd(4., 9., 16., 25.);
3768        let b = _mm256_setr_pd(4., 3., 2., 5.);
3769        let r = _mm256_hadd_pd(a, b);
3770        let e = _mm256_setr_pd(13., 7., 41., 7.);
3771        assert_eq_m256d(r, e);
3772
3773        let a = _mm256_setr_pd(1., 2., 3., 4.);
3774        let b = _mm256_setr_pd(5., 6., 7., 8.);
3775        let r = _mm256_hadd_pd(a, b);
3776        let e = _mm256_setr_pd(3., 11., 7., 15.);
3777        assert_eq_m256d(r, e);
3778    }
3779
3780    #[simd_test(enable = "avx")]
3781    const fn test_mm256_hadd_ps() {
3782        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3783        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3784        let r = _mm256_hadd_ps(a, b);
3785        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3786        assert_eq_m256(r, e);
3787
3788        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3789        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3790        let r = _mm256_hadd_ps(a, b);
3791        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3792        assert_eq_m256(r, e);
3793    }
3794
3795    #[simd_test(enable = "avx")]
3796    const fn test_mm256_hsub_pd() {
3797        let a = _mm256_setr_pd(4., 9., 16., 25.);
3798        let b = _mm256_setr_pd(4., 3., 2., 5.);
3799        let r = _mm256_hsub_pd(a, b);
3800        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3801        assert_eq_m256d(r, e);
3802
3803        let a = _mm256_setr_pd(1., 2., 3., 4.);
3804        let b = _mm256_setr_pd(5., 6., 7., 8.);
3805        let r = _mm256_hsub_pd(a, b);
3806        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3807        assert_eq_m256d(r, e);
3808    }
3809
3810    #[simd_test(enable = "avx")]
3811    const fn test_mm256_hsub_ps() {
3812        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3813        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3814        let r = _mm256_hsub_ps(a, b);
3815        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3816        assert_eq_m256(r, e);
3817
3818        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3819        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3820        let r = _mm256_hsub_ps(a, b);
3821        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3822        assert_eq_m256(r, e);
3823    }
3824
3825    #[simd_test(enable = "avx")]
3826    const fn test_mm256_xor_pd() {
3827        let a = _mm256_setr_pd(4., 9., 16., 25.);
3828        let b = _mm256_set1_pd(0.);
3829        let r = _mm256_xor_pd(a, b);
3830        assert_eq_m256d(r, a);
3831    }
3832
3833    #[simd_test(enable = "avx")]
3834    const fn test_mm256_xor_ps() {
3835        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3836        let b = _mm256_set1_ps(0.);
3837        let r = _mm256_xor_ps(a, b);
3838        assert_eq_m256(r, a);
3839    }
3840
3841    #[simd_test(enable = "avx")]
3842    fn test_mm_cmp_pd() {
3843        let a = _mm_setr_pd(4., 9.);
3844        let b = _mm_setr_pd(4., 3.);
3845        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3846        assert!(get_m128d(r, 0).is_nan());
3847        assert!(get_m128d(r, 1).is_nan());
3848    }
3849
3850    #[simd_test(enable = "avx")]
3851    fn test_mm256_cmp_pd() {
3852        let a = _mm256_setr_pd(1., 2., 3., 4.);
3853        let b = _mm256_setr_pd(5., 6., 7., 8.);
3854        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3855        let e = _mm256_set1_pd(0.);
3856        assert_eq_m256d(r, e);
3857    }
3858
3859    #[simd_test(enable = "avx")]
3860    fn test_mm_cmp_ps() {
3861        let a = _mm_setr_ps(4., 3., 2., 5.);
3862        let b = _mm_setr_ps(4., 9., 16., 25.);
3863        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3864        assert!(get_m128(r, 0).is_nan());
3865        assert_eq!(get_m128(r, 1), 0.);
3866        assert_eq!(get_m128(r, 2), 0.);
3867        assert_eq!(get_m128(r, 3), 0.);
3868    }
3869
3870    #[simd_test(enable = "avx")]
3871    fn test_mm256_cmp_ps() {
3872        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3873        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3874        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3875        let e = _mm256_set1_ps(0.);
3876        assert_eq_m256(r, e);
3877    }
3878
3879    #[simd_test(enable = "avx")]
3880    fn test_mm_cmp_sd() {
3881        let a = _mm_setr_pd(4., 9.);
3882        let b = _mm_setr_pd(4., 3.);
3883        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3884        assert!(get_m128d(r, 0).is_nan());
3885        assert_eq!(get_m128d(r, 1), 9.);
3886    }
3887
3888    #[simd_test(enable = "avx")]
3889    fn test_mm_cmp_ss() {
3890        let a = _mm_setr_ps(4., 3., 2., 5.);
3891        let b = _mm_setr_ps(4., 9., 16., 25.);
3892        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3893        assert!(get_m128(r, 0).is_nan());
3894        assert_eq!(get_m128(r, 1), 3.);
3895        assert_eq!(get_m128(r, 2), 2.);
3896        assert_eq!(get_m128(r, 3), 5.);
3897    }
3898
3899    #[simd_test(enable = "avx")]
3900    const fn test_mm256_cvtepi32_pd() {
3901        let a = _mm_setr_epi32(4, 9, 16, 25);
3902        let r = _mm256_cvtepi32_pd(a);
3903        let e = _mm256_setr_pd(4., 9., 16., 25.);
3904        assert_eq_m256d(r, e);
3905    }
3906
3907    #[simd_test(enable = "avx")]
3908    const fn test_mm256_cvtepi32_ps() {
3909        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3910        let r = _mm256_cvtepi32_ps(a);
3911        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3912        assert_eq_m256(r, e);
3913    }
3914
3915    #[simd_test(enable = "avx")]
3916    const fn test_mm256_cvtpd_ps() {
3917        let a = _mm256_setr_pd(4., 9., 16., 25.);
3918        let r = _mm256_cvtpd_ps(a);
3919        let e = _mm_setr_ps(4., 9., 16., 25.);
3920        assert_eq_m128(r, e);
3921    }
3922
3923    #[simd_test(enable = "avx")]
3924    fn test_mm256_cvtps_epi32() {
3925        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3926        let r = _mm256_cvtps_epi32(a);
3927        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3928        assert_eq_m256i(r, e);
3929    }
3930
3931    #[simd_test(enable = "avx")]
3932    const fn test_mm256_cvtps_pd() {
3933        let a = _mm_setr_ps(4., 9., 16., 25.);
3934        let r = _mm256_cvtps_pd(a);
3935        let e = _mm256_setr_pd(4., 9., 16., 25.);
3936        assert_eq_m256d(r, e);
3937    }
3938
3939    #[simd_test(enable = "avx")]
3940    const fn test_mm256_cvtsd_f64() {
3941        let a = _mm256_setr_pd(1., 2., 3., 4.);
3942        let r = _mm256_cvtsd_f64(a);
3943        assert_eq!(r, 1.);
3944    }
3945
3946    #[simd_test(enable = "avx")]
3947    fn test_mm256_cvttpd_epi32() {
3948        let a = _mm256_setr_pd(4., 9., 16., 25.);
3949        let r = _mm256_cvttpd_epi32(a);
3950        let e = _mm_setr_epi32(4, 9, 16, 25);
3951        assert_eq_m128i(r, e);
3952    }
3953
3954    #[simd_test(enable = "avx")]
3955    fn test_mm256_cvtpd_epi32() {
3956        let a = _mm256_setr_pd(4., 9., 16., 25.);
3957        let r = _mm256_cvtpd_epi32(a);
3958        let e = _mm_setr_epi32(4, 9, 16, 25);
3959        assert_eq_m128i(r, e);
3960    }
3961
3962    #[simd_test(enable = "avx")]
3963    fn test_mm256_cvttps_epi32() {
3964        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3965        let r = _mm256_cvttps_epi32(a);
3966        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3967        assert_eq_m256i(r, e);
3968    }
3969
3970    #[simd_test(enable = "avx")]
3971    const fn test_mm256_extractf128_ps() {
3972        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3973        let r = _mm256_extractf128_ps::<0>(a);
3974        let e = _mm_setr_ps(4., 3., 2., 5.);
3975        assert_eq_m128(r, e);
3976    }
3977
3978    #[simd_test(enable = "avx")]
3979    const fn test_mm256_extractf128_pd() {
3980        let a = _mm256_setr_pd(4., 3., 2., 5.);
3981        let r = _mm256_extractf128_pd::<0>(a);
3982        let e = _mm_setr_pd(4., 3.);
3983        assert_eq_m128d(r, e);
3984    }
3985
3986    #[simd_test(enable = "avx")]
3987    const fn test_mm256_extractf128_si256() {
3988        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3989        let r = _mm256_extractf128_si256::<0>(a);
3990        let e = _mm_setr_epi64x(4, 3);
3991        assert_eq_m128i(r, e);
3992    }
3993
3994    #[simd_test(enable = "avx")]
3995    const fn test_mm256_extract_epi32() {
3996        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3997        let r1 = _mm256_extract_epi32::<0>(a);
3998        let r2 = _mm256_extract_epi32::<3>(a);
3999        assert_eq!(r1, -1);
4000        assert_eq!(r2, 3);
4001    }
4002
4003    #[simd_test(enable = "avx")]
4004    const fn test_mm256_cvtsi256_si32() {
4005        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4006        let r = _mm256_cvtsi256_si32(a);
4007        assert_eq!(r, 1);
4008    }
4009
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    fn test_mm256_zeroall() {
        // Smoke test only: the intrinsic affects raw register state, which has
        // no Rust-visible value to assert on; we just check it executes.
        _mm256_zeroall();
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    fn test_mm256_zeroupper() {
        // Smoke test only: no Rust-visible state changes, so there is nothing
        // to assert beyond the call completing without fault.
        _mm256_zeroupper();
    }
4021
4022    #[simd_test(enable = "avx")]
4023    fn test_mm256_permutevar_ps() {
4024        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4025        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4026        let r = _mm256_permutevar_ps(a, b);
4027        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
4028        assert_eq_m256(r, e);
4029    }
4030
4031    #[simd_test(enable = "avx")]
4032    fn test_mm_permutevar_ps() {
4033        let a = _mm_setr_ps(4., 3., 2., 5.);
4034        let b = _mm_setr_epi32(1, 2, 3, 4);
4035        let r = _mm_permutevar_ps(a, b);
4036        let e = _mm_setr_ps(3., 2., 5., 4.);
4037        assert_eq_m128(r, e);
4038    }
4039
4040    #[simd_test(enable = "avx")]
4041    const fn test_mm256_permute_ps() {
4042        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4043        let r = _mm256_permute_ps::<0x1b>(a);
4044        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
4045        assert_eq_m256(r, e);
4046    }
4047
4048    #[simd_test(enable = "avx")]
4049    const fn test_mm_permute_ps() {
4050        let a = _mm_setr_ps(4., 3., 2., 5.);
4051        let r = _mm_permute_ps::<0x1b>(a);
4052        let e = _mm_setr_ps(5., 2., 3., 4.);
4053        assert_eq_m128(r, e);
4054    }
4055
4056    #[simd_test(enable = "avx")]
4057    fn test_mm256_permutevar_pd() {
4058        let a = _mm256_setr_pd(4., 3., 2., 5.);
4059        let b = _mm256_setr_epi64x(1, 2, 3, 4);
4060        let r = _mm256_permutevar_pd(a, b);
4061        let e = _mm256_setr_pd(4., 3., 5., 2.);
4062        assert_eq_m256d(r, e);
4063    }
4064
4065    #[simd_test(enable = "avx")]
4066    fn test_mm_permutevar_pd() {
4067        let a = _mm_setr_pd(4., 3.);
4068        let b = _mm_setr_epi64x(3, 0);
4069        let r = _mm_permutevar_pd(a, b);
4070        let e = _mm_setr_pd(3., 4.);
4071        assert_eq_m128d(r, e);
4072    }
4073
4074    #[simd_test(enable = "avx")]
4075    const fn test_mm256_permute_pd() {
4076        let a = _mm256_setr_pd(4., 3., 2., 5.);
4077        let r = _mm256_permute_pd::<5>(a);
4078        let e = _mm256_setr_pd(3., 4., 5., 2.);
4079        assert_eq_m256d(r, e);
4080    }
4081
4082    #[simd_test(enable = "avx")]
4083    const fn test_mm_permute_pd() {
4084        let a = _mm_setr_pd(4., 3.);
4085        let r = _mm_permute_pd::<1>(a);
4086        let e = _mm_setr_pd(3., 4.);
4087        assert_eq_m128d(r, e);
4088    }
4089
4090    #[simd_test(enable = "avx")]
4091    const fn test_mm256_permute2f128_ps() {
4092        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
4093        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
4094        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
4095        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
4096        assert_eq_m256(r, e);
4097
4098        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4099        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
4100        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
4101        assert_eq_m256(r, z);
4102    }
4103
4104    #[simd_test(enable = "avx")]
4105    const fn test_mm256_permute2f128_pd() {
4106        let a = _mm256_setr_pd(1., 2., 3., 4.);
4107        let b = _mm256_setr_pd(5., 6., 7., 8.);
4108        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
4109        let e = _mm256_setr_pd(3., 4., 7., 8.);
4110        assert_eq_m256d(r, e);
4111
4112        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4113        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
4114        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
4115        assert_eq_m256d(r, e);
4116    }
4117
4118    #[simd_test(enable = "avx")]
4119    const fn test_mm256_permute2f128_si256() {
4120        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
4121        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
4122        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
4123        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
4124        assert_eq_m256i(r, e);
4125
4126        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4127        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
4128        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
4129        assert_eq_m256i(r, e);
4130    }
4131
4132    #[simd_test(enable = "avx")]
4133    const fn test_mm256_broadcast_ss() {
4134        let r = _mm256_broadcast_ss(&3.);
4135        let e = _mm256_set1_ps(3.);
4136        assert_eq_m256(r, e);
4137    }
4138
4139    #[simd_test(enable = "avx")]
4140    const fn test_mm_broadcast_ss() {
4141        let r = _mm_broadcast_ss(&3.);
4142        let e = _mm_set1_ps(3.);
4143        assert_eq_m128(r, e);
4144    }
4145
4146    #[simd_test(enable = "avx")]
4147    const fn test_mm256_broadcast_sd() {
4148        let r = _mm256_broadcast_sd(&3.);
4149        let e = _mm256_set1_pd(3.);
4150        assert_eq_m256d(r, e);
4151    }
4152
4153    #[simd_test(enable = "avx")]
4154    const fn test_mm256_broadcast_ps() {
4155        let a = _mm_setr_ps(4., 3., 2., 5.);
4156        let r = _mm256_broadcast_ps(&a);
4157        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
4158        assert_eq_m256(r, e);
4159    }
4160
4161    #[simd_test(enable = "avx")]
4162    const fn test_mm256_broadcast_pd() {
4163        let a = _mm_setr_pd(4., 3.);
4164        let r = _mm256_broadcast_pd(&a);
4165        let e = _mm256_setr_pd(4., 3., 4., 3.);
4166        assert_eq_m256d(r, e);
4167    }
4168
4169    #[simd_test(enable = "avx")]
4170    const fn test_mm256_insertf128_ps() {
4171        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4172        let b = _mm_setr_ps(4., 9., 16., 25.);
4173        let r = _mm256_insertf128_ps::<0>(a, b);
4174        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
4175        assert_eq_m256(r, e);
4176    }
4177
4178    #[simd_test(enable = "avx")]
4179    const fn test_mm256_insertf128_pd() {
4180        let a = _mm256_setr_pd(1., 2., 3., 4.);
4181        let b = _mm_setr_pd(5., 6.);
4182        let r = _mm256_insertf128_pd::<0>(a, b);
4183        let e = _mm256_setr_pd(5., 6., 3., 4.);
4184        assert_eq_m256d(r, e);
4185    }
4186
4187    #[simd_test(enable = "avx")]
4188    const fn test_mm256_insertf128_si256() {
4189        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4190        let b = _mm_setr_epi64x(5, 6);
4191        let r = _mm256_insertf128_si256::<0>(a, b);
4192        let e = _mm256_setr_epi64x(5, 6, 3, 4);
4193        assert_eq_m256i(r, e);
4194    }
4195
4196    #[simd_test(enable = "avx")]
4197    const fn test_mm256_insert_epi8() {
4198        #[rustfmt::skip]
4199        let a = _mm256_setr_epi8(
4200            1, 2, 3, 4, 5, 6, 7, 8,
4201            9, 10, 11, 12, 13, 14, 15, 16,
4202            17, 18, 19, 20, 21, 22, 23, 24,
4203            25, 26, 27, 28, 29, 30, 31, 32,
4204        );
4205        let r = _mm256_insert_epi8::<31>(a, 0);
4206        #[rustfmt::skip]
4207        let e = _mm256_setr_epi8(
4208            1, 2, 3, 4, 5, 6, 7, 8,
4209            9, 10, 11, 12, 13, 14, 15, 16,
4210            17, 18, 19, 20, 21, 22, 23, 24,
4211            25, 26, 27, 28, 29, 30, 31, 0,
4212        );
4213        assert_eq_m256i(r, e);
4214    }
4215
4216    #[simd_test(enable = "avx")]
4217    const fn test_mm256_insert_epi16() {
4218        #[rustfmt::skip]
4219        let a = _mm256_setr_epi16(
4220            0, 1, 2, 3, 4, 5, 6, 7,
4221            8, 9, 10, 11, 12, 13, 14, 15,
4222        );
4223        let r = _mm256_insert_epi16::<15>(a, 0);
4224        #[rustfmt::skip]
4225        let e = _mm256_setr_epi16(
4226            0, 1, 2, 3, 4, 5, 6, 7,
4227            8, 9, 10, 11, 12, 13, 14, 0,
4228        );
4229        assert_eq_m256i(r, e);
4230    }
4231
4232    #[simd_test(enable = "avx")]
4233    const fn test_mm256_insert_epi32() {
4234        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4235        let r = _mm256_insert_epi32::<7>(a, 0);
4236        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4237        assert_eq_m256i(r, e);
4238    }
4239
4240    #[simd_test(enable = "avx")]
4241    const fn test_mm256_load_pd() {
4242        let a = _mm256_setr_pd(1., 2., 3., 4.);
4243        let p = ptr::addr_of!(a) as *const f64;
4244        let r = unsafe { _mm256_load_pd(p) };
4245        let e = _mm256_setr_pd(1., 2., 3., 4.);
4246        assert_eq_m256d(r, e);
4247    }
4248
4249    #[simd_test(enable = "avx")]
4250    const fn test_mm256_store_pd() {
4251        let a = _mm256_setr_pd(1., 2., 3., 4.);
4252        let mut r = _mm256_undefined_pd();
4253        unsafe {
4254            _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4255        }
4256        assert_eq_m256d(r, a);
4257    }
4258
4259    #[simd_test(enable = "avx")]
4260    const fn test_mm256_load_ps() {
4261        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4262        let p = ptr::addr_of!(a) as *const f32;
4263        let r = unsafe { _mm256_load_ps(p) };
4264        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4265        assert_eq_m256(r, e);
4266    }
4267
4268    #[simd_test(enable = "avx")]
4269    const fn test_mm256_store_ps() {
4270        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4271        let mut r = _mm256_undefined_ps();
4272        unsafe {
4273            _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4274        }
4275        assert_eq_m256(r, a);
4276    }
4277
4278    #[simd_test(enable = "avx")]
4279    const fn test_mm256_loadu_pd() {
4280        let a = &[1.0f64, 2., 3., 4.];
4281        let p = a.as_ptr();
4282        let r = unsafe { _mm256_loadu_pd(black_box(p)) };
4283        let e = _mm256_setr_pd(1., 2., 3., 4.);
4284        assert_eq_m256d(r, e);
4285    }
4286
4287    #[simd_test(enable = "avx")]
4288    const fn test_mm256_storeu_pd() {
4289        let a = _mm256_set1_pd(9.);
4290        let mut r = _mm256_undefined_pd();
4291        unsafe {
4292            _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4293        }
4294        assert_eq_m256d(r, a);
4295    }
4296
4297    #[simd_test(enable = "avx")]
4298    const fn test_mm256_loadu_ps() {
4299        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4300        let p = a.as_ptr();
4301        let r = unsafe { _mm256_loadu_ps(black_box(p)) };
4302        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4303        assert_eq_m256(r, e);
4304    }
4305
4306    #[simd_test(enable = "avx")]
4307    const fn test_mm256_storeu_ps() {
4308        let a = _mm256_set1_ps(9.);
4309        let mut r = _mm256_undefined_ps();
4310        unsafe {
4311            _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4312        }
4313        assert_eq_m256(r, a);
4314    }
4315
4316    #[simd_test(enable = "avx")]
4317    const fn test_mm256_load_si256() {
4318        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4319        let p = ptr::addr_of!(a);
4320        let r = unsafe { _mm256_load_si256(p) };
4321        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4322        assert_eq_m256i(r, e);
4323    }
4324
4325    #[simd_test(enable = "avx")]
4326    const fn test_mm256_store_si256() {
4327        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4328        let mut r = _mm256_undefined_si256();
4329        unsafe {
4330            _mm256_store_si256(ptr::addr_of_mut!(r), a);
4331        }
4332        assert_eq_m256i(r, a);
4333    }
4334
4335    #[simd_test(enable = "avx")]
4336    const fn test_mm256_loadu_si256() {
4337        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4338        let p = ptr::addr_of!(a);
4339        let r = unsafe { _mm256_loadu_si256(black_box(p)) };
4340        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4341        assert_eq_m256i(r, e);
4342    }
4343
4344    #[simd_test(enable = "avx")]
4345    const fn test_mm256_storeu_si256() {
4346        let a = _mm256_set1_epi8(9);
4347        let mut r = _mm256_undefined_si256();
4348        unsafe {
4349            _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4350        }
4351        assert_eq_m256i(r, a);
4352    }
4353
    // Masked loads/stores. A lane participates when the sign bit of its mask
    // element is set (`!0`); the expected vectors below show 0.0 in every
    // unselected lane (loads zero them; the store tests pre-fill the
    // destination with zeros, so untouched lanes also read back as 0.0).

    #[simd_test(enable = "avx")]
    const fn test_mm256_maskload_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let r = unsafe { _mm256_maskload_pd(black_box(p), mask) };
        // Only lanes 1 and 3 are loaded; lanes 0 and 2 come back as zero.
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_maskstore_pd() {
        let mut r = _mm256_set1_pd(0.);
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        unsafe {
            _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        }
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_maskload_pd() {
        let a = &[1.0f64, 2.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi64x(0, !0);
        let r = unsafe { _mm_maskload_pd(black_box(p), mask) };
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_maskstore_pd() {
        let mut r = _mm_set1_pd(0.);
        let mask = _mm_setr_epi64x(0, !0);
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        }
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let r = unsafe { _mm256_maskload_ps(black_box(p), mask) };
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_maskstore_ps() {
        let mut r = _mm256_set1_ps(0.);
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        unsafe {
            _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        }
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let r = unsafe { _mm_maskload_ps(black_box(p), mask) };
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_maskstore_ps() {
        let mut r = _mm_set1_ps(0.);
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let a = _mm_setr_ps(1., 2., 3., 4.);
        unsafe {
            _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        }
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }
4441
    // Duplication shuffles: the expected vectors show movehdup duplicating the
    // odd-indexed f32 lanes, moveldup the even-indexed f32 lanes, and
    // movedup_pd the even-indexed f64 lanes.

    #[simd_test(enable = "avx")]
    const fn test_mm256_movehdup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_movehdup_ps(a);
        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_moveldup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_moveldup_ps(a);
        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_movedup_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_movedup_pd(a);
        let e = _mm256_setr_pd(1., 1., 3., 3.);
        assert_eq_m256d(r, e);
    }
4465
    // vlddqu: unaligned 256-bit integer load; the test only checks that all 32
    // bytes round-trip unchanged.
    #[simd_test(enable = "avx")]
    fn test_mm256_lddqu_si256() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let p = ptr::addr_of!(a);
        let r = unsafe { _mm256_lddqu_si256(black_box(p)) };
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
4486
    // Non-temporal (streaming) stores. `_mm_sfence()` is issued after each
    // store so the weakly-ordered write is globally visible before the value
    // is read back. The f64/f32 variants require a 32-byte-aligned
    // destination, provided via the `#[repr(align(32))]` wrapper structs.

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    fn test_mm256_stream_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        unsafe {
            _mm256_stream_si256(ptr::addr_of_mut!(r), a);
        }
        _mm_sfence();
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    fn test_mm256_stream_pd() {
        #[repr(align(32))]
        struct Memory {
            pub data: [f64; 4],
        }
        let a = _mm256_set1_pd(7.0);
        // Pre-fill with -1.0 so a skipped store would be detected.
        let mut mem = Memory { data: [-1.0; 4] };

        unsafe {
            _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        }
        _mm_sfence();
        for i in 0..4 {
            assert_eq!(mem.data[i], get_m256d(a, i));
        }
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    fn test_mm256_stream_ps() {
        #[repr(align(32))]
        struct Memory {
            pub data: [f32; 8],
        }
        let a = _mm256_set1_ps(7.0);
        // Pre-fill with -1.0 so a skipped store would be detected.
        let mut mem = Memory { data: [-1.0; 8] };

        unsafe {
            _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
        }
        _mm_sfence();
        for i in 0..8 {
            assert_eq!(mem.data[i], get_m256(a, i));
        }
    }
4536
    // vrcpps / vrsqrtps produce hardware *approximations*, so the tests
    // compare against reference values with a tolerance of 2 * 2^-11
    // (rel_err = 0.00048828125 = 2^-11) rather than exact equality.
    // NOTE(review): Intel documents a max relative error of 1.5*2^-12 for
    // these instructions; the looser bound here accommodates differing
    // implementations (e.g. AMD) — confirm against vendor manuals.

    #[simd_test(enable = "avx")]
    fn test_mm256_rcp_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rcp_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.49987793, 0.33325195, 0.24993896,
            0.19995117, 0.16662598, 0.14282227, 0.12496948,
        );
        let rel_err = 0.00048828125;
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_rsqrt_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rsqrt_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.7069092, 0.5772705, 0.49987793,
            0.44714355, 0.40820313, 0.3779297, 0.3534546,
        );
        let rel_err = 0.00048828125;
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }
4566
    // Unpack/interleave tests. As the expected vectors show, AVX unpacks
    // operate independently on each 128-bit half: e.g. unpackhi_pd interleaves
    // the high f64 of each half of `a` with that of `b`.

    #[simd_test(enable = "avx")]
    const fn test_mm256_unpackhi_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpackhi_pd(a, b);
        let e = _mm256_setr_pd(2., 6., 4., 8.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_unpackhi_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpackhi_ps(a, b);
        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_unpacklo_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpacklo_pd(a, b);
        let e = _mm256_setr_pd(1., 5., 3., 7.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_unpacklo_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpacklo_ps(a, b);
        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
        assert_eq_m256(r, e);
    }
4602
    // vptest / vtestpd / vtestps predicate tests, each returning 0 or 1:
    //   testz  -> 1 iff (a AND b) is all zero,
    //   testc  -> 1 iff (NOT a AND b) is all zero,
    //   testnzc -> 1 iff both ZF and CF of the test would be clear.
    // The floating-point variants inspect only the sign bits of each lane,
    // which is why negative values (-1.) are used to set bits.

    #[simd_test(enable = "avx")]
    const fn test_mm256_testz_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testz_si256(a, b);
        // a AND b has common bits set -> 0.
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_testc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 1);
        let a = _mm256_setr_epi64x(0, 0, 0, 0);
        let b = _mm256_setr_epi64x(0, 0, 0, 0);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_testz_pd() {
        // All-positive values: no sign bits set, so the AND of sign bits is zero.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(-1.);
        let r = _mm256_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_testc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(-1.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm256_setr_pd(1., -1., -1., -1.);
        let b = _mm256_setr_pd(-1., -1., 1., 1.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_testz_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(-1.);
        let r = _mm_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_testc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(-1.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    fn test_mm_testnzc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm_setr_pd(1., -1.);
        let b = _mm_setr_pd(-1., -1.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_testz_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm256_set1_ps(-1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_testc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm256_set1_ps(-1.);
        let r = _mm256_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
        let r = _mm256_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_testz_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm_set1_ps(-1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm_testc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm_set1_ps(-1.);
        let r = _mm_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    fn test_mm_testnzc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm_setr_ps(1., -1., -1., -1.);
        let b = _mm_setr_ps(-1., -1., 1., 1.);
        let r = _mm_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }
4768
    // movemask gathers the sign bit of each lane into an integer; negative
    // inputs at lanes 1 and 3 (and 5, 7 for ps) yield the 0b1010 patterns
    // asserted below.

    #[simd_test(enable = "avx")]
    const fn test_mm256_movemask_pd() {
        let a = _mm256_setr_pd(1., -2., 3., -4.);
        let r = _mm256_movemask_pd(a);
        // Lanes 1 and 3 are negative -> bits 1 and 3 set -> 0b1010.
        assert_eq!(r, 0xA);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_movemask_ps() {
        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
        let r = _mm256_movemask_ps(a);
        assert_eq!(r, 0xAA);
    }
4782
    // Constructor tests: `setzero` zero-fills, `set` takes its arguments from
    // the highest lane down (hence comparing against the reversed `setr`),
    // `setr` takes them in memory (lowest-lane-first) order, and `set1`
    // broadcasts a single scalar to every lane.

    #[simd_test(enable = "avx")]
    const fn test_mm256_setzero_pd() {
        let r = _mm256_setzero_pd();
        assert_eq_m256d(r, _mm256_set1_pd(0.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setzero_ps() {
        let r = _mm256_setzero_ps();
        assert_eq_m256(r, _mm256_set1_ps(0.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setzero_si256() {
        let r = _mm256_setzero_si256();
        assert_eq_m256i(r, _mm256_set1_epi8(0));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_pd() {
        // `set` is highest-lane-first, so it equals the reversed `setr`.
        let r = _mm256_set_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_ps() {
        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi8() {
        #[rustfmt::skip]
        let r = _mm256_set_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 31, 30, 29, 28, 27, 26, 25,
            24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi16() {
        #[rustfmt::skip]
        let r = _mm256_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi32() {
        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi64x() {
        let r = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_pd() {
        let r = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_ps() {
        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi16() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi32() {
        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi64x() {
        let r = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_pd() {
        let r = _mm256_set1_pd(1.);
        assert_eq_m256d(r, _mm256_set1_pd(1.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_ps() {
        let r = _mm256_set1_ps(1.);
        assert_eq_m256(r, _mm256_set1_ps(1.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi8() {
        let r = _mm256_set1_epi8(1);
        assert_eq_m256i(r, _mm256_set1_epi8(1));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi16() {
        let r = _mm256_set1_epi16(1);
        assert_eq_m256i(r, _mm256_set1_epi16(1));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi32() {
        let r = _mm256_set1_epi32(1);
        assert_eq_m256i(r, _mm256_set1_epi32(1));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi64x() {
        let r = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, _mm256_set1_epi64x(1));
    }
4953
    // Bit-level casts. The `cast*` intrinsics reinterpret the same 256 bits
    // as another element type (e.g. f64 1.0 = 0x3FF0_0000_0000_0000 reads
    // back as the f32 pair (0.0, 1.875)); the 256->128 casts keep the low
    // half, and the `zext` variants widen 128->256 with a zeroed upper half.

    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd_ps() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_ps(a);
        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castps_pd() {
        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        let r = _mm256_castps_pd(a);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castps_si256() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps_si256(a);
        // Little-endian byte view of the f32 bit patterns.
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_ps() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        let r = _mm256_castsi256_ps(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd_si256() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_si256(a);
        // The cast is a pure bit reinterpretation, so transmuting back must
        // reproduce the original vector.
        assert_eq_m256d(unsafe { transmute(r) }, a);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_pd() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_pd(a);
        assert_eq_m256d(r, unsafe { transmute(a) });
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castps256_ps128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps256_ps128(a);
        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd256_pd128() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd256_pd128(a);
        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_si128() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_si128(a);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_castps128_ps256(a);
        // Only the low half is defined, so only that half is checked.
        assert_eq_m128(_mm256_castps256_ps128(r), a);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_castpd128_pd256(a);
        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi128_si256() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm256_castsi128_si256(a);
        assert_eq_m128i(_mm256_castsi256_si128(r), a);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_zextps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_zextps128_ps256(a);
        // Unlike the plain cast, the upper half is guaranteed zero.
        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_zextsi128_si256() {
        let a = _mm_setr_epi64x(1, 2);
        let r = _mm256_zextsi128_si256(a);
        let e = _mm256_setr_epi64x(1, 2, 0, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_zextpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_zextpd128_pd256(a);
        let e = _mm256_setr_pd(1., 2., 0., 0.);
        assert_eq_m256d(r, e);
    }
5077
    // 128->256 combiners: `set_m128*` takes (hi, lo) while `setr_m128*`
    // takes (lo, hi); both place `lo` in the low 128 bits of the result.

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128() {
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_set_m128(hi, lo);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128d() {
        let hi = _mm_setr_pd(3., 4.);
        let lo = _mm_setr_pd(1., 2.);
        let r = _mm256_set_m128d(hi, lo);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20,
            21, 22, 23, 24,
            25, 26, 27, 28,
            29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm256_set_m128i(hi, lo);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128() {
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let r = _mm256_setr_m128(lo, hi);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128d() {
        let lo = _mm_setr_pd(1., 2.);
        let hi = _mm_setr_pd(3., 4.);
        let r = _mm256_setr_m128d(lo, hi);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128i() {
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_setr_m128i(lo, hi);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5165
    // `loadu2_m128*`: two independent unaligned 128-bit loads combined into
    // one 256-bit vector; the first pointer fills the high half, the second
    // the low half.

    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu2_m128() {
        let hi = &[5., 6., 7., 8.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2., 3., 4.];
        let loaddr = lo.as_ptr();
        let r = unsafe { _mm256_loadu2_m128(hiaddr, loaddr) };
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu2_m128d() {
        let hi = &[3., 4.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2.];
        let loaddr = lo.as_ptr();
        let r = unsafe { _mm256_loadu2_m128d(hiaddr, loaddr) };
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu2_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = unsafe {
            _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _)
        };
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5212
    // `storeu2_m128*`: the 256-bit source is split and stored unaligned to
    // two destinations — high half to the first pointer, low half to the
    // second.

    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu2_m128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut hi = _mm_undefined_ps();
        let mut lo = _mm_undefined_ps();
        unsafe {
            _mm256_storeu2_m128(
                ptr::addr_of_mut!(hi) as *mut f32,
                ptr::addr_of_mut!(lo) as *mut f32,
                a,
            );
        }
        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu2_m128d() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut hi = _mm_undefined_pd();
        let mut lo = _mm_undefined_pd();
        unsafe {
            _mm256_storeu2_m128d(
                ptr::addr_of_mut!(hi) as *mut f64,
                ptr::addr_of_mut!(lo) as *mut f64,
                a,
            );
        }
        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
    }

    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu2_m128i() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let mut hi = _mm_undefined_si128();
        let mut lo = _mm_undefined_si128();
        unsafe {
            _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
        }
        #[rustfmt::skip]
        let e_hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );
        #[rustfmt::skip]
        let e_lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16
        );

        assert_eq_m128i(hi, e_hi);
        assert_eq_m128i(lo, e_lo);
    }
5273
    // cvtss_f32 extracts the lowest f32 lane as a scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtss_f32() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_cvtss_f32(a);
        assert_eq!(r, 1.);
    }
5280}