#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_

#include <immintrin.h>

/*
 * Approximate arctan(x) on [-1, 1] with a degree-13 odd polynomial,
 * evaluated via Horner's method.
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x_times_x = _mm256_mul_ps(x, x);

    /* Horner's method, seeded with the highest-order coefficient */
    __m256 arctan = a13;
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a11);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a9);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a7);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a5);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a3);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a1);
    arctan = _mm256_mul_ps(x, arctan);

    return arctan;
}
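
/*
 * Illustrative sketch only, not part of this header's API: evaluating the
 * polynomial for eight inputs that are already range-reduced to [-1, 1].
 * The name example_atan8 and the unaligned load/store are assumptions made
 * purely for this example.
 */
static inline void example_atan8(const float* in, float* out)
{
    const __m256 x = _mm256_loadu_ps(in);      // eight floats with |x| <= 1
    const __m256 y = _m256_arctan_poly_avx(x); // approximate arctan(x)
    _mm256_storeu_ps(out, y);
}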

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);        // yl = cr, cr, dr, dr, ...
    yh = _mm256_movehdup_ps(y);        // yh = ci, ci, di, di, ...
    tmp1 = _mm256_mul_ps(x, yl);       // tmp1 = ar*cr, ai*cr, br*dr, bi*dr, ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // x = ai, ar, bi, br, ...
    tmp2 = _mm256_mul_ps(x, yh);       // tmp2 = ai*ci, ar*ci, bi*di, br*di, ...
    // ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di, ...
    return _mm256_addsub_ps(tmp1, tmp2);
}
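
/*
 * Illustrative sketch only, not part of this header's API: multiplying two
 * buffers of interleaved complex floats four samples at a time. The name
 * example_cmul4 and the unaligned load/store are assumptions for the example.
 */
static inline void example_cmul4(const float* a, const float* b, float* out)
{
    const __m256 x = _mm256_loadu_ps(a); // ar, ai, br, bi, cr, ci, dr, di
    const __m256 y = _mm256_loadu_ps(b);
    _mm256_storeu_ps(out, _mm256_complexmul_ps(x, y));
}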

static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    // Flip the sign bit of every odd-indexed (imaginary) lane
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator);
}

static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    const __m256 nswap = _mm256_permute_ps(x, 0xb1); // swap re/im in each x pair
    const __m256 dreal = _mm256_moveldup_ps(y);      // duplicate re(y)
    const __m256 dimag = _mm256_movehdup_ps(y);      // duplicate im(y)

    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
    const __m256 multreal = _mm256_mul_ps(x, dreal);
    const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
    return _mm256_add_ps(multreal, multimag); // x * conj(y)
}

static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 tmp1 = _mm256_mul_ps(val, val);
    tmp1 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0));
    tmp1 = _mm256_sqrt_ps(tmp1); // per-sample magnitude, duplicated for re and im
    return _mm256_div_ps(val, tmp1);
}
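
/*
 * Illustrative sketch only, not part of this header's API: scaling four
 * interleaved complex floats to unit magnitude in place. The name
 * example_normalize4 is an assumption for the example.
 */
static inline void example_normalize4(float* inout)
{
    const __m256 v = _mm256_loadu_ps(inout); // four complex samples
    _mm256_storeu_ps(inout, _mm256_normalize_ps(v));
}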

static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
    return _mm256_hadd_ps(complex1, complex2); // add the I^2 and Q^2 values
}
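
/*
 * Illustrative sketch only, not part of this header's API: computing |z|^2
 * for eight interleaved complex floats. The name example_mag_sq8 is an
 * assumption for the example.
 */
static inline void example_mag_sq8(const float* in, float* out)
{
    const __m256 c0 = _mm256_loadu_ps(in);     // first four complex samples
    const __m256 c1 = _mm256_loadu_ps(in + 8); // next four complex samples
    _mm256_storeu_ps(out, _mm256_magnitudesquared_ps(c0, c1));
}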

static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
                                                const __m256 symbols1,
                                                const __m256 points0,
                                                const __m256 points1,
                                                const __m256 scalar)
{
    // Calculate |symbols - points|^2 * scalar, treating the inputs as complex floats
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}

__m256 sign_mask_dummy = _mm256_setzero_ps();

static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // Split the interleaved even/odd elements of src0 and src1 into llr0 and llr1
    __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
    __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
    *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
}

static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // min-sum: sign(llr0) * sign(llr1) * min(|llr0|, |llr1|)
    __m256 sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}
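
/*
 * Illustrative scalar reference only, not part of this header's API: the
 * min-sum combination of two LLRs that the vector routine above applies
 * lane-wise. The name example_minsum_scalar is an assumption for the example.
 */
static inline float example_minsum_scalar(float a, float b)
{
    const float abs_a = (a < 0.0f) ? -a : a;
    const float abs_b = (b < 0.0f) ? -b : b;
    const float mag = (abs_a < abs_b) ? abs_a : abs_b;
    return ((a < 0.0f) != (b < 0.0f)) ? -mag : mag;
}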

static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
{
    // Apply the frozen-bit sign mask to llr0, then add the LLR pairs
    __m256 sign_mask = _mm256_polar_sign_mask(fbits);
    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}

static inline __m256 _mm256_accumulate_square_sum_avx(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    // Accumulate rec * (aux * val - acc)^2 onto sq_acc
    aux = _mm256_mul_ps(aux, val);
    aux = _mm256_sub_ps(aux, acc);
    aux = _mm256_mul_ps(aux, aux);
    aux = _mm256_mul_ps(aux, rec);
    return _mm256_add_ps(sq_acc, aux);
}

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */