volk_avx_intrinsics.h

/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold composite AVX intrinsics: small helper
 * functions built from AVX intrinsics. They should be used in VOLK
 * kernels to avoid copy-pasted code.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Approximate arctan(x) via polynomial expansion
 * on the interval [-1, 1]
 *
 * Maximum relative error ~6.5e-7
 * Polynomial evaluated via Horner's method
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x_times_x = _mm256_mul_ps(x, x);
    __m256 arctan;
    arctan = a13;
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a11);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a9);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a7);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a5);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a3);
    arctan = _mm256_mul_ps(x_times_x, arctan);
    arctan = _mm256_add_ps(arctan, a1);
    arctan = _mm256_mul_ps(x, arctan);

    return arctan;
}
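
/*
 * Usage sketch (editor's illustration, not part of the upstream header;
 * the helper name is hypothetical): the polynomial above is only valid on
 * [-1, 1]. A full-range arctan can be built per lane with the identity
 * arctan(x) = pi/2 - arctan(1/x) for x > 1.
 */
static inline __m256 example_arctan_full_range_ps(const __m256 x)
{
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);

    const __m256 x_abs = _mm256_and_ps(x, abs_mask);
    const __m256 x_sign = _mm256_and_ps(x, sign_mask);
    // lanes with |x| > 1 evaluate the polynomial at 1/|x| instead
    const __m256 swap = _mm256_cmp_ps(x_abs, one, _CMP_GT_OS);
    const __m256 x_arg = _mm256_blendv_ps(x_abs, _mm256_div_ps(one, x_abs), swap);
    __m256 result = _m256_arctan_poly_avx(x_arg);
    // for swapped lanes: arctan(|x|) = pi/2 - arctan(1/|x|)
    result = _mm256_blendv_ps(result, _mm256_sub_ps(pi_over_2, result), swap);
    // restore the sign of x
    return _mm256_or_ps(result, x_sign);
}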

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);        // Load yl with cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y);        // Load yh with ci,ci,di,di ...
    tmp1 = _mm256_mul_ps(x, yl);       // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
    tmp2 = _mm256_mul_ps(x, yh);       // tmp2 = ai*ci,ar*ci,bi*di,br*di

    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    return _mm256_addsub_ps(tmp1, tmp2);
}
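
/*
 * Usage sketch (editor's illustration, not part of the upstream header;
 * the function and pointer names are hypothetical): multiplying two
 * arrays of interleaved complex floats, 4 complex values per iteration.
 */
static inline void example_complexmul_loop(float* out,
                                           const float* in_a,
                                           const float* in_b,
                                           unsigned int num_complex)
{
    unsigned int n = 0;
    // 4 complex values = 8 floats per __m256
    for (; n + 4 <= num_complex; n += 4) {
        const __m256 a = _mm256_loadu_ps(in_a + 2 * n);
        const __m256 b = _mm256_loadu_ps(in_b + 2 * n);
        _mm256_storeu_ps(out + 2 * n, _mm256_complexmul_ps(a, b));
    }
    // the remaining (num_complex % 4) values would be handled scalar
}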

static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator); // flip the sign bit of each imaginary part
}

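/*
 * Computes x * conj(y) for interleaved complex floats:
 * (a + bi) * (c - di) = (ac + bd) + (bc - ad)i
 */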
static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    const __m256 nswap = _mm256_permute_ps(x, 0xb1);
    const __m256 dreal = _mm256_moveldup_ps(y);
    const __m256 dimag = _mm256_movehdup_ps(y);

    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
    const __m256 multreal = _mm256_mul_ps(x, dreal);
    const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
    return _mm256_add_ps(multreal, multimag);
}

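/*
 * Scales each interleaved complex float in 'val' to unit magnitude:
 * (re, im) / sqrt(re^2 + im^2).
 */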
static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 tmp1 = _mm256_mul_ps(val, val);
    tmp1 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
    tmp1 = _mm256_sqrt_ps(tmp1);
    return _mm256_div_ps(val, tmp1);
}

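/*
 * Computes re^2 + im^2 for 8 interleaved complex floats spread across
 * two registers; the 8 results come out packed in order.
 */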
static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
    return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
}

static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
}
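
/*
 * Usage sketch (editor's illustration; names are hypothetical): computing
 * |z| for an array of interleaved complex floats, 8 complex values
 * (16 floats) per iteration. The two input registers cover 8 consecutive
 * complex values, and the results come out packed in order.
 */
static inline void example_magnitude_loop(float* magnitude,
                                          const float* cplx,
                                          unsigned int num_points)
{
    unsigned int n = 0;
    for (; n + 8 <= num_points; n += 8) {
        const __m256 c1 = _mm256_loadu_ps(cplx + 2 * n);     // values 0..3
        const __m256 c2 = _mm256_loadu_ps(cplx + 2 * n + 8); // values 4..7
        _mm256_storeu_ps(magnitude + n, _mm256_magnitude_ps(c1, c2));
    }
}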

static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
                                                const __m256 symbols1,
                                                const __m256 points0,
                                                const __m256 points1,
                                                const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
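
/*
 * Usage sketch (editor's illustration; names are hypothetical): per-symbol
 * scaled squared Euclidean distance between received symbols and reference
 * points, 8 complex symbols (16 floats) per iteration.
 */
static inline void example_scaled_norm_dist_loop(float* out,
                                                 const float* symbols,
                                                 const float* points,
                                                 float snr_lin,
                                                 unsigned int num_symbols)
{
    const __m256 scalar = _mm256_set1_ps(snr_lin);
    unsigned int n = 0;
    for (; n + 8 <= num_symbols; n += 8) {
        const __m256 s0 = _mm256_loadu_ps(symbols + 2 * n);
        const __m256 s1 = _mm256_loadu_ps(symbols + 2 * n + 8);
        const __m256 p0 = _mm256_loadu_ps(points + 2 * n);
        const __m256 p1 = _mm256_loadu_ps(points + 2 * n + 8);
        _mm256_storeu_ps(out + n,
                         _mm256_scaled_norm_dist_ps(s0, s1, p0, p1, scalar));
    }
}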

/*
 * Expands the low 8 bytes of 'fbits' into a float sign mask: 32-bit lane i
 * is 0x80000000 if byte i of 'fbits' is > 0, otherwise 0x00000000.
 */
static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    __m256 sign_mask_dummy = _mm256_setzero_ps();
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m128i shuffle_mask0 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01,
        0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
    const __m128i shuffle_mask1 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
    __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);

    __m256 sign_mask =
        _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
    return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
    // The single call below is the desired replacement, but it is missing
    // from older GCC versions.
    // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
    // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
    //                        _mm_castsi128_ps(sign_bits0));
}

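/*
 * Splits two registers of interleaved values into their even-indexed
 * (llr0) and odd-indexed (llr1) elements, order preserved.
 */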
static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // deinterleave values
    __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
    __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
    *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
}

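/*
 * Polar decoder f-function, min-sum approximation on deinterleaved
 * LLR pairs: sign(llr0) * sign(llr1) * min(|llr0|, |llr1|).
 */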
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    __m256 sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}

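/*
 * Polar decoder g-function on deinterleaved LLR pairs: llr1 + llr0,
 * with the sign of llr0 flipped where the corresponding byte in
 * 'fbits' is set (see _mm256_polar_sign_mask above).
 */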
static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}

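/*
 * One vectorized accumulation step for a running sum of squared
 * deviations (e.g. for variance/stddev-style updates):
 * returns sq_acc + ((aux * val) - acc)^2 * rec
 */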
static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    aux = _mm256_mul_ps(aux, val);
    aux = _mm256_sub_ps(aux, acc);
    aux = _mm256_mul_ps(aux, aux);
    aux = _mm256_mul_ps(aux, rec);
    return _mm256_add_ps(sq_acc, aux);
}

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */