Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_avx2_intrinsics.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold common AVX2 intrinsic helper functions.
 * They should be used in VOLK kernels to avoid copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk_avx_intrinsics.h" /* needed for _mm256_polar_deinterleave used below */
#include <immintrin.h>

/* Extract the real parts of two interleaved complex-float vectors into one vector */
static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 r = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(2, 0, 2, 0));
    return _mm256_permutevar8x32_ps(r, permute_mask);
}

/* Extract the imaginary parts of two interleaved complex-float vectors into one vector */
static inline __m256 _mm256_imag(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 i = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm256_permutevar8x32_ps(i, permute_mask);
}

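/*
 * Illustrative usage sketch (not part of the VOLK API): deinterleave eight
 * complex floats, stored as (re, im) pairs, into separate real and imaginary
 * vectors. The function name and the unaligned loads are assumptions made for
 * this example only.
 */
static inline void example_avx2_deinterleave(const float* complex_in, /* 8 (re, im) pairs */
                                             __m256* real_out,
                                             __m256* imag_out)
{
    __m256 z1 = _mm256_loadu_ps(complex_in);     /* re0 im0 re1 im1 re2 im2 re3 im3 */
    __m256 z2 = _mm256_loadu_ps(complex_in + 8); /* re4 im4 re5 im5 re6 im6 re7 im7 */
    *real_out = _mm256_real(z1, z2); /* re0 re1 ... re7 in natural order */
    *imag_out = _mm256_imag(z1, z2); /* im0 im1 ... im7 in natural order */
}
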
/*
 * Expand the 8 flag bytes in fbits into a 256 bit mask holding 0x80000000 for
 * every byte that is > 0 and 0x00000000 otherwise. XORing a float vector with
 * this mask flips the sign of the flagged elements.
 */
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                  0xff, 0xff, 0xff, 0x01,
                                                  0xff, 0xff, 0xff, 0x02,
                                                  0xff, 0xff, 0xff, 0x03,
                                                  0xff, 0xff, 0xff, 0x04,
                                                  0xff, 0xff, 0xff, 0x05,
                                                  0xff, 0xff, 0xff, 0x06,
                                                  0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}

static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}

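/*
 * Scalar reference for _mm256_polar_fsign_add_llrs_avx2 (illustrative sketch,
 * not part of VOLK): after the two input vectors have been deinterleaved into
 * llr0 and llr1, the sign of llr0[i] is flipped whenever fbits[i] > 0 and
 * llr1[i] is added. The function name and argument layout are hypothetical.
 */
static inline void example_fsign_add_llrs_scalar(float* dst,
                                                 const float* llr0,
                                                 const float* llr1,
                                                 const unsigned char* fbits)
{
    for (int i = 0; i < 8; ++i) {
        /* XOR with 0x80000000 in the vector code corresponds to negation here */
        const float flipped = ((signed char)fbits[i] > 0) ? -llr0[i] : llr0[i];
        dst[i] = flipped + llr1[i];
    }
}
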
static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    return _mm256_permutevar8x32_ps(complex_result, idx);
}

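/*
 * Illustrative usage sketch (not part of the VOLK API): compute |z|^2 for
 * eight complex floats stored as interleaved (re, im) pairs. The function
 * name and the unaligned loads are assumptions for this example.
 */
static inline __m256 example_magnitude_squared(const float* complex_in) /* 16 floats */
{
    __m256 c0 = _mm256_loadu_ps(complex_in);     /* samples 0..3 */
    __m256 c1 = _mm256_loadu_ps(complex_in + 8); /* samples 4..7 */
    /* element i of the result holds re_i^2 + im_i^2, in natural order 0..7 */
    return _mm256_magnitudesquared_ps_avx2(c0, c1);
}
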
static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}

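/*
 * Scalar reference for _mm256_scaled_norm_dist_ps_avx2 (illustrative sketch,
 * not part of VOLK): for each of the eight symbol/point pairs the result is
 * |y_i - x_i|^2 * scalar. Names and buffer layout below are hypothetical.
 */
static inline void example_scaled_norm_dist_scalar(float* out,
                                                   const float* symbols, /* 8 (re, im) pairs */
                                                   const float* points,  /* 8 (re, im) pairs */
                                                   const float scalar)
{
    for (int i = 0; i < 8; ++i) {
        const float dre = symbols[2 * i] - points[2 * i];
        const float dim = symbols[2 * i + 1] - points[2 * i + 1];
        out[i] = (dre * dre + dim * dim) * scalar;
    }
}
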
/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) computes
     * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an "ordered" comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for
     * each element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

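/*
 * Illustrative usage sketch (not part of the VOLK API): drive
 * vector_32fc_index_max_variant0 over a buffer of interleaved complex floats.
 * Eight independent per-lane maxima are accumulated; see the reduction sketch
 * near the end of this file for turning them into a single result. The
 * function name and the unaligned loads are assumptions for this example.
 */
static inline void example_index_max_loop(const float* src, /* num_points (re, im) pairs */
                                          unsigned num_points,
                                          __m256* max_values,
                                          __m256i* max_indices)
{
    *max_values = _mm256_setzero_ps();
    *max_indices = _mm256_setzero_si256();
    /* matches the (7, 6, 3, 2, 5, 4, 1, 0) output order of hadd_ps, see above */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256i indices_increment = _mm256_set1_epi32(8);

    for (unsigned i = 0; i < num_points / 8; ++i) {
        __m256 in0 = _mm256_loadu_ps(src + 16 * i);     /* complex samples 8i .. 8i+3 */
        __m256 in1 = _mm256_loadu_ps(src + 16 * i + 8); /* complex samples 8i+4 .. 8i+7 */
        vector_32fc_index_max_variant0(
            in0, in1, max_values, max_indices, &current_indices, indices_increment);
    }
}
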
/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead of
     * blendvps is faster on the Intel CPUs tested.
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the "ordered" comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8] = {FLT_MAX};
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) computes
     * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an "ordered" comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) computes mask ? b : a for
     * each element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead of
     * blendvps is faster on the Intel CPUs tested.
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the "ordered" comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

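/*
 * Illustrative reduction sketch (not part of the VOLK API): after the
 * accumulation loop, the eight per-lane maxima and their indices are stored
 * and scanned to obtain the single overall maximum. The same pattern applies
 * to the minimum variants. The function name and tie-breaking behavior are
 * assumptions for this example.
 */
static inline unsigned example_index_max_reduce(__m256 max_values, __m256i max_indices)
{
    float values[8];
    unsigned indices[8];
    _mm256_storeu_ps(values, max_values);
    _mm256_storeu_si256((__m256i*)indices, max_indices);

    float best_value = values[0];
    unsigned best_index = indices[0];
    for (int i = 1; i < 8; ++i) {
        if (values[i] > best_value) {
            best_value = values[i];
            best_index = indices[i];
        }
    }
    return best_index;
}
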
#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */