Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_atan2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H

#include <math.h>

#ifdef LV_HAVE_GENERIC
/* Scalar reference implementation: one libm atan2f() call per complex point,
 * scaled by 1 / normalizeFactor. */
static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
                                                    const lv_32fc_t* inputVector,
                                                    const float normalizeFactor,
                                                    unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float real = *inPtr++;
        const float imag = *inPtr++;
        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC
#include <volk/volk_common.h>
/* Scalar variant that replaces the libm call with the volk_atan2()
 * polynomial approximation pulled in through volk/volk_common.h. */
static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
                                                       const lv_32fc_t* inputVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float x = *inPtr++;
        const float y = *inPtr++;
        *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
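
/*!
 * The volk_atan2() helper used above trades the libm call for a polynomial
 * approximation of the arctangent on an argument reduced to [-1, 1]. As a
 * rough illustration of the idea only -- the formula below is a generic fast
 * approximation, not the polynomial VOLK actually ships -- an arctangent on
 * the reduced range can be approximated like this:
 *
 * \code
 * // Illustrative sketch, assumes |z| <= 1; worst-case error is a couple of
 * // milliradians, far cruder than the kernel's real polynomial.
 * static inline float example_atan_reduced(float z)
 * {
 *     const float az = fabsf(z);
 *     return 0x1.921fb6p-1f * z + z * (1.f - az) * (0.2447f + 0.0663f * az);
 * }
 * \endcode
 *
 * The SIMD kernels below apply the same recipe eight lanes at a time and use
 * bit masks instead of branches for the octant and quadrant corrections.
 */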

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);   /* pi   */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); /* pi/2 */
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        /* Deinterleave eight complex values into real (x) and imaginary (y) parts. */
        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        /* Range reduction: where |y| > |x|, swap numerator and denominator so the
         * ratio handed to the polynomial stays within [-1, 1]. */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        /* Undo the swap: atan(y/x) = copysign(pi/2, x/y) - atan(x/y). */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* All-ones lanes where the real part has its sign bit set. */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        /* Quadrant correction: for negative x, add pi carrying the sign of y. */
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    /* Finish the remaining (num_points % 8) points with the scalar polynomial kernel. */
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
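
/*!
 * For reference, the mask-and-blend arithmetic inside the loop above is a
 * branch-free rendering of the following scalar logic. This is only a sketch
 * for readability: the helper name is made up, and atanf() stands in for the
 * vector polynomial _m256_arctan_poly_avx2_fma().
 *
 * \code
 * #include <math.h>
 *
 * static inline float example_atan2_scalar(float y, float x)
 * {
 *     const int swap = fabsf(y) > fabsf(x);     // swap_mask
 *     const float q = swap ? x / y : y / x;     // ratio kept inside [-1, 1]
 *     float r = atanf(q);                       // the kernel uses the polynomial here
 *     if (swap)                                 // undo the swap:
 *         r = copysignf(0x1.921fb6p0f, q) - r;  //   atan(y/x) = +/-pi/2 - atan(x/y)
 *     if (signbit(x))                           // left half-plane:
 *         r += copysignf(0x1.921fb6p1f, y);     //   add pi with the sign of y
 *     return r;
 * }
 * \endcode
 *
 * Doing this with compare/blend and sign-mask operations keeps all eight lanes
 * on the same instruction stream, which is what makes the vectorization pay off.
 */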

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
/* Same algorithm as the AVX2+FMA kernel above, but the polynomial is evaluated
 * without fused multiply-adds. */
static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);   /* pi   */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); /* pi/2 */
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    /* Finish the remaining points with the scalar polynomial kernel. */
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for aligned */
#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
/* Unaligned-memory counterpart of the _a_avx2_fma kernel above. */
static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);   /* pi   */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); /* pi/2 */
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    /* Finish the remaining points with the scalar polynomial kernel. */
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
/* Unaligned-memory counterpart of the _a_avx2 kernel above. */
static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);   /* pi   */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); /* pi/2 */
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    /* Finish the remaining points with the scalar polynomial kernel. */
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */
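
/*!
 * A possible calling sequence through the VOLK dispatcher (a sketch; it assumes
 * the standard volk_malloc()/volk_get_alignment() helpers and omits error
 * handling):
 *
 * \code
 * #include <volk/volk.h>
 *
 * unsigned int N = 1024;
 * size_t alignment = volk_get_alignment();
 * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 * float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *
 * // ... fill in[] with complex samples ...
 *
 * // Phase of each sample; dividing by pi maps the result into (-1, 1].
 * const float normalizeFactor = 0x1.921fb6p1f; // pi
 * volk_32fc_s32f_atan2_32f(out, in, normalizeFactor, N);
 *
 * volk_free(in);
 * volk_free(out);
 * \endcode
 */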