#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
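
/*
 * volk_32fc_s32f_atan2_32f: for each complex input sample compute the phase
 * atan2(imag, real), scaled by 1/normalizeFactor:
 *
 *   out[i] = atan2f(cimagf(in[i]), crealf(in[i])) / normalizeFactor
 *
 * Minimal usage sketch (buffer names are ours; the _a_ kernels below also
 * require 32-byte-aligned pointers, e.g. from volk_malloc):
 *
 *   unsigned int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 *   float* out = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
 *   // ... fill in[0..N-1] with complex samples ...
 *   volk_32fc_s32f_atan2_32f(out, in, 1.f, N); // dispatcher picks the best kernel
 *   volk_free(in);
 *   volk_free(out);
 */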

#include <math.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
                                                    const lv_32fc_t* inputVector,
                                                    const float normalizeFactor,
                                                    unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    /* multiply by the reciprocal once instead of dividing every sample */
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float real = *inPtr++;
        const float imag = *inPtr++;
        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
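
/*
 * The variant below replaces libm's atan2f() with VOLK's polynomial
 * approximation volk_atan2() (expected from volk/volk_common.h), trading a
 * little accuracy for speed. It also serves as the scalar tail handler for
 * the SIMD kernels further down.
 */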
#ifdef LV_HAVE_GENERIC
#include <volk/volk_common.h> /* volk_atan2() */

static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
                                                       const lv_32fc_t* inputVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float x = *inPtr++;
        const float y = *inPtr++;
        *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
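
/*
 * All four AVX2 kernels below implement the same branchless range reduction.
 * The scalar sketch here is ours, for illustration only (it is not part of
 * the VOLK API); atanf() stands in for the vectorized polynomial:
 */
static inline float atan2_branchless_sketch(float y, float x)
{
    const float pi = 0x1.921fb6p1f;   /* pi rounded to float */
    const float pi_2 = 0x1.921fb6p0f; /* pi/2 rounded to float */
    /* octant reduction: keep the ratio fed to atan inside [-1, 1] */
    const int swap = fabsf(y) > fabsf(x);
    const float input = swap ? (x / y) : (y / x);
    float result = atanf(input);
    /* if swapped, recover atan(y/x) = copysign(pi/2, input) - atan(x/y) */
    if (swap)
        result = copysignf(pi_2, input) - result;
    /* quadrant fixup: x < 0 lands in the wrong half-plane; rotate by +/- pi */
    if (x < 0.f)
        result += copysignf(pi, y);
    return result;
}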

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h> /* _m256_arctan_poly_avx2_fma() */

static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;
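
    /* pi and pi/2 as float hex literals below; abs_mask clears the IEEE sign
     * bit of a float, sign_mask isolates it */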
    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        /* deinterleave the 8 complex samples into real parts (x) and
         * imaginary parts (y), kept in sample order */
        const __m256i deint_idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
        __m256 t1 = _mm256_permutevar8x32_ps(z1, deint_idx);
        __m256 t2 = _mm256_permutevar8x32_ps(z2, deint_idx);
        __m256 x = _mm256_permute2f128_ps(t1, t2, 0x20);
        __m256 y = _mm256_permute2f128_ps(t1, t2, 0x31);

        /* octant reduction: divide the smaller magnitude by the larger so
         * the polynomial input stays in [-1, 1] */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(input);
        /* where swapped: atan(y/x) = copysign(pi/2, input) - atan(x/y) */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* all-ones where x < 0; there the quadrant fixup adds copysign(pi, y) */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
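
/*
 * Same algorithm without FMA: the only expected difference is the arctan
 * polynomial helper, which uses separate multiply/add steps and may round
 * slightly differently.
 */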
#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h> /* _m256_arctan_poly_avx() */

static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        const __m256i deint_idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
        __m256 t1 = _mm256_permutevar8x32_ps(z1, deint_idx);
        __m256 t2 = _mm256_permutevar8x32_ps(z2, deint_idx);
        __m256 x = _mm256_permute2f128_ps(t1, t2, 0x20);
        __m256 y = _mm256_permute2f128_ps(t1, t2, 0x31);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx(input);
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H
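
/*
 * Unaligned variants: identical math to the kernels above, but using
 * _mm256_loadu_ps/_mm256_storeu_ps so callers need not provide
 * 32-byte-aligned buffers.
 */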
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        const __m256i deint_idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
        __m256 t1 = _mm256_permutevar8x32_ps(z1, deint_idx);
        __m256 t2 = _mm256_permutevar8x32_ps(z2, deint_idx);
        __m256 x = _mm256_permute2f128_ps(t1, t2, 0x20);
        __m256 y = _mm256_permute2f128_ps(t1, t2, 0x31);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(input);
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        const __m256i deint_idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
        __m256 t1 = _mm256_permutevar8x32_ps(z1, deint_idx);
        __m256 t2 = _mm256_permutevar8x32_ps(z2, deint_idx);
        __m256 x = _mm256_permute2f128_ps(t1, t2, 0x20);
        __m256 y = _mm256_permute2f128_ps(t1, t2, 0x31);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 result = _m256_arctan_poly_avx(input);
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */