65#ifndef INCLUDED_volk_32f_acos_32f_a_H
66#define INCLUDED_volk_32f_acos_32f_a_H
68#if LV_HAVE_AVX2 && LV_HAVE_FMA
/*
 * AVX2 + FMA arccosine kernel, aligned variant.
 *
 * Reads floats through aVector and writes arccosine results through bVector.
 * The vector loop runs num_points / 8 times using 256-bit registers with
 * aligned load/store intrinsics; the trailing scalar loop covers the
 * remaining num_points % 8 elements.
 *
 * NOTE(review): this listing has gaps. The aVector parameter, the opening
 * brace, the declarations of i and j, several assignments (z, d, y,
 * arccosine), the pointer advance and the closing braces are not on any
 * visible line. Comments below describe only what the visible lines show.
 */
71static inline void volk_32f_acos_32f_a_avx2_fma(
float* bVector,
73 unsigned int num_points)
/* Cursors over the output and input buffers. */
75 float* bPtr = bVector;
76 const float* aPtr = aVector;
/* Number of full 8-float groups handled by the vector loop. */
78 unsigned int number = 0;
79 unsigned int eighthPoints = num_points / 8;
82 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
83 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Constants broadcast to every lane: pi, pi/2, 0, 1, 2 and 4. */
85 pi = _mm256_set1_ps(3.14159265358979323846);
86 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87 fzeroes = _mm256_setzero_ps();
88 fones = _mm256_set1_ps(1.0);
89 ftwos = _mm256_set1_ps(2.0);
90 ffours = _mm256_set1_ps(4.0);
92 for (; number < eighthPoints; number++) {
/* Aligned load: aPtr must be 32-byte aligned here. */
93 aVal = _mm256_load_ps(aPtr);
/*
 * Numerator is sqrt((1 + a) * (1 - a)); the divisor argument of this
 * division is on a line not visible in this excerpt.
 */
95 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96 _mm256_sub_ps(fones, aVal))),
/*
 * Lanes with z < 0 get an all-ones mask; subtracting (2 * z) in those lanes
 * flips their sign, leaving z non-negative. Where z is first assigned is
 * not visible here.
 */
99 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/*
 * Mask of lanes with z < 1. The next line is the argument list of a call
 * whose opening line is not visible: first argument z, second argument
 * (1/z - z) in masked lanes and zero elsewhere.
 */
101 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
103 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of x = x + sqrt(x * x + 1) using a fused multiply-add. */
105 for (i = 0; i < 2; i++)
106 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
/* Reciprocal of the accumulated value. */
107 x = _mm256_div_ps(fones, x);
/*
 * Argument list of a call whose opening line (and the loop over j) is not
 * visible: y, x * x, and the broadcast coefficient (-1)^j / (2j + 1).
 * Presumably one Horner step of an arctangent power series -- confirm
 * against the full file.
 */
111 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Scale the polynomial value by 4 * x. */
113 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/*
 * In lanes where z > 1, add (pi/2 - 2 * y) so that y becomes pi/2 - y;
 * other lanes are unchanged. fnmadd computes -(y * 2) + pio2.
 */
114 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
116 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
/*
 * Negate arccosine in lanes where aVal < 0 (subtract twice its value).
 * The initial assignment of arccosine is not visible in this excerpt.
 */
118 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119 arccosine = _mm256_sub_ps(
120 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/*
 * Add pi in lanes where d < 0. d is presumably a copy of the original
 * input taken before aVal is overwritten -- its assignment is not visible,
 * TODO confirm.
 */
121 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
122 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
/* Aligned store: bPtr must be 32-byte aligned here. */
124 _mm256_store_ps(bPtr, arccosine);
/*
 * Scalar tail for the elements left over after the 8-wide groups.
 * NOTE(review): this calls acos(), which (assuming the <math.h> declaration)
 * promotes the float to double and converts the result back, whereas the
 * SSE4.1 and generic tails in this file call acosf(). Consider acosf() here
 * for consistency.
 */
129 number = eighthPoints * 8;
130 for (; number < num_points; number++) {
131 *bPtr++ = acos(*aPtr++);
139#include <immintrin.h>
/*
 * Body of an AVX arccosine kernel that uses aligned 256-bit loads/stores and
 * separate multiply/add/subtract intrinsics (no fused multiply-add).
 *
 * NOTE(review): the function header, opening brace, declarations of i and j,
 * several assignments (z, d, y, arccosine), the pointer advance and the
 * closing braces are not on any visible line of this listing. The function
 * name is therefore not stated here; comments describe only visible lines.
 */
144 float* bPtr = bVector;
145 const float* aPtr = aVector;
/* Number of full 8-float groups handled by the vector loop. */
147 unsigned int number = 0;
148 unsigned int eighthPoints = num_points / 8;
151 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
152 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Constants broadcast to every lane: pi, pi/2, 0, 1, 2 and 4. */
154 pi = _mm256_set1_ps(3.14159265358979323846);
155 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
156 fzeroes = _mm256_setzero_ps();
157 fones = _mm256_set1_ps(1.0);
158 ftwos = _mm256_set1_ps(2.0);
159 ffours = _mm256_set1_ps(4.0);
161 for (; number < eighthPoints; number++) {
/* Aligned load: aPtr must be 32-byte aligned here. */
162 aVal = _mm256_load_ps(aPtr);
/*
 * Numerator is sqrt((1 + a) * (1 - a)); the divisor argument of this
 * division is on a line not visible in this excerpt.
 */
164 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
165 _mm256_sub_ps(fones, aVal))),
/*
 * Flip the sign of lanes where z < 0 by subtracting (2 * z) in those lanes.
 * Where z is first assigned is not visible here.
 */
168 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
169 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/*
 * Mask of lanes with z < 1. The next line is the argument list of a call
 * whose opening line is not visible: first argument z, second argument
 * (1/z - z) in masked lanes and zero elsewhere.
 */
170 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
172 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/*
 * Two-iteration loop. The visible continuation line computes
 * sqrt(1 + x * x) as an argument of a call whose opening line is not shown.
 */
174 for (i = 0; i < 2; i++)
176 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
/* Reciprocal of the accumulated value. */
177 x = _mm256_div_ps(fones, x);
/*
 * y = y * (x * x) + (-1)^j / (2j + 1). The loop that drives j is not
 * visible; presumably a Horner evaluation of an arctangent power series --
 * confirm against the full file.
 */
180 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
181 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Scale the polynomial value by 4 * x. */
183 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/*
 * Mask of lanes with z > 1. The next line is the argument list of a call
 * whose opening line is not visible: y, and (pi/2 - 2 * y) in masked lanes.
 */
184 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
187 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/*
 * Negate arccosine in lanes where aVal < 0 (subtract twice its value).
 * The initial assignment of arccosine is not visible in this excerpt.
 */
189 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
190 arccosine = _mm256_sub_ps(
191 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/*
 * Add pi in lanes where d < 0. d is presumably a copy of the original
 * input -- its assignment is not visible, TODO confirm.
 */
192 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
193 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
/* Aligned store: bPtr must be 32-byte aligned here. */
195 _mm256_store_ps(bPtr, arccosine);
/*
 * Scalar tail for the elements left over after the 8-wide groups.
 * NOTE(review): this calls acos(), which (assuming the <math.h> declaration)
 * works in double precision, whereas the SSE4.1 and generic tails in this
 * file call acosf(). Consider acosf() here for consistency.
 */
200 number = eighthPoints * 8;
201 for (; number < num_points; number++) {
202 *bPtr++ = acos(*aPtr++);
209#include <smmintrin.h>
/*
 * SSE4.1 arccosine kernel; the "_a" in the name suggests the aligned
 * variant, but the load/store lines are not visible here -- confirm.
 *
 * Takes an output pointer (bVector), an input pointer (aVector) and an
 * element count (num_points). The vector loop runs num_points / 4 times on
 * 128-bit registers; the trailing scalar loop covers the remaining
 * num_points % 4 elements with acosf().
 *
 * NOTE(review): the return-type line, opening brace, declarations of i and
 * j, constant setup and almost all of the vector loop body are not on any
 * visible line of this listing.
 */
212volk_32f_acos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
/* Cursors over the output and input buffers. */
214 float* bPtr = bVector;
215 const float* aPtr = aVector;
/* Number of full 4-float groups handled by the vector loop. */
217 unsigned int number = 0;
218 unsigned int quarterPoints = num_points / 4;
221 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
222 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Vector loop; its body is not visible apart from the inner loop header. */
231 for (; number < quarterPoints; number++) {
243 for (i = 0; i < 2; i++)
/* Scalar tail: single-precision acosf() for the leftover elements. */
267 number = quarterPoints * 4;
268 for (; number < num_points; number++) {
269 *bPtr++ = acosf(*aPtr++);
278#ifndef INCLUDED_volk_32f_acos_32f_u_H
279#define INCLUDED_volk_32f_acos_32f_u_H
281#if LV_HAVE_AVX2 && LV_HAVE_FMA
282#include <immintrin.h>
/*
 * AVX2 + FMA arccosine kernel, unaligned variant.
 *
 * Reads floats through aVector and writes arccosine results through bVector.
 * The vector loop runs num_points / 8 times using 256-bit registers with
 * unaligned load/store intrinsics; the trailing scalar loop covers the
 * remaining num_points % 8 elements.
 *
 * NOTE(review): this listing has gaps. The opening brace, the declarations
 * of i and j, several assignments (z, d, y, arccosine), the pointer advance
 * and the closing braces are not on any visible line. Comments below
 * describe only what the visible lines show.
 */
284static inline void volk_32f_acos_32f_u_avx2_fma(
float* bVector,
285 const float* aVector,
286 unsigned int num_points)
/* Cursors over the output and input buffers. */
288 float* bPtr = bVector;
289 const float* aPtr = aVector;
/* Number of full 8-float groups handled by the vector loop. */
291 unsigned int number = 0;
292 unsigned int eighthPoints = num_points / 8;
295 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
296 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Constants broadcast to every lane: pi, pi/2, 0, 1, 2 and 4. */
298 pi = _mm256_set1_ps(3.14159265358979323846);
299 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
300 fzeroes = _mm256_setzero_ps();
301 fones = _mm256_set1_ps(1.0);
302 ftwos = _mm256_set1_ps(2.0);
303 ffours = _mm256_set1_ps(4.0);
305 for (; number < eighthPoints; number++) {
/* Unaligned load: no alignment requirement on aPtr. */
306 aVal = _mm256_loadu_ps(aPtr);
/*
 * Numerator is sqrt((1 + a) * (1 - a)); the divisor argument of this
 * division is on a line not visible in this excerpt.
 */
308 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
309 _mm256_sub_ps(fones, aVal))),
/*
 * Flip the sign of lanes where z < 0 by subtracting (2 * z) in those lanes.
 * Where z is first assigned is not visible here.
 */
312 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
313 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/*
 * Mask of lanes with z < 1. The next line is the argument list of a call
 * whose opening line is not visible: first argument z, second argument
 * (1/z - z) in masked lanes and zero elsewhere.
 */
314 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
316 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of x = x + sqrt(x * x + 1) using a fused multiply-add. */
318 for (i = 0; i < 2; i++)
319 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
/* Reciprocal of the accumulated value. */
320 x = _mm256_div_ps(fones, x);
/*
 * Argument list of a call whose opening line (and the loop over j) is not
 * visible: y, x * x, and the broadcast coefficient (-1)^j / (2j + 1).
 * Presumably one Horner step of an arctangent power series -- confirm
 * against the full file.
 */
324 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Scale the polynomial value by 4 * x. */
326 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/*
 * In lanes where z > 1, add (pi/2 - 2 * y) so that y becomes pi/2 - y;
 * other lanes are unchanged. fnmadd computes -(y * 2) + pio2.
 */
327 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
329 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
/*
 * Negate arccosine in lanes where aVal < 0 (subtract twice its value).
 * The initial assignment of arccosine is not visible in this excerpt.
 */
331 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
332 arccosine = _mm256_sub_ps(
333 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/*
 * Add pi in lanes where d < 0. d is presumably a copy of the original
 * input -- its assignment is not visible, TODO confirm.
 */
334 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
335 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
/* Unaligned store: no alignment requirement on bPtr. */
337 _mm256_storeu_ps(bPtr, arccosine);
/*
 * Scalar tail for the elements left over after the 8-wide groups.
 * NOTE(review): this calls acos(), which (assuming the <math.h> declaration)
 * works in double precision, whereas the SSE4.1 and generic tails in this
 * file call acosf(). Consider acosf() here for consistency.
 */
342 number = eighthPoints * 8;
343 for (; number < num_points; number++) {
344 *bPtr++ = acos(*aPtr++);
352#include <immintrin.h>
/*
 * Body of an AVX arccosine kernel that uses unaligned 256-bit loads/stores
 * and separate multiply/add/subtract intrinsics (no fused multiply-add).
 *
 * NOTE(review): the function header, opening brace, declarations of i and j,
 * several assignments (z, d, y, arccosine), the pointer advance and the
 * closing braces are not on any visible line of this listing. The function
 * name is therefore not stated here; comments describe only visible lines.
 */
357 float* bPtr = bVector;
358 const float* aPtr = aVector;
/* Number of full 8-float groups handled by the vector loop. */
360 unsigned int number = 0;
361 unsigned int eighthPoints = num_points / 8;
364 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
365 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Constants broadcast to every lane: pi, pi/2, 0, 1, 2 and 4. */
367 pi = _mm256_set1_ps(3.14159265358979323846);
368 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
369 fzeroes = _mm256_setzero_ps();
370 fones = _mm256_set1_ps(1.0);
371 ftwos = _mm256_set1_ps(2.0);
372 ffours = _mm256_set1_ps(4.0);
374 for (; number < eighthPoints; number++) {
/* Unaligned load: no alignment requirement on aPtr. */
375 aVal = _mm256_loadu_ps(aPtr);
/*
 * Numerator is sqrt((1 + a) * (1 - a)); the divisor argument of this
 * division is on a line not visible in this excerpt.
 */
377 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
378 _mm256_sub_ps(fones, aVal))),
/*
 * Flip the sign of lanes where z < 0 by subtracting (2 * z) in those lanes.
 * Where z is first assigned is not visible here.
 */
381 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
382 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/*
 * Mask of lanes with z < 1. The next line is the argument list of a call
 * whose opening line is not visible: first argument z, second argument
 * (1/z - z) in masked lanes and zero elsewhere.
 */
383 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
385 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/*
 * Two-iteration loop. The visible continuation line computes
 * sqrt(1 + x * x) as an argument of a call whose opening line is not shown.
 */
387 for (i = 0; i < 2; i++)
389 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
/* Reciprocal of the accumulated value. */
390 x = _mm256_div_ps(fones, x);
/*
 * y = y * (x * x) + (-1)^j / (2j + 1). The loop that drives j is not
 * visible; presumably a Horner evaluation of an arctangent power series --
 * confirm against the full file.
 */
393 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
394 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Scale the polynomial value by 4 * x. */
396 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/*
 * Mask of lanes with z > 1. The next line is the argument list of a call
 * whose opening line is not visible: y, and (pi/2 - 2 * y) in masked lanes.
 */
397 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
400 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/*
 * Negate arccosine in lanes where aVal < 0 (subtract twice its value).
 * The initial assignment of arccosine is not visible in this excerpt.
 */
402 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
403 arccosine = _mm256_sub_ps(
404 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/*
 * Add pi in lanes where d < 0. d is presumably a copy of the original
 * input -- its assignment is not visible, TODO confirm.
 */
405 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
406 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
/* Unaligned store: no alignment requirement on bPtr. */
408 _mm256_storeu_ps(bPtr, arccosine);
/*
 * Scalar tail for the elements left over after the 8-wide groups.
 * NOTE(review): this calls acos(), which (assuming the <math.h> declaration)
 * works in double precision, whereas the SSE4.1 and generic tails in this
 * file call acosf(). Consider acosf() here for consistency.
 */
413 number = eighthPoints * 8;
414 for (; number < num_points; number++) {
415 *bPtr++ = acos(*aPtr++);
422#include <smmintrin.h>
/*
 * SSE4.1 arccosine kernel; the "_u" in the name suggests the unaligned
 * variant, but the load/store lines are not visible here -- confirm.
 *
 * Takes an output pointer (bVector), an input pointer (aVector) and an
 * element count (num_points). The vector loop runs num_points / 4 times on
 * 128-bit registers; the trailing scalar loop covers the remaining
 * num_points % 4 elements with acosf().
 *
 * NOTE(review): the return-type line, opening brace, declarations of i and
 * j, constant setup and almost all of the vector loop body are not on any
 * visible line of this listing.
 */
425volk_32f_acos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
/* Cursors over the output and input buffers. */
427 float* bPtr = bVector;
428 const float* aPtr = aVector;
/* Number of full 4-float groups handled by the vector loop. */
430 unsigned int number = 0;
431 unsigned int quarterPoints = num_points / 4;
434 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
435 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Vector loop; its body is not visible apart from the inner loop header. */
444 for (; number < quarterPoints; number++) {
456 for (i = 0; i < 2; i++)
/* Scalar tail: single-precision acosf() for the leftover elements. */
481 number = quarterPoints * 4;
482 for (; number < num_points; number++) {
483 *bPtr++ = acosf(*aPtr++);
489#ifdef LV_HAVE_GENERIC
494 float* bPtr = bVector;
495 const float* aPtr = aVector;
496 unsigned int number = 0;
498 for (number = 0; number < num_points; number++) {
499 *bPtr++ = acosf(*aPtr++);