Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_acos_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
57#include <inttypes.h>
58#include <math.h>
59#include <stdio.h>
60
61/* This is the number of terms of Taylor series to evaluate, increase this for more
62 * accuracy*/
63#define ACOS_TERMS 2
64
65#ifndef INCLUDED_volk_32f_acos_32f_a_H
66#define INCLUDED_volk_32f_acos_32f_a_H
67
68#if LV_HAVE_AVX2 && LV_HAVE_FMA
69#include <immintrin.h>
70
71static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
72 const float* aVector,
73 unsigned int num_points)
74{
75 float* bPtr = bVector;
76 const float* aPtr = aVector;
77
78 unsigned int number = 0;
79 unsigned int eighthPoints = num_points / 8;
80 int i, j;
81
82 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
83 __m256 fzeroes, fones, ftwos, ffours, condition;
84
85 pi = _mm256_set1_ps(3.14159265358979323846);
86 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87 fzeroes = _mm256_setzero_ps();
88 fones = _mm256_set1_ps(1.0);
89 ftwos = _mm256_set1_ps(2.0);
90 ffours = _mm256_set1_ps(4.0);
91
92 for (; number < eighthPoints; number++) {
93 aVal = _mm256_load_ps(aPtr);
94 d = aVal;
95 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96 _mm256_sub_ps(fones, aVal))),
97 aVal);
98 z = aVal;
99 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
101 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
102 x = _mm256_add_ps(
103 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
104
105 for (i = 0; i < 2; i++)
106 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
107 x = _mm256_div_ps(fones, x);
108 y = fzeroes;
109 for (j = ACOS_TERMS - 1; j >= 0; j--)
110 y = _mm256_fmadd_ps(
111 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
112
113 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
114 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
115
116 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
117 arccosine = y;
118 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119 arccosine = _mm256_sub_ps(
120 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
121 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
122 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
123
124 _mm256_store_ps(bPtr, arccosine);
125 aPtr += 8;
126 bPtr += 8;
127 }
128
129 number = eighthPoints * 8;
130 for (; number < num_points; number++) {
131 *bPtr++ = acos(*aPtr++);
132 }
133}
134
135#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
136
137
138#ifdef LV_HAVE_AVX
139#include <immintrin.h>
140
141static inline void
142volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
143{
144 float* bPtr = bVector;
145 const float* aPtr = aVector;
146
147 unsigned int number = 0;
148 unsigned int eighthPoints = num_points / 8;
149 int i, j;
150
151 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
152 __m256 fzeroes, fones, ftwos, ffours, condition;
153
154 pi = _mm256_set1_ps(3.14159265358979323846);
155 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
156 fzeroes = _mm256_setzero_ps();
157 fones = _mm256_set1_ps(1.0);
158 ftwos = _mm256_set1_ps(2.0);
159 ffours = _mm256_set1_ps(4.0);
160
161 for (; number < eighthPoints; number++) {
162 aVal = _mm256_load_ps(aPtr);
163 d = aVal;
164 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
165 _mm256_sub_ps(fones, aVal))),
166 aVal);
167 z = aVal;
168 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
169 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
170 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
171 x = _mm256_add_ps(
172 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
173
174 for (i = 0; i < 2; i++)
175 x = _mm256_add_ps(x,
176 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
177 x = _mm256_div_ps(fones, x);
178 y = fzeroes;
179 for (j = ACOS_TERMS - 1; j >= 0; j--)
180 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
181 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
182
183 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
184 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
185
186 y = _mm256_add_ps(
187 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
188 arccosine = y;
189 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
190 arccosine = _mm256_sub_ps(
191 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
192 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
193 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
194
195 _mm256_store_ps(bPtr, arccosine);
196 aPtr += 8;
197 bPtr += 8;
198 }
199
200 number = eighthPoints * 8;
201 for (; number < num_points; number++) {
202 *bPtr++ = acos(*aPtr++);
203 }
204}
205
#endif /* LV_HAVE_AVX for aligned */
207
208#ifdef LV_HAVE_SSE4_1
209#include <smmintrin.h>
210
211static inline void
212volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
213{
214 float* bPtr = bVector;
215 const float* aPtr = aVector;
216
217 unsigned int number = 0;
218 unsigned int quarterPoints = num_points / 4;
219 int i, j;
220
221 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
222 __m128 fzeroes, fones, ftwos, ffours, condition;
223
224 pi = _mm_set1_ps(3.14159265358979323846);
225 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
226 fzeroes = _mm_setzero_ps();
227 fones = _mm_set1_ps(1.0);
228 ftwos = _mm_set1_ps(2.0);
229 ffours = _mm_set1_ps(4.0);
230
231 for (; number < quarterPoints; number++) {
232 aVal = _mm_load_ps(aPtr);
233 d = aVal;
234 aVal = _mm_div_ps(
235 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
236 aVal);
237 z = aVal;
238 condition = _mm_cmplt_ps(z, fzeroes);
239 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
240 condition = _mm_cmplt_ps(z, fones);
241 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
242
243 for (i = 0; i < 2; i++)
244 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
245 x = _mm_div_ps(fones, x);
246 y = fzeroes;
247 for (j = ACOS_TERMS - 1; j >= 0; j--)
248 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
249 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
250
251 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
252 condition = _mm_cmpgt_ps(z, fones);
253
254 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
255 arccosine = y;
256 condition = _mm_cmplt_ps(aVal, fzeroes);
257 arccosine =
258 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
259 condition = _mm_cmplt_ps(d, fzeroes);
260 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
261
262 _mm_store_ps(bPtr, arccosine);
263 aPtr += 4;
264 bPtr += 4;
265 }
266
267 number = quarterPoints * 4;
268 for (; number < num_points; number++) {
269 *bPtr++ = acosf(*aPtr++);
270 }
271}
272
273#endif /* LV_HAVE_SSE4_1 for aligned */
274
275#endif /* INCLUDED_volk_32f_acos_32f_a_H */
276
277
278#ifndef INCLUDED_volk_32f_acos_32f_u_H
279#define INCLUDED_volk_32f_acos_32f_u_H
280
281#if LV_HAVE_AVX2 && LV_HAVE_FMA
282#include <immintrin.h>
283
284static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
285 const float* aVector,
286 unsigned int num_points)
287{
288 float* bPtr = bVector;
289 const float* aPtr = aVector;
290
291 unsigned int number = 0;
292 unsigned int eighthPoints = num_points / 8;
293 int i, j;
294
295 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
296 __m256 fzeroes, fones, ftwos, ffours, condition;
297
298 pi = _mm256_set1_ps(3.14159265358979323846);
299 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
300 fzeroes = _mm256_setzero_ps();
301 fones = _mm256_set1_ps(1.0);
302 ftwos = _mm256_set1_ps(2.0);
303 ffours = _mm256_set1_ps(4.0);
304
305 for (; number < eighthPoints; number++) {
306 aVal = _mm256_loadu_ps(aPtr);
307 d = aVal;
308 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
309 _mm256_sub_ps(fones, aVal))),
310 aVal);
311 z = aVal;
312 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
313 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
314 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
315 x = _mm256_add_ps(
316 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
317
318 for (i = 0; i < 2; i++)
319 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
320 x = _mm256_div_ps(fones, x);
321 y = fzeroes;
322 for (j = ACOS_TERMS - 1; j >= 0; j--)
323 y = _mm256_fmadd_ps(
324 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
325
326 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
327 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
328
329 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
330 arccosine = y;
331 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
332 arccosine = _mm256_sub_ps(
333 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
334 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
335 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
336
337 _mm256_storeu_ps(bPtr, arccosine);
338 aPtr += 8;
339 bPtr += 8;
340 }
341
342 number = eighthPoints * 8;
343 for (; number < num_points; number++) {
344 *bPtr++ = acos(*aPtr++);
345 }
346}
347
348#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
349
350
351#ifdef LV_HAVE_AVX
352#include <immintrin.h>
353
354static inline void
355volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
356{
357 float* bPtr = bVector;
358 const float* aPtr = aVector;
359
360 unsigned int number = 0;
361 unsigned int eighthPoints = num_points / 8;
362 int i, j;
363
364 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
365 __m256 fzeroes, fones, ftwos, ffours, condition;
366
367 pi = _mm256_set1_ps(3.14159265358979323846);
368 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
369 fzeroes = _mm256_setzero_ps();
370 fones = _mm256_set1_ps(1.0);
371 ftwos = _mm256_set1_ps(2.0);
372 ffours = _mm256_set1_ps(4.0);
373
374 for (; number < eighthPoints; number++) {
375 aVal = _mm256_loadu_ps(aPtr);
376 d = aVal;
377 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
378 _mm256_sub_ps(fones, aVal))),
379 aVal);
380 z = aVal;
381 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
382 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
383 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
384 x = _mm256_add_ps(
385 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
386
387 for (i = 0; i < 2; i++)
388 x = _mm256_add_ps(x,
389 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
390 x = _mm256_div_ps(fones, x);
391 y = fzeroes;
392 for (j = ACOS_TERMS - 1; j >= 0; j--)
393 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
394 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
395
396 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
397 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
398
399 y = _mm256_add_ps(
400 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
401 arccosine = y;
402 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
403 arccosine = _mm256_sub_ps(
404 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
405 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
406 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
407
408 _mm256_storeu_ps(bPtr, arccosine);
409 aPtr += 8;
410 bPtr += 8;
411 }
412
413 number = eighthPoints * 8;
414 for (; number < num_points; number++) {
415 *bPtr++ = acos(*aPtr++);
416 }
417}
418
#endif /* LV_HAVE_AVX for unaligned */
420
421#ifdef LV_HAVE_SSE4_1
422#include <smmintrin.h>
423
424static inline void
425volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
426{
427 float* bPtr = bVector;
428 const float* aPtr = aVector;
429
430 unsigned int number = 0;
431 unsigned int quarterPoints = num_points / 4;
432 int i, j;
433
434 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
435 __m128 fzeroes, fones, ftwos, ffours, condition;
436
437 pi = _mm_set1_ps(3.14159265358979323846);
438 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
439 fzeroes = _mm_setzero_ps();
440 fones = _mm_set1_ps(1.0);
441 ftwos = _mm_set1_ps(2.0);
442 ffours = _mm_set1_ps(4.0);
443
444 for (; number < quarterPoints; number++) {
445 aVal = _mm_loadu_ps(aPtr);
446 d = aVal;
447 aVal = _mm_div_ps(
448 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
449 aVal);
450 z = aVal;
451 condition = _mm_cmplt_ps(z, fzeroes);
452 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
453 condition = _mm_cmplt_ps(z, fones);
454 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
455
456 for (i = 0; i < 2; i++)
457 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
458 x = _mm_div_ps(fones, x);
459 y = fzeroes;
460
461 for (j = ACOS_TERMS - 1; j >= 0; j--)
462 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
463 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
464
465 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
466 condition = _mm_cmpgt_ps(z, fones);
467
468 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
469 arccosine = y;
470 condition = _mm_cmplt_ps(aVal, fzeroes);
471 arccosine =
472 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
473 condition = _mm_cmplt_ps(d, fzeroes);
474 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
475
476 _mm_storeu_ps(bPtr, arccosine);
477 aPtr += 4;
478 bPtr += 4;
479 }
480
481 number = quarterPoints * 4;
482 for (; number < num_points; number++) {
483 *bPtr++ = acosf(*aPtr++);
484 }
485}
486
#endif /* LV_HAVE_SSE4_1 for unaligned */
488
489#ifdef LV_HAVE_GENERIC
490
491static inline void
492volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
493{
494 float* bPtr = bVector;
495 const float* aPtr = aVector;
496 unsigned int number = 0;
497
498 for (number = 0; number < num_points; number++) {
499 *bPtr++ = acosf(*aPtr++);
500 }
501}
502#endif /* LV_HAVE_GENERIC */
503
504#endif /* INCLUDED_volk_32f_acos_32f_u_H */