Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_asin_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/* Number of terms of the Taylor series to evaluate; increase this for more accuracy. */
#define ASIN_TERMS 2
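
/*
 * For reference, the sketch below (guarded out of compilation; the helper name
 * asin_taylor_ref is illustrative and not part of the VOLK API) restates the
 * scalar arithmetic that every SIMD kernel in this file vectorizes:
 * asin(a) = atan(a / sqrt((1 + a)(1 - a))); the atan argument is folded to
 * x >= 1, reduced twice with the half-angle step x -> x + sqrt(1 + x*x),
 * approximated by an ASIN_TERMS-term Taylor polynomial evaluated with Horner's
 * rule, and finally scaled back by four.
 */
#if 0
static inline float asin_taylor_ref(float a)
{
    const float pio2 = 3.14159265358979323846f / 2.0f;
    float z = a / sqrtf((1.0f + a) * (1.0f - a)); /* tan(asin(a)) */
    const float sign = (z < 0.0f) ? -1.0f : 1.0f;
    z = fabsf(z);

    /* Fold to x >= 1 so that atan(1/x) is a small angle. */
    float x = (z < 1.0f) ? 1.0f / z : z;

    /* Two half-angle reductions: each step halves the angle atan(1/x). */
    int i, j;
    for (i = 0; i < 2; i++) {
        x = x + sqrtf(1.0f + x * x);
    }
    x = 1.0f / x; /* x is now tan(original_angle / 4), close to zero */

    /* Horner evaluation of the truncated Taylor series of atan(x). */
    float y = 0.0f;
    for (j = ASIN_TERMS - 1; j >= 0; j--) {
        y = y * (x * x) + (float)(pow(-1, j) / (2 * j + 1));
    }
    y = y * x * 4.0f; /* undo the two halvings */

    /* If the folded argument came from z >= 1, use atan(z) = pi/2 - atan(1/z). */
    if (z > 1.0f) {
        y = pio2 - y;
    }
    return sign * y;
}
#endif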

#ifndef INCLUDED_volk_32f_asin_32f_a_H
#define INCLUDED_volk_32f_asin_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* aVal <- a / sqrt(1 - a^2) = tan(asin(a)), so asin(a) = atan(aVal) */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* take |z|, then fold to x >= 1 so the reduced angle is small */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two half-angle reductions, undone later by the factor of four */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated Taylor series of atan(x) */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* where z > 1, use atan(z) = pi/2 - atan(1/z) */
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        /* restore the sign of the input */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* handle the remaining (num_points % 8) samples */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_store_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_asin_32f_a_H */

#ifndef INCLUDED_volk_32f_asin_32f_u_H
#define INCLUDED_volk_32f_asin_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_storeu_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_asin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_asin_32f_u_H */
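
/*
 * A minimal caller sketch (illustrative only, not part of this header): the
 * public dispatcher volk_32f_asin_32f() declared in <volk/volk.h> selects one
 * of the kernels above at runtime, and volk_malloc() provides buffers aligned
 * for the _a_ (aligned) variants.
 *
 *   #include <volk/volk.h>
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *       unsigned int N = 10;
 *       float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *       float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *       for (unsigned int ii = 0; ii < N; ++ii) {
 *           in[ii] = 2.f * ((float)ii / (float)(N - 1)) - 1.f;  // sweep [-1, 1]
 *       }
 *       volk_32f_asin_32f(out, in, N);
 *       for (unsigned int ii = 0; ii < N; ++ii) {
 *           printf("asin(%+.3f) = %+.3f\n", in[ii], out[ii]);
 *       }
 *       volk_free(in);
 *       volk_free(out);
 *       return 0;
 *   }
 */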