Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_tan_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
61#include <inttypes.h>
62#include <math.h>
63#include <stdio.h>
64
65#ifndef INCLUDED_volk_32f_tan_32f_a_H
66#define INCLUDED_volk_32f_tan_32f_a_H
67
68#if LV_HAVE_AVX2 && LV_HAVE_FMA
69#include <immintrin.h>
70
/* Computes tan(x) for each of the num_points floats in aVector and writes the
 * results to bVector.  AVX2 + FMA path: both pointers must be 32-byte aligned
 * (aligned load/store); eight floats are processed per iteration.
 *
 * Per lane: the input is folded to its absolute value, reduced against pi/4
 * using the two-constant pair pio4A/pio4B, scaled down by 2^3, run through a
 * polynomial in s^2, then rebuilt with three doubling steps.  sine and cosine
 * are reconstructed and sign/quadrant fixups driven by the octant index q are
 * applied before the final sine/cosine division. */
static inline void
volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low-order correction of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Polynomial coefficients. */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal from the lanes where aVal < 0. */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* Octant index q and the rounded multiple r used for reduction. */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        /* Undo the 2^3 scaling with three doubling steps. */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1 selects the lanes whose sine/cosine must be swapped;
         * condition2/condition3 select the lanes whose sign must flip. */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail: tanf (not tan) keeps everything in single precision,
     * consistent with the generic and SSE4.1 kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
169
170#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
171
172#ifdef LV_HAVE_AVX2
173#include <immintrin.h>
174
/* Computes tan(x) for each of the num_points floats in aVector and writes the
 * results to bVector.  AVX2 (no FMA) path: both pointers must be 32-byte
 * aligned; eight floats are processed per iteration.  Same algorithm as the
 * FMA variant, with separate multiply/subtract for the reduction and an
 * explicitly nested polynomial evaluation. */
static inline void
volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low-order correction of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Polynomial coefficients. */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal from the lanes where aVal < 0. */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* Octant index q and the rounded multiple r used for reduction. */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the 2^3 scaling with three doubling steps. */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1 selects the lanes whose sine/cosine must be swapped;
         * condition2/condition3 select the lanes whose sign must flip. */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail: tanf (not tan) keeps everything in single precision,
     * consistent with the generic and SSE4.1 kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
280
281#endif /* LV_HAVE_AVX2 for aligned */
282
283#ifdef LV_HAVE_SSE4_1
284#include <smmintrin.h>
285
/* Computes tan(x) for each of the num_points floats in aVector and writes the
 * results to bVector.  SSE4.1 path: both pointers must be 16-byte aligned;
 * four floats are processed per iteration.
 *
 * NOTE(review): the doxygen extraction dropped the `q = ...` octant line and
 * four lines of the nested Taylor-series expression; both are restored here
 * to match the AVX2 variant of the same algorithm. */
static inline void
volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm_set1_ps(0.241876e-3); /* low-order correction of pi/4 */
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    /* Polynomial coefficients. */
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal from the lanes where aVal < 0. */
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        /* Octant index q and the rounded multiple r used for reduction. */
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the 2^3 scaling with three doubling steps. */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        /* condition1 selects the lanes whose sine/cosine must be swapped;
         * condition2/condition3 select the lanes whose sign must flip. */
        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_store_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the remaining 0-3 points. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
379
380#endif /* LV_HAVE_SSE4_1 for aligned */
381
382
383#endif /* INCLUDED_volk_32f_tan_32f_a_H */
384
385#ifndef INCLUDED_volk_32f_tan_32f_u_H
386#define INCLUDED_volk_32f_tan_32f_u_H
387
388#if LV_HAVE_AVX2 && LV_HAVE_FMA
389#include <immintrin.h>
390
/* Computes tan(x) for each of the num_points floats in aVector and writes the
 * results to bVector.  AVX2 + FMA path for unaligned pointers (loadu/storeu);
 * eight floats are processed per iteration.  Identical algorithm to the
 * aligned variant. */
static inline void
volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low-order correction of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Polynomial coefficients. */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal from the lanes where aVal < 0. */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* Octant index q and the rounded multiple r used for reduction. */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        /* Undo the 2^3 scaling with three doubling steps. */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1 selects the lanes whose sine/cosine must be swapped;
         * condition2/condition3 select the lanes whose sign must flip. */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail: tanf (not tan) keeps everything in single precision,
     * consistent with the generic and SSE4.1 kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
489
490#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
491
492#ifdef LV_HAVE_AVX2
493#include <immintrin.h>
494
/* Computes tan(x) for each of the num_points floats in aVector and writes the
 * results to bVector.  AVX2 (no FMA) path for unaligned pointers
 * (loadu/storeu); eight floats are processed per iteration.  Identical
 * algorithm to the aligned variant. */
static inline void
volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low-order correction of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Polynomial coefficients. */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal from the lanes where aVal < 0. */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* Octant index q and the rounded multiple r used for reduction. */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the 2^3 scaling with three doubling steps. */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1 selects the lanes whose sine/cosine must be swapped;
         * condition2/condition3 select the lanes whose sign must flip. */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail: tanf (not tan) keeps everything in single precision,
     * consistent with the generic and SSE4.1 kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
600
601#endif /* LV_HAVE_AVX2 for unaligned */
602
603
604#ifdef LV_HAVE_SSE4_1
605#include <smmintrin.h>
606
/* Computes tan(x) for each of the num_points floats in aVector and writes the
 * results to bVector.  SSE4.1 path for unaligned pointers (loadu/storeu);
 * four floats are processed per iteration.
 *
 * NOTE(review): the doxygen extraction dropped the `q = ...` octant line and
 * four lines of the nested Taylor-series expression; both are restored here
 * to match the aligned SSE4.1 variant. */
static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm_set1_ps(0.241876e-3); /* low-order correction of pi/4 */
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    /* Polynomial coefficients. */
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal from the lanes where aVal < 0. */
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        /* Octant index q and the rounded multiple r used for reduction. */
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the 2^3 scaling with three doubling steps. */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        /* condition1 selects the lanes whose sine/cosine must be swapped;
         * condition2/condition3 select the lanes whose sign must flip. */
        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_storeu_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the remaining 0-3 points. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
700
701#endif /* LV_HAVE_SSE4_1 for unaligned */
702
703
704#ifdef LV_HAVE_GENERIC
705
706static inline void
707volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
708{
709 float* bPtr = bVector;
710 const float* aPtr = aVector;
711 unsigned int number = 0;
712
713 for (; number < num_points; number++) {
714 *bPtr++ = tanf(*aPtr++);
715 }
716}
717#endif /* LV_HAVE_GENERIC */
718
719
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
/* Restored include lost in extraction: declares the vector tangent helper
 * _vtanq_f32 used below. */
#include <volk/volk_neon_intrinsics.h>

/* NEON path: computes tan(x) for each of the num_points floats in aVector and
 * writes the results to bVector, four floats at a time via _vtanq_f32; the
 * remaining 0-3 points fall back to scalar tanf. */
static inline void
volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    float* bVectorPtr = bVector;
    const float* aVectorPtr = aVector;

    float32x4_t b_vec;
    float32x4_t a_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aVectorPtr);
        // Prefetch next one, speeds things up
        __VOLK_PREFETCH(aVectorPtr + 4);
        b_vec = _vtanq_f32(a_vec);
        vst1q_f32(bVectorPtr, b_vec);
        // move pointers ahead
        bVectorPtr += 4;
        aVectorPtr += 4;
    }

    // Deal with the rest
    for (number = quarter_points * 4; number < num_points; number++) {
        *bVectorPtr++ = tanf(*aVectorPtr++);
    }
}
#endif /* LV_HAVE_NEON */
752
753
754#endif /* INCLUDED_volk_32f_tan_32f_u_H */