Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59#include <inttypes.h>
60#include <math.h>
61#include <stdio.h>
62
63#ifndef INCLUDED_volk_32f_sin_32f_a_H
64#define INCLUDED_volk_32f_sin_32f_a_H
65#ifdef LV_HAVE_AVX512F
66
67#include <immintrin.h>
68static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
69 const float* inVector,
70 unsigned int num_points)
71{
72 float* sinPtr = sinVector;
73 const float* inPtr = inVector;
74
75 unsigned int number = 0;
76 unsigned int sixteenPoints = num_points / 16;
77 unsigned int i = 0;
78
79 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
80 fones;
81 __m512 sine, cosine;
82 __m512i q, zeros, ones, twos, fours;
83
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
95
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2, ltZero;
102
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
105 // s = fabs(aVal)
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107
108 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110 // r = q + q&1, q indicates quadrant, r gives
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
116
117 s = _mm512_div_ps(
118 s,
119 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120 s = _mm512_mul_ps(s, s);
121 // Evaluate Taylor series
122 s = _mm512_mul_ps(
123 _mm512_fmadd_ps(
124 _mm512_fmsub_ps(
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126 s,
127 cp1),
128 s);
129
130 for (i = 0; i < 3; i++)
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 s = _mm512_div_ps(s, ftwos);
133
134 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135 cosine = _mm512_sub_ps(fones, s);
136
137 condition1 = _mm512_cmpneq_epi32_mask(
138 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
139 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
140 condition2 = _mm512_kxor(
141 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
142
143 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
144 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
145 _mm512_store_ps(sinPtr, sine);
146 inPtr += 16;
147 sinPtr += 16;
148 }
149
150 number = sixteenPoints * 16;
151 for (; number < num_points; number++) {
152 *sinPtr++ = sinf(*inPtr++);
153 }
154}
155#endif
156#if LV_HAVE_AVX2 && LV_HAVE_FMA
157#include <immintrin.h>
158
159static inline void
160volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
161{
162 float* bPtr = bVector;
163 const float* aPtr = aVector;
164
165 unsigned int number = 0;
166 unsigned int eighthPoints = num_points / 8;
167 unsigned int i = 0;
168
169 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
170 fzeroes;
171 __m256 sine, cosine, condition1, condition2;
172 __m256i q, r, ones, twos, fours;
173
174 m4pi = _mm256_set1_ps(1.273239545);
175 pio4A = _mm256_set1_ps(0.78515625);
176 pio4B = _mm256_set1_ps(0.241876e-3);
177 ffours = _mm256_set1_ps(4.0);
178 ftwos = _mm256_set1_ps(2.0);
179 fones = _mm256_set1_ps(1.0);
180 fzeroes = _mm256_setzero_ps();
181 ones = _mm256_set1_epi32(1);
182 twos = _mm256_set1_epi32(2);
183 fours = _mm256_set1_epi32(4);
184
185 cp1 = _mm256_set1_ps(1.0);
186 cp2 = _mm256_set1_ps(0.83333333e-1);
187 cp3 = _mm256_set1_ps(0.2777778e-2);
188 cp4 = _mm256_set1_ps(0.49603e-4);
189 cp5 = _mm256_set1_ps(0.551e-6);
190
191 for (; number < eighthPoints; number++) {
192 aVal = _mm256_load_ps(aPtr);
193 s = _mm256_sub_ps(aVal,
194 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
195 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
196 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
197 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
198
199 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
200 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
201
202 s = _mm256_div_ps(
203 s,
204 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
205 s = _mm256_mul_ps(s, s);
206 // Evaluate Taylor series
207 s = _mm256_mul_ps(
208 _mm256_fmadd_ps(
209 _mm256_fmsub_ps(
210 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
211 s,
212 cp1),
213 s);
214
215 for (i = 0; i < 3; i++) {
216 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
217 }
218 s = _mm256_div_ps(s, ftwos);
219
220 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
221 cosine = _mm256_sub_ps(fones, s);
222
223 condition1 = _mm256_cmp_ps(
224 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
225 fzeroes,
226 _CMP_NEQ_UQ);
227 condition2 = _mm256_cmp_ps(
228 _mm256_cmp_ps(
229 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
230 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
231 _CMP_NEQ_UQ);
232 // Need this condition only for cos
233 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
234 // twos), fours)), fzeroes);
235
236 sine =
237 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
238 sine = _mm256_sub_ps(
239 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
240 _mm256_store_ps(bPtr, sine);
241 aPtr += 8;
242 bPtr += 8;
243 }
244
245 number = eighthPoints * 8;
246 for (; number < num_points; number++) {
247 *bPtr++ = sin(*aPtr++);
248 }
249}
250
251#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
252
253#ifdef LV_HAVE_AVX2
254#include <immintrin.h>
255
256static inline void
257volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
258{
259 float* bPtr = bVector;
260 const float* aPtr = aVector;
261
262 unsigned int number = 0;
263 unsigned int eighthPoints = num_points / 8;
264 unsigned int i = 0;
265
266 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
267 fzeroes;
268 __m256 sine, cosine, condition1, condition2;
269 __m256i q, r, ones, twos, fours;
270
271 m4pi = _mm256_set1_ps(1.273239545);
272 pio4A = _mm256_set1_ps(0.78515625);
273 pio4B = _mm256_set1_ps(0.241876e-3);
274 ffours = _mm256_set1_ps(4.0);
275 ftwos = _mm256_set1_ps(2.0);
276 fones = _mm256_set1_ps(1.0);
277 fzeroes = _mm256_setzero_ps();
278 ones = _mm256_set1_epi32(1);
279 twos = _mm256_set1_epi32(2);
280 fours = _mm256_set1_epi32(4);
281
282 cp1 = _mm256_set1_ps(1.0);
283 cp2 = _mm256_set1_ps(0.83333333e-1);
284 cp3 = _mm256_set1_ps(0.2777778e-2);
285 cp4 = _mm256_set1_ps(0.49603e-4);
286 cp5 = _mm256_set1_ps(0.551e-6);
287
288 for (; number < eighthPoints; number++) {
289 aVal = _mm256_load_ps(aPtr);
290 s = _mm256_sub_ps(aVal,
291 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
292 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
293 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
294 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
295
296 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
297 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
298
299 s = _mm256_div_ps(
300 s,
301 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
302 s = _mm256_mul_ps(s, s);
303 // Evaluate Taylor series
304 s = _mm256_mul_ps(
305 _mm256_add_ps(
306 _mm256_mul_ps(
307 _mm256_sub_ps(
308 _mm256_mul_ps(
309 _mm256_add_ps(
310 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
311 s),
312 cp3),
313 s),
314 cp2),
315 s),
316 cp1),
317 s);
318
319 for (i = 0; i < 3; i++) {
320 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
321 }
322 s = _mm256_div_ps(s, ftwos);
323
324 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
325 cosine = _mm256_sub_ps(fones, s);
326
327 condition1 = _mm256_cmp_ps(
328 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
329 fzeroes,
330 _CMP_NEQ_UQ);
331 condition2 = _mm256_cmp_ps(
332 _mm256_cmp_ps(
333 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
334 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
335 _CMP_NEQ_UQ);
336 // Need this condition only for cos
337 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
338 // twos), fours)), fzeroes);
339
340 sine =
341 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
342 sine = _mm256_sub_ps(
343 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
344 _mm256_store_ps(bPtr, sine);
345 aPtr += 8;
346 bPtr += 8;
347 }
348
349 number = eighthPoints * 8;
350 for (; number < num_points; number++) {
351 *bPtr++ = sin(*aPtr++);
352 }
353}
354
355#endif /* LV_HAVE_AVX2 for aligned */
356
357#ifdef LV_HAVE_SSE4_1
358#include <smmintrin.h>
359
360static inline void
361volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
362{
363 float* bPtr = bVector;
364 const float* aPtr = aVector;
365
366 unsigned int number = 0;
367 unsigned int quarterPoints = num_points / 4;
368 unsigned int i = 0;
369
370 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
371 fzeroes;
372 __m128 sine, cosine, condition1, condition2;
373 __m128i q, r, ones, twos, fours;
374
375 m4pi = _mm_set1_ps(1.273239545);
376 pio4A = _mm_set1_ps(0.78515625);
377 pio4B = _mm_set1_ps(0.241876e-3);
378 ffours = _mm_set1_ps(4.0);
379 ftwos = _mm_set1_ps(2.0);
380 fones = _mm_set1_ps(1.0);
381 fzeroes = _mm_setzero_ps();
382 ones = _mm_set1_epi32(1);
383 twos = _mm_set1_epi32(2);
384 fours = _mm_set1_epi32(4);
385
386 cp1 = _mm_set1_ps(1.0);
387 cp2 = _mm_set1_ps(0.83333333e-1);
388 cp3 = _mm_set1_ps(0.2777778e-2);
389 cp4 = _mm_set1_ps(0.49603e-4);
390 cp5 = _mm_set1_ps(0.551e-6);
391
392 for (; number < quarterPoints; number++) {
393 aVal = _mm_load_ps(aPtr);
394 s = _mm_sub_ps(aVal,
395 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
397 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
398
399 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
400 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
401
402 s = _mm_div_ps(
403 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
404 s = _mm_mul_ps(s, s);
405 // Evaluate Taylor series
406 s = _mm_mul_ps(
411 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
412 cp3),
413 s),
414 cp2),
415 s),
416 cp1),
417 s);
418
419 for (i = 0; i < 3; i++) {
420 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
421 }
422 s = _mm_div_ps(s, ftwos);
423
424 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
425 cosine = _mm_sub_ps(fones, s);
426
427 condition1 = _mm_cmpneq_ps(
428 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
429 condition2 = _mm_cmpneq_ps(
430 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
431 _mm_cmplt_ps(aVal, fzeroes));
432 // Need this condition only for cos
433 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
434 // twos), fours)), fzeroes);
435
436 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
437 sine =
438 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
439 _mm_store_ps(bPtr, sine);
440 aPtr += 4;
441 bPtr += 4;
442 }
443
444 number = quarterPoints * 4;
445 for (; number < num_points; number++) {
446 *bPtr++ = sinf(*aPtr++);
447 }
448}
449
450#endif /* LV_HAVE_SSE4_1 for aligned */
451
452
453#endif /* INCLUDED_volk_32f_sin_32f_a_H */
454
455#ifndef INCLUDED_volk_32f_sin_32f_u_H
456#define INCLUDED_volk_32f_sin_32f_u_H
457
458#ifdef LV_HAVE_AVX512F
459
460#include <immintrin.h>
461static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
462 const float* inVector,
463 unsigned int num_points)
464{
465 float* sinPtr = sinVector;
466 const float* inPtr = inVector;
467
468 unsigned int number = 0;
469 unsigned int sixteenPoints = num_points / 16;
470 unsigned int i = 0;
471
472 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
473 fones;
474 __m512 sine, cosine;
475 __m512i q, zeros, ones, twos, fours;
476
477 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
478 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
479 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
480 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
481 ffours = _mm512_set1_ps(4.0);
482 ftwos = _mm512_set1_ps(2.0);
483 fones = _mm512_set1_ps(1.0);
484 zeros = _mm512_setzero_epi32();
485 ones = _mm512_set1_epi32(1);
486 twos = _mm512_set1_epi32(2);
487 fours = _mm512_set1_epi32(4);
488
489 cp1 = _mm512_set1_ps(1.0);
490 cp2 = _mm512_set1_ps(0.08333333333333333);
491 cp3 = _mm512_set1_ps(0.002777777777777778);
492 cp4 = _mm512_set1_ps(4.96031746031746e-05);
493 cp5 = _mm512_set1_ps(5.511463844797178e-07);
494 __mmask16 condition1, condition2, ltZero;
495
496 for (; number < sixteenPoints; number++) {
497 aVal = _mm512_loadu_ps(inPtr);
498 // s = fabs(aVal)
499 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
500
501 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
502 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
503 // r = q + q&1, q indicates quadrant, r gives
504 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
505
506 s = _mm512_fnmadd_ps(r, pio4A, s);
507 s = _mm512_fnmadd_ps(r, pio4B, s);
508 s = _mm512_fnmadd_ps(r, pio4C, s);
509
510 s = _mm512_div_ps(
511 s,
512 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
513 s = _mm512_mul_ps(s, s);
514 // Evaluate Taylor series
515 s = _mm512_mul_ps(
516 _mm512_fmadd_ps(
517 _mm512_fmsub_ps(
518 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
519 s,
520 cp1),
521 s);
522
523 for (i = 0; i < 3; i++)
524 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
525 s = _mm512_div_ps(s, ftwos);
526
527 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
528 cosine = _mm512_sub_ps(fones, s);
529
530 condition1 = _mm512_cmpneq_epi32_mask(
531 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
532 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
533 condition2 = _mm512_kxor(
534 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
535
536 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
537 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
538 _mm512_storeu_ps(sinPtr, sine);
539 inPtr += 16;
540 sinPtr += 16;
541 }
542
543 number = sixteenPoints * 16;
544 for (; number < num_points; number++) {
545 *sinPtr++ = sinf(*inPtr++);
546 }
547}
548#endif
549
550#if LV_HAVE_AVX2 && LV_HAVE_FMA
551#include <immintrin.h>
552
553static inline void
554volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
555{
556 float* bPtr = bVector;
557 const float* aPtr = aVector;
558
559 unsigned int number = 0;
560 unsigned int eighthPoints = num_points / 8;
561 unsigned int i = 0;
562
563 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
564 fzeroes;
565 __m256 sine, cosine, condition1, condition2;
566 __m256i q, r, ones, twos, fours;
567
568 m4pi = _mm256_set1_ps(1.273239545);
569 pio4A = _mm256_set1_ps(0.78515625);
570 pio4B = _mm256_set1_ps(0.241876e-3);
571 ffours = _mm256_set1_ps(4.0);
572 ftwos = _mm256_set1_ps(2.0);
573 fones = _mm256_set1_ps(1.0);
574 fzeroes = _mm256_setzero_ps();
575 ones = _mm256_set1_epi32(1);
576 twos = _mm256_set1_epi32(2);
577 fours = _mm256_set1_epi32(4);
578
579 cp1 = _mm256_set1_ps(1.0);
580 cp2 = _mm256_set1_ps(0.83333333e-1);
581 cp3 = _mm256_set1_ps(0.2777778e-2);
582 cp4 = _mm256_set1_ps(0.49603e-4);
583 cp5 = _mm256_set1_ps(0.551e-6);
584
585 for (; number < eighthPoints; number++) {
586 aVal = _mm256_loadu_ps(aPtr);
587 s = _mm256_sub_ps(aVal,
588 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
589 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
590 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
591 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
592
593 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
594 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
595
596 s = _mm256_div_ps(
597 s,
598 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
599 s = _mm256_mul_ps(s, s);
600 // Evaluate Taylor series
601 s = _mm256_mul_ps(
602 _mm256_fmadd_ps(
603 _mm256_fmsub_ps(
604 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
605 s,
606 cp1),
607 s);
608
609 for (i = 0; i < 3; i++) {
610 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
611 }
612 s = _mm256_div_ps(s, ftwos);
613
614 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
615 cosine = _mm256_sub_ps(fones, s);
616
617 condition1 = _mm256_cmp_ps(
618 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
619 fzeroes,
620 _CMP_NEQ_UQ);
621 condition2 = _mm256_cmp_ps(
622 _mm256_cmp_ps(
623 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
624 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
625 _CMP_NEQ_UQ);
626 // Need this condition only for cos
627 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
628 // twos), fours)), fzeroes);
629
630 sine =
631 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
632 sine = _mm256_sub_ps(
633 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
634 _mm256_storeu_ps(bPtr, sine);
635 aPtr += 8;
636 bPtr += 8;
637 }
638
639 number = eighthPoints * 8;
640 for (; number < num_points; number++) {
641 *bPtr++ = sin(*aPtr++);
642 }
643}
644
645#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
646
647#ifdef LV_HAVE_AVX2
648#include <immintrin.h>
649
650static inline void
651volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
652{
653 float* bPtr = bVector;
654 const float* aPtr = aVector;
655
656 unsigned int number = 0;
657 unsigned int eighthPoints = num_points / 8;
658 unsigned int i = 0;
659
660 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
661 fzeroes;
662 __m256 sine, cosine, condition1, condition2;
663 __m256i q, r, ones, twos, fours;
664
665 m4pi = _mm256_set1_ps(1.273239545);
666 pio4A = _mm256_set1_ps(0.78515625);
667 pio4B = _mm256_set1_ps(0.241876e-3);
668 ffours = _mm256_set1_ps(4.0);
669 ftwos = _mm256_set1_ps(2.0);
670 fones = _mm256_set1_ps(1.0);
671 fzeroes = _mm256_setzero_ps();
672 ones = _mm256_set1_epi32(1);
673 twos = _mm256_set1_epi32(2);
674 fours = _mm256_set1_epi32(4);
675
676 cp1 = _mm256_set1_ps(1.0);
677 cp2 = _mm256_set1_ps(0.83333333e-1);
678 cp3 = _mm256_set1_ps(0.2777778e-2);
679 cp4 = _mm256_set1_ps(0.49603e-4);
680 cp5 = _mm256_set1_ps(0.551e-6);
681
682 for (; number < eighthPoints; number++) {
683 aVal = _mm256_loadu_ps(aPtr);
684 s = _mm256_sub_ps(aVal,
685 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
686 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
687 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
688 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
689
690 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
691 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
692
693 s = _mm256_div_ps(
694 s,
695 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
696 s = _mm256_mul_ps(s, s);
697 // Evaluate Taylor series
698 s = _mm256_mul_ps(
699 _mm256_add_ps(
700 _mm256_mul_ps(
701 _mm256_sub_ps(
702 _mm256_mul_ps(
703 _mm256_add_ps(
704 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
705 s),
706 cp3),
707 s),
708 cp2),
709 s),
710 cp1),
711 s);
712
713 for (i = 0; i < 3; i++) {
714 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
715 }
716 s = _mm256_div_ps(s, ftwos);
717
718 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
719 cosine = _mm256_sub_ps(fones, s);
720
721 condition1 = _mm256_cmp_ps(
722 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
723 fzeroes,
724 _CMP_NEQ_UQ);
725 condition2 = _mm256_cmp_ps(
726 _mm256_cmp_ps(
727 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
728 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
729 _CMP_NEQ_UQ);
730 // Need this condition only for cos
731 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
732 // twos), fours)), fzeroes);
733
734 sine =
735 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
736 sine = _mm256_sub_ps(
737 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
738 _mm256_storeu_ps(bPtr, sine);
739 aPtr += 8;
740 bPtr += 8;
741 }
742
743 number = eighthPoints * 8;
744 for (; number < num_points; number++) {
745 *bPtr++ = sin(*aPtr++);
746 }
747}
748
749#endif /* LV_HAVE_AVX2 for unaligned */
750
751
752#ifdef LV_HAVE_SSE4_1
753#include <smmintrin.h>
754
755static inline void
756volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
757{
758 float* bPtr = bVector;
759 const float* aPtr = aVector;
760
761 unsigned int number = 0;
762 unsigned int quarterPoints = num_points / 4;
763 unsigned int i = 0;
764
765 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
766 fzeroes;
767 __m128 sine, cosine, condition1, condition2;
768 __m128i q, r, ones, twos, fours;
769
770 m4pi = _mm_set1_ps(1.273239545);
771 pio4A = _mm_set1_ps(0.78515625);
772 pio4B = _mm_set1_ps(0.241876e-3);
773 ffours = _mm_set1_ps(4.0);
774 ftwos = _mm_set1_ps(2.0);
775 fones = _mm_set1_ps(1.0);
776 fzeroes = _mm_setzero_ps();
777 ones = _mm_set1_epi32(1);
778 twos = _mm_set1_epi32(2);
779 fours = _mm_set1_epi32(4);
780
781 cp1 = _mm_set1_ps(1.0);
782 cp2 = _mm_set1_ps(0.83333333e-1);
783 cp3 = _mm_set1_ps(0.2777778e-2);
784 cp4 = _mm_set1_ps(0.49603e-4);
785 cp5 = _mm_set1_ps(0.551e-6);
786
787 for (; number < quarterPoints; number++) {
788 aVal = _mm_loadu_ps(aPtr);
789 s = _mm_sub_ps(aVal,
790 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
792 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
793
794 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
795 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
796
797 s = _mm_div_ps(
798 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
799 s = _mm_mul_ps(s, s);
800 // Evaluate Taylor series
801 s = _mm_mul_ps(
806 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
807 cp3),
808 s),
809 cp2),
810 s),
811 cp1),
812 s);
813
814 for (i = 0; i < 3; i++) {
815 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
816 }
817 s = _mm_div_ps(s, ftwos);
818
819 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
820 cosine = _mm_sub_ps(fones, s);
821
822 condition1 = _mm_cmpneq_ps(
823 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
824 condition2 = _mm_cmpneq_ps(
825 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
826 _mm_cmplt_ps(aVal, fzeroes));
827
828 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
829 sine =
830 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
831 _mm_storeu_ps(bPtr, sine);
832 aPtr += 4;
833 bPtr += 4;
834 }
835
836 number = quarterPoints * 4;
837 for (; number < num_points; number++) {
838 *bPtr++ = sinf(*aPtr++);
839 }
840}
841
842#endif /* LV_HAVE_SSE4_1 for unaligned */
843
844
845#ifdef LV_HAVE_GENERIC
846
847static inline void
848volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
849{
850 float* bPtr = bVector;
851 const float* aPtr = aVector;
852 unsigned int number = 0;
853
854 for (number = 0; number < num_points; number++) {
855 *bPtr++ = sinf(*aPtr++);
856 }
857}
858
859#endif /* LV_HAVE_GENERIC */
860
861
862#ifdef LV_HAVE_NEON
863#include <arm_neon.h>
865
866static inline void
867volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
868{
869 unsigned int number = 0;
870 unsigned int quarter_points = num_points / 4;
871 float* bVectorPtr = bVector;
872 const float* aVectorPtr = aVector;
873
874 float32x4_t b_vec;
875 float32x4_t a_vec;
876
877 for (number = 0; number < quarter_points; number++) {
878 a_vec = vld1q_f32(aVectorPtr);
879 // Prefetch next one, speeds things up
880 __VOLK_PREFETCH(aVectorPtr + 4);
881 b_vec = _vsinq_f32(a_vec);
882 vst1q_f32(bVectorPtr, b_vec);
883 // move pointers ahead
884 bVectorPtr += 4;
885 aVectorPtr += 4;
886 }
887
888 // Deal with the rest
889 for (number = quarter_points * 4; number < num_points; number++) {
890 *bVectorPtr++ = sinf(*aVectorPtr++);
891 }
892}
893
894#endif /* LV_HAVE_NEON */
895
896
897#endif /* INCLUDED_volk_32f_sin_32f_u_H */