Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59#include <inttypes.h>
60#include <math.h>
61#include <stdio.h>
62
63#ifndef INCLUDED_volk_32f_cos_32f_a_H
64#define INCLUDED_volk_32f_cos_32f_a_H
65
66#ifdef LV_HAVE_AVX512F
67
68#include <immintrin.h>
69static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
70 const float* inVector,
71 unsigned int num_points)
72{
73 float* cosPtr = cosVector;
74 const float* inPtr = inVector;
75
76 unsigned int number = 0;
77 unsigned int sixteenPoints = num_points / 16;
78 unsigned int i = 0;
79
80 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
81 fones, sine, cosine;
82 __m512i q, zeros, ones, twos, fours;
83
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
95
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2;
102
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
105 // s = fabs(aVal)
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107
108 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110 // r = q + q&1, q indicates quadrant, r gives
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
116
117 s = _mm512_div_ps(
118 s,
119 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120 s = _mm512_mul_ps(s, s);
121 // Evaluate Taylor series
122 s = _mm512_mul_ps(
123 _mm512_fmadd_ps(
124 _mm512_fmsub_ps(
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126 s,
127 cp1),
128 s);
129
130 for (i = 0; i < 3; i++)
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 s = _mm512_div_ps(s, ftwos);
133
134 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135 cosine = _mm512_sub_ps(fones, s);
136
137 // if(((q+1)&2) != 0) { cosine=sine;}
138 condition1 = _mm512_cmpneq_epi32_mask(
139 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
140
141 // if(((q+2)&4) != 0) { cosine = -cosine;}
142 condition2 = _mm512_cmpneq_epi32_mask(
143 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
144 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
145 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
146 _mm512_store_ps(cosPtr, cosine);
147 inPtr += 16;
148 cosPtr += 16;
149 }
150
151 number = sixteenPoints * 16;
152 for (; number < num_points; number++) {
153 *cosPtr++ = cosf(*inPtr++);
154 }
155}
156#endif
157
158#if LV_HAVE_AVX2 && LV_HAVE_FMA
159#include <immintrin.h>
160
161static inline void
162volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
163{
164 float* bPtr = bVector;
165 const float* aPtr = aVector;
166
167 unsigned int number = 0;
168 unsigned int eighthPoints = num_points / 8;
169 unsigned int i = 0;
170
171 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
172 fones, fzeroes;
173 __m256 sine, cosine;
174 __m256i q, ones, twos, fours;
175
176 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
177 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
178 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
179 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
180 ffours = _mm256_set1_ps(4.0);
181 ftwos = _mm256_set1_ps(2.0);
182 fones = _mm256_set1_ps(1.0);
183 fzeroes = _mm256_setzero_ps();
184 __m256i zeroes = _mm256_set1_epi32(0);
185 ones = _mm256_set1_epi32(1);
186 __m256i allones = _mm256_set1_epi32(0xffffffff);
187 twos = _mm256_set1_epi32(2);
188 fours = _mm256_set1_epi32(4);
189
190 cp1 = _mm256_set1_ps(1.0);
191 cp2 = _mm256_set1_ps(0.08333333333333333);
192 cp3 = _mm256_set1_ps(0.002777777777777778);
193 cp4 = _mm256_set1_ps(4.96031746031746e-05);
194 cp5 = _mm256_set1_ps(5.511463844797178e-07);
195 union bit256 condition1;
196 union bit256 condition3;
197
198 for (; number < eighthPoints; number++) {
199
200 aVal = _mm256_load_ps(aPtr);
201 // s = fabs(aVal)
202 s = _mm256_sub_ps(aVal,
203 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
204 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
205 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
206 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
207 // r = q + q&1, q indicates quadrant, r gives
208 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
209
210 s = _mm256_fnmadd_ps(r, pio4A, s);
211 s = _mm256_fnmadd_ps(r, pio4B, s);
212 s = _mm256_fnmadd_ps(r, pio4C, s);
213
214 s = _mm256_div_ps(
215 s,
216 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
217 s = _mm256_mul_ps(s, s);
218 // Evaluate Taylor series
219 s = _mm256_mul_ps(
220 _mm256_fmadd_ps(
221 _mm256_fmsub_ps(
222 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
223 s,
224 cp1),
225 s);
226
227 for (i = 0; i < 3; i++)
228 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
229 s = _mm256_div_ps(s, ftwos);
230
231 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
232 cosine = _mm256_sub_ps(fones, s);
233
234 // if(((q+1)&2) != 0) { cosine=sine;}
235 condition1.int_vec =
236 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
237 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
238
239 // if(((q+2)&4) != 0) { cosine = -cosine;}
240 condition3.int_vec = _mm256_cmpeq_epi32(
241 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
242 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
243
244 cosine = _mm256_add_ps(
245 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
246 cosine = _mm256_sub_ps(cosine,
247 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
248 condition3.float_vec));
249 _mm256_store_ps(bPtr, cosine);
250 aPtr += 8;
251 bPtr += 8;
252 }
253
254 number = eighthPoints * 8;
255 for (; number < num_points; number++) {
256 *bPtr++ = cos(*aPtr++);
257 }
258}
259
260#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
261
262#ifdef LV_HAVE_AVX2
263#include <immintrin.h>
264
265static inline void
266volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
267{
268 float* bPtr = bVector;
269 const float* aPtr = aVector;
270
271 unsigned int number = 0;
272 unsigned int eighthPoints = num_points / 8;
273 unsigned int i = 0;
274
275 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
276 fones, fzeroes;
277 __m256 sine, cosine;
278 __m256i q, ones, twos, fours;
279
280 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
281 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
282 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
283 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
284 ffours = _mm256_set1_ps(4.0);
285 ftwos = _mm256_set1_ps(2.0);
286 fones = _mm256_set1_ps(1.0);
287 fzeroes = _mm256_setzero_ps();
288 __m256i zeroes = _mm256_set1_epi32(0);
289 ones = _mm256_set1_epi32(1);
290 __m256i allones = _mm256_set1_epi32(0xffffffff);
291 twos = _mm256_set1_epi32(2);
292 fours = _mm256_set1_epi32(4);
293
294 cp1 = _mm256_set1_ps(1.0);
295 cp2 = _mm256_set1_ps(0.08333333333333333);
296 cp3 = _mm256_set1_ps(0.002777777777777778);
297 cp4 = _mm256_set1_ps(4.96031746031746e-05);
298 cp5 = _mm256_set1_ps(5.511463844797178e-07);
299 union bit256 condition1;
300 union bit256 condition3;
301
302 for (; number < eighthPoints; number++) {
303
304 aVal = _mm256_load_ps(aPtr);
305 // s = fabs(aVal)
306 s = _mm256_sub_ps(aVal,
307 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
308 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
309 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
310 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
311 // r = q + q&1, q indicates quadrant, r gives
312 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
313
314 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
315 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
316 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
317
318 s = _mm256_div_ps(
319 s,
320 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
321 s = _mm256_mul_ps(s, s);
322 // Evaluate Taylor series
323 s = _mm256_mul_ps(
324 _mm256_add_ps(
325 _mm256_mul_ps(
326 _mm256_sub_ps(
327 _mm256_mul_ps(
328 _mm256_add_ps(
329 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
330 s),
331 cp3),
332 s),
333 cp2),
334 s),
335 cp1),
336 s);
337
338 for (i = 0; i < 3; i++)
339 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
340 s = _mm256_div_ps(s, ftwos);
341
342 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
343 cosine = _mm256_sub_ps(fones, s);
344
345 // if(((q+1)&2) != 0) { cosine=sine;}
346 condition1.int_vec =
347 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
348 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
349
350 // if(((q+2)&4) != 0) { cosine = -cosine;}
351 condition3.int_vec = _mm256_cmpeq_epi32(
352 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
353 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
354
355 cosine = _mm256_add_ps(
356 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
357 cosine = _mm256_sub_ps(cosine,
358 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
359 condition3.float_vec));
360 _mm256_store_ps(bPtr, cosine);
361 aPtr += 8;
362 bPtr += 8;
363 }
364
365 number = eighthPoints * 8;
366 for (; number < num_points; number++) {
367 *bPtr++ = cos(*aPtr++);
368 }
369}
370
371#endif /* LV_HAVE_AVX2 for aligned */
372
373#ifdef LV_HAVE_SSE4_1
374#include <smmintrin.h>
375
376static inline void
377volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
378{
379 float* bPtr = bVector;
380 const float* aPtr = aVector;
381
382 unsigned int number = 0;
383 unsigned int quarterPoints = num_points / 4;
384 unsigned int i = 0;
385
386 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
387 fones, fzeroes;
388 __m128 sine, cosine;
389 __m128i q, ones, twos, fours;
390
391 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
392 pio4A = _mm_set1_ps(0.7853981554508209228515625);
393 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
394 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
395 ffours = _mm_set1_ps(4.0);
396 ftwos = _mm_set1_ps(2.0);
397 fones = _mm_set1_ps(1.0);
398 fzeroes = _mm_setzero_ps();
399 __m128i zeroes = _mm_set1_epi32(0);
400 ones = _mm_set1_epi32(1);
401 __m128i allones = _mm_set1_epi32(0xffffffff);
402 twos = _mm_set1_epi32(2);
403 fours = _mm_set1_epi32(4);
404
405 cp1 = _mm_set1_ps(1.0);
406 cp2 = _mm_set1_ps(0.08333333333333333);
407 cp3 = _mm_set1_ps(0.002777777777777778);
408 cp4 = _mm_set1_ps(4.96031746031746e-05);
409 cp5 = _mm_set1_ps(5.511463844797178e-07);
410 union bit128 condition1;
411 union bit128 condition3;
412
413 for (; number < quarterPoints; number++) {
414
415 aVal = _mm_load_ps(aPtr);
416 // s = fabs(aVal)
417 s = _mm_sub_ps(aVal,
418 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
419 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
421 // r = q + q&1, q indicates quadrant, r gives
423
424 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
425 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
426 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
427
428 s = _mm_div_ps(
429 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
430 s = _mm_mul_ps(s, s);
431 // Evaluate Taylor series
432 s = _mm_mul_ps(
437 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
438 cp3),
439 s),
440 cp2),
441 s),
442 cp1),
443 s);
444
445 for (i = 0; i < 3; i++)
446 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
447 s = _mm_div_ps(s, ftwos);
448
449 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
450 cosine = _mm_sub_ps(fones, s);
451
452 // if(((q+1)&2) != 0) { cosine=sine;}
453 condition1.int_vec =
454 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
455 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
456
457 // if(((q+2)&4) != 0) { cosine = -cosine;}
458 condition3.int_vec =
459 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
460 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
461
462 cosine = _mm_add_ps(cosine,
463 _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
464 cosine = _mm_sub_ps(
465 cosine,
466 _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
467 _mm_store_ps(bPtr, cosine);
468 aPtr += 4;
469 bPtr += 4;
470 }
471
472 number = quarterPoints * 4;
473 for (; number < num_points; number++) {
474 *bPtr++ = cosf(*aPtr++);
475 }
476}
477
478#endif /* LV_HAVE_SSE4_1 for aligned */
479
480#endif /* INCLUDED_volk_32f_cos_32f_a_H */
481
482
483#ifndef INCLUDED_volk_32f_cos_32f_u_H
484#define INCLUDED_volk_32f_cos_32f_u_H
485
486#ifdef LV_HAVE_AVX512F
487
488#include <immintrin.h>
489static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
490 const float* inVector,
491 unsigned int num_points)
492{
493 float* cosPtr = cosVector;
494 const float* inPtr = inVector;
495
496 unsigned int number = 0;
497 unsigned int sixteenPoints = num_points / 16;
498 unsigned int i = 0;
499
500 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
501 fones, sine, cosine;
502 __m512i q, zeros, ones, twos, fours;
503
504 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
505 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
506 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
507 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
508 ffours = _mm512_set1_ps(4.0);
509 ftwos = _mm512_set1_ps(2.0);
510 fones = _mm512_set1_ps(1.0);
511 zeros = _mm512_setzero_epi32();
512 ones = _mm512_set1_epi32(1);
513 twos = _mm512_set1_epi32(2);
514 fours = _mm512_set1_epi32(4);
515
516 cp1 = _mm512_set1_ps(1.0);
517 cp2 = _mm512_set1_ps(0.08333333333333333);
518 cp3 = _mm512_set1_ps(0.002777777777777778);
519 cp4 = _mm512_set1_ps(4.96031746031746e-05);
520 cp5 = _mm512_set1_ps(5.511463844797178e-07);
521 __mmask16 condition1, condition2;
522 for (; number < sixteenPoints; number++) {
523 aVal = _mm512_loadu_ps(inPtr);
524 // s = fabs(aVal)
525 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
526
527 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
528 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
529 // r = q + q&1, q indicates quadrant, r gives
530 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
531
532 s = _mm512_fnmadd_ps(r, pio4A, s);
533 s = _mm512_fnmadd_ps(r, pio4B, s);
534 s = _mm512_fnmadd_ps(r, pio4C, s);
535
536 s = _mm512_div_ps(
537 s,
538 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
539 s = _mm512_mul_ps(s, s);
540 // Evaluate Taylor series
541 s = _mm512_mul_ps(
542 _mm512_fmadd_ps(
543 _mm512_fmsub_ps(
544 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
545 s,
546 cp1),
547 s);
548
549 for (i = 0; i < 3; i++)
550 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
551 s = _mm512_div_ps(s, ftwos);
552
553 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
554 cosine = _mm512_sub_ps(fones, s);
555
556 // if(((q+1)&2) != 0) { cosine=sine;}
557 condition1 = _mm512_cmpneq_epi32_mask(
558 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
559
560 // if(((q+2)&4) != 0) { cosine = -cosine;}
561 condition2 = _mm512_cmpneq_epi32_mask(
562 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
563
564 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
565 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
566 _mm512_storeu_ps(cosPtr, cosine);
567 inPtr += 16;
568 cosPtr += 16;
569 }
570
571 number = sixteenPoints * 16;
572 for (; number < num_points; number++) {
573 *cosPtr++ = cosf(*inPtr++);
574 }
575}
576#endif
577
578#if LV_HAVE_AVX2 && LV_HAVE_FMA
579#include <immintrin.h>
580
581static inline void
582volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
583{
584 float* bPtr = bVector;
585 const float* aPtr = aVector;
586
587 unsigned int number = 0;
588 unsigned int eighthPoints = num_points / 8;
589 unsigned int i = 0;
590
591 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
592 fones, fzeroes;
593 __m256 sine, cosine;
594 __m256i q, ones, twos, fours;
595
596 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
597 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
598 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
599 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
600 ffours = _mm256_set1_ps(4.0);
601 ftwos = _mm256_set1_ps(2.0);
602 fones = _mm256_set1_ps(1.0);
603 fzeroes = _mm256_setzero_ps();
604 __m256i zeroes = _mm256_set1_epi32(0);
605 ones = _mm256_set1_epi32(1);
606 __m256i allones = _mm256_set1_epi32(0xffffffff);
607 twos = _mm256_set1_epi32(2);
608 fours = _mm256_set1_epi32(4);
609
610 cp1 = _mm256_set1_ps(1.0);
611 cp2 = _mm256_set1_ps(0.08333333333333333);
612 cp3 = _mm256_set1_ps(0.002777777777777778);
613 cp4 = _mm256_set1_ps(4.96031746031746e-05);
614 cp5 = _mm256_set1_ps(5.511463844797178e-07);
615 union bit256 condition1;
616 union bit256 condition3;
617
618 for (; number < eighthPoints; number++) {
619
620 aVal = _mm256_loadu_ps(aPtr);
621 // s = fabs(aVal)
622 s = _mm256_sub_ps(aVal,
623 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
624 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
625 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
626 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
627 // r = q + q&1, q indicates quadrant, r gives
628 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
629
630 s = _mm256_fnmadd_ps(r, pio4A, s);
631 s = _mm256_fnmadd_ps(r, pio4B, s);
632 s = _mm256_fnmadd_ps(r, pio4C, s);
633
634 s = _mm256_div_ps(
635 s,
636 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
637 s = _mm256_mul_ps(s, s);
638 // Evaluate Taylor series
639 s = _mm256_mul_ps(
640 _mm256_fmadd_ps(
641 _mm256_fmsub_ps(
642 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
643 s,
644 cp1),
645 s);
646
647 for (i = 0; i < 3; i++)
648 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
649 s = _mm256_div_ps(s, ftwos);
650
651 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
652 cosine = _mm256_sub_ps(fones, s);
653
654 // if(((q+1)&2) != 0) { cosine=sine;}
655 condition1.int_vec =
656 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
657 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
658
659 // if(((q+2)&4) != 0) { cosine = -cosine;}
660 condition3.int_vec = _mm256_cmpeq_epi32(
661 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
662 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
663
664 cosine = _mm256_add_ps(
665 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
666 cosine = _mm256_sub_ps(cosine,
667 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
668 condition3.float_vec));
669 _mm256_storeu_ps(bPtr, cosine);
670 aPtr += 8;
671 bPtr += 8;
672 }
673
674 number = eighthPoints * 8;
675 for (; number < num_points; number++) {
676 *bPtr++ = cos(*aPtr++);
677 }
678}
679
680#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
681
682#ifdef LV_HAVE_AVX2
683#include <immintrin.h>
684
685static inline void
686volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
687{
688 float* bPtr = bVector;
689 const float* aPtr = aVector;
690
691 unsigned int number = 0;
692 unsigned int eighthPoints = num_points / 8;
693 unsigned int i = 0;
694
695 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
696 fones, fzeroes;
697 __m256 sine, cosine;
698 __m256i q, ones, twos, fours;
699
700 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
701 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
702 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
703 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
704 ffours = _mm256_set1_ps(4.0);
705 ftwos = _mm256_set1_ps(2.0);
706 fones = _mm256_set1_ps(1.0);
707 fzeroes = _mm256_setzero_ps();
708 __m256i zeroes = _mm256_set1_epi32(0);
709 ones = _mm256_set1_epi32(1);
710 __m256i allones = _mm256_set1_epi32(0xffffffff);
711 twos = _mm256_set1_epi32(2);
712 fours = _mm256_set1_epi32(4);
713
714 cp1 = _mm256_set1_ps(1.0);
715 cp2 = _mm256_set1_ps(0.08333333333333333);
716 cp3 = _mm256_set1_ps(0.002777777777777778);
717 cp4 = _mm256_set1_ps(4.96031746031746e-05);
718 cp5 = _mm256_set1_ps(5.511463844797178e-07);
719 union bit256 condition1;
720 union bit256 condition3;
721
722 for (; number < eighthPoints; number++) {
723
724 aVal = _mm256_loadu_ps(aPtr);
725 // s = fabs(aVal)
726 s = _mm256_sub_ps(aVal,
727 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
728 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
729 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
730 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
731 // r = q + q&1, q indicates quadrant, r gives
732 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
733
734 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
735 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
736 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
737
738 s = _mm256_div_ps(
739 s,
740 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
741 s = _mm256_mul_ps(s, s);
742 // Evaluate Taylor series
743 s = _mm256_mul_ps(
744 _mm256_add_ps(
745 _mm256_mul_ps(
746 _mm256_sub_ps(
747 _mm256_mul_ps(
748 _mm256_add_ps(
749 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
750 s),
751 cp3),
752 s),
753 cp2),
754 s),
755 cp1),
756 s);
757
758 for (i = 0; i < 3; i++)
759 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
760 s = _mm256_div_ps(s, ftwos);
761
762 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
763 cosine = _mm256_sub_ps(fones, s);
764
765 // if(((q+1)&2) != 0) { cosine=sine;}
766 condition1.int_vec =
767 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
768 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
769
770 // if(((q+2)&4) != 0) { cosine = -cosine;}
771 condition3.int_vec = _mm256_cmpeq_epi32(
772 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
773 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
774
775 cosine = _mm256_add_ps(
776 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
777 cosine = _mm256_sub_ps(cosine,
778 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
779 condition3.float_vec));
780 _mm256_storeu_ps(bPtr, cosine);
781 aPtr += 8;
782 bPtr += 8;
783 }
784
785 number = eighthPoints * 8;
786 for (; number < num_points; number++) {
787 *bPtr++ = cos(*aPtr++);
788 }
789}
790
791#endif /* LV_HAVE_AVX2 for unaligned */
792
793#ifdef LV_HAVE_SSE4_1
794#include <smmintrin.h>
795
796static inline void
797volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
798{
799 float* bPtr = bVector;
800 const float* aPtr = aVector;
801
802 unsigned int number = 0;
803 unsigned int quarterPoints = num_points / 4;
804 unsigned int i = 0;
805
806 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
807 fzeroes;
808 __m128 sine, cosine, condition1, condition3;
809 __m128i q, r, ones, twos, fours;
810
811 m4pi = _mm_set1_ps(1.273239545);
812 pio4A = _mm_set1_ps(0.78515625);
813 pio4B = _mm_set1_ps(0.241876e-3);
814 ffours = _mm_set1_ps(4.0);
815 ftwos = _mm_set1_ps(2.0);
816 fones = _mm_set1_ps(1.0);
817 fzeroes = _mm_setzero_ps();
818 ones = _mm_set1_epi32(1);
819 twos = _mm_set1_epi32(2);
820 fours = _mm_set1_epi32(4);
821
822 cp1 = _mm_set1_ps(1.0);
823 cp2 = _mm_set1_ps(0.83333333e-1);
824 cp3 = _mm_set1_ps(0.2777778e-2);
825 cp4 = _mm_set1_ps(0.49603e-4);
826 cp5 = _mm_set1_ps(0.551e-6);
827
828 for (; number < quarterPoints; number++) {
829 aVal = _mm_loadu_ps(aPtr);
830 s = _mm_sub_ps(aVal,
831 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
833 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
834
835 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
836 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
837
838 s = _mm_div_ps(
839 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
840 s = _mm_mul_ps(s, s);
841 // Evaluate Taylor series
842 s = _mm_mul_ps(
847 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
848 cp3),
849 s),
850 cp2),
851 s),
852 cp1),
853 s);
854
855 for (i = 0; i < 3; i++) {
856 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
857 }
858 s = _mm_div_ps(s, ftwos);
859
860 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
861 cosine = _mm_sub_ps(fones, s);
862
863 condition1 = _mm_cmpneq_ps(
864 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
865
866 condition3 = _mm_cmpneq_ps(
867 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
868
869 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
870 cosine = _mm_sub_ps(
871 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
872 _mm_storeu_ps(bPtr, cosine);
873 aPtr += 4;
874 bPtr += 4;
875 }
876
877 number = quarterPoints * 4;
878 for (; number < num_points; number++) {
879 *bPtr++ = cosf(*aPtr++);
880 }
881}
882
883#endif /* LV_HAVE_SSE4_1 for unaligned */
884
885
886#ifdef LV_HAVE_GENERIC
887
888/*
889 * For derivation see
890 * Shibata, Naoki, "Efficient evaluation methods of elementary functions
891 * suitable for SIMD computation," in Springer-Verlag 2010
892 */
893static inline void volk_32f_cos_32f_generic_fast(float* bVector,
894 const float* aVector,
895 unsigned int num_points)
896{
897 float* bPtr = bVector;
898 const float* aPtr = aVector;
899
900 float m4pi = 1.273239544735162542821171882678754627704620361328125;
901 float pio4A = 0.7853981554508209228515625;
902 float pio4B = 0.794662735614792836713604629039764404296875e-8;
903 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
904 int N = 3; // order of argument reduction
905
906 unsigned int number;
907 for (number = 0; number < num_points; number++) {
908 float s = fabs(*aPtr);
909 int q = (int)(s * m4pi);
910 int r = q + (q & 1);
911 s -= r * pio4A;
912 s -= r * pio4B;
913 s -= r * pio4C;
914
915 s = s * 0.125; // 2^-N (<--3)
916 s = s * s;
917 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
918 1.0) *
919 s;
920
921 int i;
922 for (i = 0; i < N; ++i) {
923 s = (4.0 - s) * s;
924 }
925 s = s / 2.0;
926
927 float sine = sqrt((2.0 - s) * s);
928 float cosine = 1 - s;
929
930 if (((q + 1) & 2) != 0) {
931 s = cosine;
932 cosine = sine;
933 sine = s;
934 }
935 if (((q + 2) & 4) != 0) {
936 cosine = -cosine;
937 }
938 *bPtr = cosine;
939 bPtr++;
940 aPtr++;
941 }
942}
943
944#endif /* LV_HAVE_GENERIC */
945
946
947#ifdef LV_HAVE_GENERIC
948
949static inline void
950volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
951{
952 float* bPtr = bVector;
953 const float* aPtr = aVector;
954 unsigned int number = 0;
955
956 for (; number < num_points; number++) {
957 *bPtr++ = cosf(*aPtr++);
958 }
959}
960
961#endif /* LV_HAVE_GENERIC */
962
963
964#ifdef LV_HAVE_NEON
965#include <arm_neon.h>
967
968static inline void
969volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
970{
971 unsigned int number = 0;
972 unsigned int quarter_points = num_points / 4;
973 float* bVectorPtr = bVector;
974 const float* aVectorPtr = aVector;
975
976 float32x4_t b_vec;
977 float32x4_t a_vec;
978
979 for (number = 0; number < quarter_points; number++) {
980 a_vec = vld1q_f32(aVectorPtr);
981 // Prefetch next one, speeds things up
982 __VOLK_PREFETCH(aVectorPtr + 4);
983 b_vec = _vcosq_f32(a_vec);
984 vst1q_f32(bVectorPtr, b_vec);
985 // move pointers ahead
986 bVectorPtr += 4;
987 aVectorPtr += 4;
988 }
989
990 // Deal with the rest
991 for (number = quarter_points * 4; number < num_points; number++) {
992 *bVectorPtr++ = cosf(*aVectorPtr++);
993 }
994}
995
996#endif /* LV_HAVE_NEON */
997
998
999#endif /* INCLUDED_volk_32f_cos_32f_u_H */