60#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
61#define INCLUDED_volk_32f_x2_pow_32f_a_H
68#define POW_POLY_DEGREE 3
70#if LV_HAVE_AVX2 && LV_HAVE_FMA
// Horner-scheme polynomial evaluators built on AVX2 fused multiply-add:
// POLYn_AVX2_FMA(x, c0..cn) computes c0 + c1*x + ... + cn*x^n for a __m256 x,
// one _mm256_fmadd_ps per degree.
73#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
74#define POLY1_AVX2_FMA(x, c0, c1) \
75    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
76#define POLY2_AVX2_FMA(x, c0, c1, c2) \
77    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
78#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
79    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
80#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
81    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
82#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
83    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
// Aligned AVX2+FMA kernel: cVector[i] = aVector[i] ^ bVector[i] (see tail
// loop: pow(a, b)), computed vectorized as exp(b * log(a)).  log(a) is
// assembled from the IEEE-754 exponent bits plus a POW_POLY_DEGREE minimax
// polynomial in the mantissa; exp() uses Cephes-style range reduction.
// Processes 8 floats per iteration; remainder handled scalar.
// NOTE(review): this extract is missing several source lines (the bVector/
// aVector parameters of the signature, parts of the exp/frac/emm0 statements,
// and the per-iteration pointer advances) -- confirm against upstream VOLK.
85static inline void volk_32f_x2_pow_32f_a_avx2_fma(
    float* cVector,
88    unsigned int num_points)
90    float* cPtr = cVector;
91    const float* bPtr = bVector;
92    const float* aPtr = aVector;
94    unsigned int number = 0;
95    const unsigned int eighthPoints = num_points / 8;
97    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
98    __m256 tmp, fx, mask, pow2n, z, y;
99    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
100    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
101    __m256i bias, exp, emm0, pi32_0x7f;
    // exp() constants: exp_hi/exp_lo clamp the argument to avoid float
    // overflow/underflow; log2EF = log2(e); exp_C1 + exp_C2 together form a
    // split representation of ln(2) for extra precision (Cephes expf style).
103    one = _mm256_set1_ps(1.0);
104    exp_hi = _mm256_set1_ps(88.3762626647949);
105    exp_lo = _mm256_set1_ps(-88.3762626647949);
106    ln2 = _mm256_set1_ps(0.6931471805);
107    log2EF = _mm256_set1_ps(1.44269504088896341);
108    half = _mm256_set1_ps(0.5);
109    exp_C1 = _mm256_set1_ps(0.693359375);
110    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
111    pi32_0x7f = _mm256_set1_epi32(0x7f);
    // Degree-5 polynomial coefficients for the exp() core approximation.
113    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
114    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
115    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
116    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
117    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
118    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
    // Main vector loop: 8 elements per pass.
120    for (; number < eighthPoints; number++) {
122        aVal = _mm256_load_ps(aPtr);
123        bias = _mm256_set1_epi32(127);
124        leadingOne = _mm256_set1_ps(1.0f);
        // Isolate the biased exponent field (mask 0x7f800000, shift right)
        // and remove the bias -> integer part of log2(a).
        // NOTE(review): continuation lines of this statement are missing here.
125        exp = _mm256_sub_epi32(
126            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
127                                               _mm256_set1_epi32(0x7f800000)),
130        logarithm = _mm256_cvtepi32_ps(exp);
134            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
        // Minimax polynomial approximating log2 of the mantissa in [1, 2);
        // degree selected at compile time via POW_POLY_DEGREE.
136#if POW_POLY_DEGREE == 6
137        mantissa = POLY5_AVX2_FMA(frac,
144#elif POW_POLY_DEGREE == 5
145        mantissa = POLY4_AVX2_FMA(frac,
146                                  2.8882704548164776201f,
147                                  -2.52074962577807006663f,
148                                  1.48116647521213171641f,
149                                  -0.465725644288844778798f,
150                                  0.0596515482674574969533f);
151#elif POW_POLY_DEGREE == 4
152        mantissa = POLY3_AVX2_FMA(frac,
153                                  2.61761038894603480148f,
154                                  -1.75647175389045657003f,
155                                  0.688243882994381274313f,
156                                  -0.107254423828329604454f);
157#elif POW_POLY_DEGREE == 3
158        mantissa = POLY2_AVX2_FMA(frac,
159                                  2.28330284476918490682f,
160                                  -1.04913055217340124191f,
161                                  0.204446009836232697516f);
        // Combine exponent part with the mantissa polynomial, then convert
        // log2 -> natural log by multiplying with ln(2).
166        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
167        logarithm = _mm256_mul_ps(logarithm, ln2);
        // bVal = b * ln(a); the remainder of the loop computes exp(bVal).
170        bVal = _mm256_load_ps(bPtr);
171        bVal = _mm256_mul_ps(bVal, logarithm);
        // Clamp to the representable exp() input range.
174        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
        // fx = round-to-nearest(bVal * log2(e)) via +0.5 then truncate,
        // with a correction mask when truncation rounded up.
176        fx = _mm256_fmadd_ps(bVal, log2EF, half);
178        emm0 = _mm256_cvttps_epi32(fx);
179        tmp = _mm256_cvtepi32_ps(emm0);
181        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
182        fx = _mm256_sub_ps(tmp, mask);
        // Range-reduce: bVal -= fx * ln(2), using the two-part ln(2) split.
184        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
185        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
186        z = _mm256_mul_ps(bVal, bVal);
        // Degree-5 polynomial core of exp() on the reduced argument.
188        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
189        y = _mm256_fmadd_ps(y, bVal, exp_p2);
190        y = _mm256_fmadd_ps(y, bVal, exp_p3);
191        y = _mm256_fmadd_ps(y, bVal, exp_p4);
192        y = _mm256_fmadd_ps(y, bVal, exp_p5);
193        y = _mm256_fmadd_ps(y, z, bVal);
194        y = _mm256_add_ps(y, one);
197            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
199        pow2n = _mm256_castsi256_ps(emm0);
200        cVal = _mm256_mul_ps(y, pow2n);
202        _mm256_store_ps(cPtr, cVal);
    // Scalar tail for the remaining (num_points % 8) elements.
    // NOTE(review): uses double-precision pow() while the SSE/generic
    // variants use powf(); results can differ in the last ulp.
209    number = eighthPoints * 8;
210    for (; number < num_points; number++) {
211        *cPtr++ = pow(*aPtr++, *bPtr++);
218#include <immintrin.h>
// Horner-scheme polynomial evaluators for plain AVX2 (mul + add, no FMA):
// POLYn_AVX2(x, c0..cn) computes c0 + c1*x + ... + cn*x^n for a __m256 x.
220#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
221#define POLY1_AVX2(x, c0, c1) \
222    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
223#define POLY2_AVX2(x, c0, c1, c2) \
224    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
225#define POLY3_AVX2(x, c0, c1, c2, c3) \
226    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
227#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
228    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
229#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
230    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
// Aligned AVX2 (no FMA) kernel: cVector[i] = pow(aVector[i], bVector[i]),
// computed as exp(b * log(a)).  Same algorithm as the FMA variant but with
// separate multiply/add intrinsics.  8 floats per iteration; scalar tail.
// NOTE(review): some source lines (statement continuations for exp/frac/emm0
// and pointer advances) are missing from this extract -- confirm upstream.
232static inline void volk_32f_x2_pow_32f_a_avx2(
    float* cVector,
233    const float* bVector,
234    const float* aVector,
235    unsigned int num_points)
237    float* cPtr = cVector;
238    const float* bPtr = bVector;
239    const float* aPtr = aVector;
241    unsigned int number = 0;
242    const unsigned int eighthPoints = num_points / 8;
244    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
245    __m256 tmp, fx, mask, pow2n, z, y;
246    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
247    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
248    __m256i bias, exp, emm0, pi32_0x7f;
    // exp() constants (Cephes expf): clamp limits, log2(e), split ln(2).
250    one = _mm256_set1_ps(1.0);
251    exp_hi = _mm256_set1_ps(88.3762626647949);
252    exp_lo = _mm256_set1_ps(-88.3762626647949);
253    ln2 = _mm256_set1_ps(0.6931471805);
254    log2EF = _mm256_set1_ps(1.44269504088896341);
255    half = _mm256_set1_ps(0.5);
256    exp_C1 = _mm256_set1_ps(0.693359375);
257    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
258    pi32_0x7f = _mm256_set1_epi32(0x7f);
    // Degree-5 exp() core polynomial coefficients.
260    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
261    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
262    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
263    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
264    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
265    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
267    for (; number < eighthPoints; number++) {
269        aVal = _mm256_load_ps(aPtr);
270        bias = _mm256_set1_epi32(127);
271        leadingOne = _mm256_set1_ps(1.0f);
        // Integer part of log2(a) from the biased exponent bits.
272        exp = _mm256_sub_epi32(
273            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
274                                               _mm256_set1_epi32(0x7f800000)),
277        logarithm = _mm256_cvtepi32_ps(exp);
281            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
        // Mantissa polynomial (degree chosen by POW_POLY_DEGREE).
283#if POW_POLY_DEGREE == 6
284        mantissa = POLY5_AVX2(frac,
291#elif POW_POLY_DEGREE == 5
292        mantissa = POLY4_AVX2(frac,
293                              2.8882704548164776201f,
294                              -2.52074962577807006663f,
295                              1.48116647521213171641f,
296                              -0.465725644288844778798f,
297                              0.0596515482674574969533f);
298#elif POW_POLY_DEGREE == 4
299        mantissa = POLY3_AVX2(frac,
300                              2.61761038894603480148f,
301                              -1.75647175389045657003f,
302                              0.688243882994381274313f,
303                              -0.107254423828329604454f);
304#elif POW_POLY_DEGREE == 3
305        mantissa = POLY2_AVX2(frac,
306                              2.28330284476918490682f,
307                              -1.04913055217340124191f,
308                              0.204446009836232697516f);
        // Combine exponent + mantissa parts; scale log2 -> ln via ln(2).
313        logarithm = _mm256_add_ps(
314            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
315        logarithm = _mm256_mul_ps(logarithm, ln2);
        // bVal = b * ln(a); compute exp(bVal) below.
318        bVal = _mm256_load_ps(bPtr);
319        bVal = _mm256_mul_ps(bVal, logarithm);
322        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
        // Round bVal*log2(e) to nearest integer (truncate + correction mask).
324        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
326        emm0 = _mm256_cvttps_epi32(fx);
327        tmp = _mm256_cvtepi32_ps(emm0);
329        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
330        fx = _mm256_sub_ps(tmp, mask);
        // Range reduction: subtract fx * ln(2) using the two-part split.
332        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
333        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
334        z = _mm256_mul_ps(bVal, bVal);
        // Degree-5 polynomial core of exp().
336        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
337        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
338        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
339        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
340        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
341        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
342        y = _mm256_add_ps(y, one);
345            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
347        pow2n = _mm256_castsi256_ps(emm0);
348        cVal = _mm256_mul_ps(y, pow2n);
350        _mm256_store_ps(cPtr, cVal);
    // Scalar tail; NOTE(review): double-precision pow() here vs powf()
    // in the SSE/generic variants -- possible last-ulp differences.
357    number = eighthPoints * 8;
358    for (; number < num_points; number++) {
359        *cPtr++ = pow(*aPtr++, *bPtr++);
367#include <smmintrin.h>
// Horner-scheme polynomial evaluators for SSE (__m128):
// POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n.
369#define POLY0(x, c0) _mm_set1_ps(c0)
370#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
371#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
372#define POLY3(x, c0, c1, c2, c3) \
373    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
374#define POLY4(x, c0, c1, c2, c3, c4) \
375    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
376#define POLY5(x, c0, c1, c2, c3, c4, c5) \
377    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
// Aligned SSE4.1 kernel: cVector[i] = powf(aVector[i], bVector[i]) (see tail
// loop), vectorized as exp(b * log(a)) over 4 floats per iteration.
// NOTE(review): this extract is missing large spans of the body (the constant
// initialization block, the start of the loop body computing exp/frac, and
// everything between the mantissa polynomial and the scalar tail, including
// the log/exp combination, store, and pointer advances).  The structure
// mirrors the AVX2 variants; confirm against upstream VOLK.
379static inline void volk_32f_x2_pow_32f_a_sse4_1(
    float* cVector,
380    const float* bVector,
381    const float* aVector,
382    unsigned int num_points)
384    float* cPtr = cVector;
385    const float* bPtr = bVector;
386    const float* aPtr = aVector;
388    unsigned int number = 0;
389    const unsigned int quarterPoints = num_points / 4;
391    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
392    __m128 tmp, fx, mask, pow2n, z, y;
393    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
394    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
395    __m128i bias, exp, emm0, pi32_0x7f;
    // Main vector loop: 4 elements per pass.
414    for (; number < quarterPoints; number++) {
        // Mantissa polynomial (degree chosen by POW_POLY_DEGREE).
428#if POW_POLY_DEGREE == 6
429        mantissa = POLY5(frac,
436#elif POW_POLY_DEGREE == 5
437        mantissa = POLY4(frac,
438                         2.8882704548164776201f,
439                         -2.52074962577807006663f,
440                         1.48116647521213171641f,
441                         -0.465725644288844778798f,
442                         0.0596515482674574969533f);
443#elif POW_POLY_DEGREE == 4
444        mantissa = POLY3(frac,
445                         2.61761038894603480148f,
446                         -1.75647175389045657003f,
447                         0.688243882994381274313f,
448                         -0.107254423828329604454f);
449#elif POW_POLY_DEGREE == 3
450        mantissa = POLY2(frac,
451                         2.28330284476918490682f,
452                         -1.04913055217340124191f,
453                         0.204446009836232697516f);
    // Scalar tail for the remaining (num_points % 4) elements.
501    number = quarterPoints * 4;
502    for (; number < num_points; number++) {
503        *cPtr++ = powf(*aPtr++, *bPtr++);
511#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
512#define INCLUDED_volk_32f_x2_pow_32f_u_H
519#define POW_POLY_DEGREE 3
521#ifdef LV_HAVE_GENERIC
// Portable reference implementation: cVector[i] = powf(aVector[i], bVector[i])
// for every element, one scalar powf() call per point.
// NOTE(review): the first line(s) of the signature (return type, function
// name, cVector parameter) are missing from this extract -- presumably
// "static inline void volk_32f_x2_pow_32f_generic(float* cVector,";
// verify against the upstream VOLK header.
524    const float* bVector,
525    const float* aVector,
526    unsigned int num_points)
528    float* cPtr = cVector;
529    const float* bPtr = bVector;
530    const float* aPtr = aVector;
531    unsigned int number = 0;
    // Element-wise scalar loop over all points.
533    for (number = 0; number < num_points; number++) {
534        *cPtr++ = powf(*aPtr++, *bPtr++);
541#include <smmintrin.h>
// Horner-scheme polynomial evaluators for SSE (__m128):
// POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n.
543#define POLY0(x, c0) _mm_set1_ps(c0)
544#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
545#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
546#define POLY3(x, c0, c1, c2, c3) \
547    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
548#define POLY4(x, c0, c1, c2, c3, c4) \
549    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
550#define POLY5(x, c0, c1, c2, c3, c4, c5) \
551    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
// Unaligned SSE4.1 kernel: cVector[i] = powf(aVector[i], bVector[i]) (see
// tail loop), vectorized as exp(b * log(a)) over 4 floats per iteration.
// NOTE(review): this extract is missing large spans of the body (constant
// initialization, the start of the loop body, and everything between the
// mantissa polynomial and the scalar tail).  Structure mirrors the other
// SIMD variants; confirm against upstream VOLK.
553static inline void volk_32f_x2_pow_32f_u_sse4_1(
    float* cVector,
554    const float* bVector,
555    const float* aVector,
556    unsigned int num_points)
558    float* cPtr = cVector;
559    const float* bPtr = bVector;
560    const float* aPtr = aVector;
562    unsigned int number = 0;
563    const unsigned int quarterPoints = num_points / 4;
565    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
566    __m128 tmp, fx, mask, pow2n, z, y;
567    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
568    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
569    __m128i bias, exp, emm0, pi32_0x7f;
    // Main vector loop: 4 elements per pass.
588    for (; number < quarterPoints; number++) {
        // Mantissa polynomial (degree chosen by POW_POLY_DEGREE).
602#if POW_POLY_DEGREE == 6
603        mantissa = POLY5(frac,
610#elif POW_POLY_DEGREE == 5
611        mantissa = POLY4(frac,
612                         2.8882704548164776201f,
613                         -2.52074962577807006663f,
614                         1.48116647521213171641f,
615                         -0.465725644288844778798f,
616                         0.0596515482674574969533f);
617#elif POW_POLY_DEGREE == 4
618        mantissa = POLY3(frac,
619                         2.61761038894603480148f,
620                         -1.75647175389045657003f,
621                         0.688243882994381274313f,
622                         -0.107254423828329604454f);
623#elif POW_POLY_DEGREE == 3
624        mantissa = POLY2(frac,
625                         2.28330284476918490682f,
626                         -1.04913055217340124191f,
627                         0.204446009836232697516f);
    // Scalar tail for the remaining (num_points % 4) elements.
675    number = quarterPoints * 4;
676    for (; number < num_points; number++) {
677        *cPtr++ = powf(*aPtr++, *bPtr++);
683#if LV_HAVE_AVX2 && LV_HAVE_FMA
684#include <immintrin.h>
// Horner-scheme polynomial evaluators built on AVX2 fused multiply-add:
// POLYn_AVX2_FMA(x, c0..cn) computes c0 + c1*x + ... + cn*x^n for a __m256 x.
686#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
687#define POLY1_AVX2_FMA(x, c0, c1) \
688    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
689#define POLY2_AVX2_FMA(x, c0, c1, c2) \
690    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
691#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
692    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
693#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
694    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
695#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
696    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
// Unaligned AVX2+FMA kernel: cVector[i] = pow(aVector[i], bVector[i]) (see
// tail loop), vectorized as exp(b * log(a)).  Identical algorithm to the
// aligned variant but with _mm256_loadu_ps/_mm256_storeu_ps.
// NOTE(review): some statement continuations (exp/frac/emm0) and the
// per-iteration pointer advances are missing from this extract.
698static inline void volk_32f_x2_pow_32f_u_avx2_fma(
    float* cVector,
699    const float* bVector,
700    const float* aVector,
701    unsigned int num_points)
703    float* cPtr = cVector;
704    const float* bPtr = bVector;
705    const float* aPtr = aVector;
707    unsigned int number = 0;
708    const unsigned int eighthPoints = num_points / 8;
710    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
711    __m256 tmp, fx, mask, pow2n, z, y;
712    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
713    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
714    __m256i bias, exp, emm0, pi32_0x7f;
    // exp() constants (Cephes expf): clamp limits, log2(e), split ln(2).
716    one = _mm256_set1_ps(1.0);
717    exp_hi = _mm256_set1_ps(88.3762626647949);
718    exp_lo = _mm256_set1_ps(-88.3762626647949);
719    ln2 = _mm256_set1_ps(0.6931471805);
720    log2EF = _mm256_set1_ps(1.44269504088896341);
721    half = _mm256_set1_ps(0.5);
722    exp_C1 = _mm256_set1_ps(0.693359375);
723    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
724    pi32_0x7f = _mm256_set1_epi32(0x7f);
    // Degree-5 exp() core polynomial coefficients.
726    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
727    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
728    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
729    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
730    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
731    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
733    for (; number < eighthPoints; number++) {
735        aVal = _mm256_loadu_ps(aPtr);
736        bias = _mm256_set1_epi32(127);
737        leadingOne = _mm256_set1_ps(1.0f);
        // Integer part of log2(a) from the biased exponent bits.
738        exp = _mm256_sub_epi32(
739            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
740                                               _mm256_set1_epi32(0x7f800000)),
743        logarithm = _mm256_cvtepi32_ps(exp);
747            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
        // Mantissa polynomial (degree chosen by POW_POLY_DEGREE).
749#if POW_POLY_DEGREE == 6
750        mantissa = POLY5_AVX2_FMA(frac,
757#elif POW_POLY_DEGREE == 5
758        mantissa = POLY4_AVX2_FMA(frac,
759                                  2.8882704548164776201f,
760                                  -2.52074962577807006663f,
761                                  1.48116647521213171641f,
762                                  -0.465725644288844778798f,
763                                  0.0596515482674574969533f);
764#elif POW_POLY_DEGREE == 4
765        mantissa = POLY3_AVX2_FMA(frac,
766                                  2.61761038894603480148f,
767                                  -1.75647175389045657003f,
768                                  0.688243882994381274313f,
769                                  -0.107254423828329604454f);
770#elif POW_POLY_DEGREE == 3
771        mantissa = POLY2_AVX2_FMA(frac,
772                                  2.28330284476918490682f,
773                                  -1.04913055217340124191f,
774                                  0.204446009836232697516f);
        // Combine exponent + mantissa parts; scale log2 -> ln via ln(2).
779        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
780        logarithm = _mm256_mul_ps(logarithm, ln2);
        // bVal = b * ln(a); compute exp(bVal) below.
784        bVal = _mm256_loadu_ps(bPtr);
785        bVal = _mm256_mul_ps(bVal, logarithm);
788        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
        // Round bVal*log2(e) to nearest integer (truncate + correction mask).
790        fx = _mm256_fmadd_ps(bVal, log2EF, half);
792        emm0 = _mm256_cvttps_epi32(fx);
793        tmp = _mm256_cvtepi32_ps(emm0);
795        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
796        fx = _mm256_sub_ps(tmp, mask);
        // Range reduction with the two-part ln(2) split.
798        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
799        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
800        z = _mm256_mul_ps(bVal, bVal);
        // Degree-5 polynomial core of exp().
802        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
803        y = _mm256_fmadd_ps(y, bVal, exp_p2);
804        y = _mm256_fmadd_ps(y, bVal, exp_p3);
805        y = _mm256_fmadd_ps(y, bVal, exp_p4);
806        y = _mm256_fmadd_ps(y, bVal, exp_p5);
807        y = _mm256_fmadd_ps(y, z, bVal);
808        y = _mm256_add_ps(y, one);
811            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
813        pow2n = _mm256_castsi256_ps(emm0);
814        cVal = _mm256_mul_ps(y, pow2n);
816        _mm256_storeu_ps(cPtr, cVal);
    // Scalar tail; NOTE(review): double-precision pow() here vs powf()
    // in the SSE/generic variants -- possible last-ulp differences.
823    number = eighthPoints * 8;
824    for (; number < num_points; number++) {
825        *cPtr++ = pow(*aPtr++, *bPtr++);
832#include <immintrin.h>
// Horner-scheme polynomial evaluators for plain AVX2 (mul + add, no FMA):
// POLYn_AVX2(x, c0..cn) computes c0 + c1*x + ... + cn*x^n for a __m256 x.
834#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
835#define POLY1_AVX2(x, c0, c1) \
836    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
837#define POLY2_AVX2(x, c0, c1, c2) \
838    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
839#define POLY3_AVX2(x, c0, c1, c2, c3) \
840    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
841#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
842    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
843#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
844    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
// Unaligned AVX2 (no FMA) kernel: cVector[i] = pow(aVector[i], bVector[i])
// (see tail loop), vectorized as exp(b * log(a)) over 8 floats per pass.
// NOTE(review): this extract ends mid-function (the tail loop's closing
// braces and the #endif are not visible) and some statement continuations
// are missing -- confirm against upstream VOLK.
846static inline void volk_32f_x2_pow_32f_u_avx2(
    float* cVector,
847    const float* bVector,
848    const float* aVector,
849    unsigned int num_points)
851    float* cPtr = cVector;
852    const float* bPtr = bVector;
853    const float* aPtr = aVector;
855    unsigned int number = 0;
856    const unsigned int eighthPoints = num_points / 8;
858    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
859    __m256 tmp, fx, mask, pow2n, z, y;
860    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
861    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
862    __m256i bias, exp, emm0, pi32_0x7f;
    // exp() constants (Cephes expf): clamp limits, log2(e), split ln(2).
864    one = _mm256_set1_ps(1.0);
865    exp_hi = _mm256_set1_ps(88.3762626647949);
866    exp_lo = _mm256_set1_ps(-88.3762626647949);
867    ln2 = _mm256_set1_ps(0.6931471805);
868    log2EF = _mm256_set1_ps(1.44269504088896341);
869    half = _mm256_set1_ps(0.5);
870    exp_C1 = _mm256_set1_ps(0.693359375);
871    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
872    pi32_0x7f = _mm256_set1_epi32(0x7f);
    // Degree-5 exp() core polynomial coefficients.
874    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
875    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
876    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
877    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
878    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
879    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
881    for (; number < eighthPoints; number++) {
883        aVal = _mm256_loadu_ps(aPtr);
884        bias = _mm256_set1_epi32(127);
885        leadingOne = _mm256_set1_ps(1.0f);
        // Integer part of log2(a) from the biased exponent bits.
886        exp = _mm256_sub_epi32(
887            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
888                                               _mm256_set1_epi32(0x7f800000)),
891        logarithm = _mm256_cvtepi32_ps(exp);
895            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
        // Mantissa polynomial (degree chosen by POW_POLY_DEGREE).
897#if POW_POLY_DEGREE == 6
898        mantissa = POLY5_AVX2(frac,
905#elif POW_POLY_DEGREE == 5
906        mantissa = POLY4_AVX2(frac,
907                              2.8882704548164776201f,
908                              -2.52074962577807006663f,
909                              1.48116647521213171641f,
910                              -0.465725644288844778798f,
911                              0.0596515482674574969533f);
912#elif POW_POLY_DEGREE == 4
913        mantissa = POLY3_AVX2(frac,
914                              2.61761038894603480148f,
915                              -1.75647175389045657003f,
916                              0.688243882994381274313f,
917                              -0.107254423828329604454f);
918#elif POW_POLY_DEGREE == 3
919        mantissa = POLY2_AVX2(frac,
920                              2.28330284476918490682f,
921                              -1.04913055217340124191f,
922                              0.204446009836232697516f);
        // Combine exponent + mantissa parts; scale log2 -> ln via ln(2).
927        logarithm = _mm256_add_ps(
928            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
929        logarithm = _mm256_mul_ps(logarithm, ln2);
        // bVal = b * ln(a); compute exp(bVal) below.
932        bVal = _mm256_loadu_ps(bPtr);
933        bVal = _mm256_mul_ps(bVal, logarithm);
936        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
        // Round bVal*log2(e) to nearest integer (truncate + correction mask).
938        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
940        emm0 = _mm256_cvttps_epi32(fx);
941        tmp = _mm256_cvtepi32_ps(emm0);
943        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
944        fx = _mm256_sub_ps(tmp, mask);
        // Range reduction with the two-part ln(2) split.
946        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
947        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
948        z = _mm256_mul_ps(bVal, bVal);
        // Degree-5 polynomial core of exp().
950        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
951        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
952        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
953        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
954        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
955        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
956        y = _mm256_add_ps(y, one);
959            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
961        pow2n = _mm256_castsi256_ps(emm0);
962        cVal = _mm256_mul_ps(y, pow2n);
964        _mm256_storeu_ps(cPtr, cVal);
    // Scalar tail; NOTE(review): double-precision pow() here vs powf()
    // in the SSE/generic variants -- possible last-ulp differences.
971    number = eighthPoints * 8;
972    for (; number < num_points; number++) {
973        *cPtr++ = pow(*aPtr++, *bPtr++);