volk_32f_x2_pow_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3
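/* POW_POLY_DEGREE selects the degree of the minimax polynomial used below to
 * approximate log2 of the mantissa; the #if chains in each kernel accept
 * values 3 through 6. Degree 3 (the default here) is the fastest and least
 * accurate option; degree 6 is the most accurate. */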

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

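    /* Constants for the exp() stage (Cephes-style expf): exp_hi/exp_lo clamp
     * the argument to the range where a finite float result exists, log2EF is
     * log2(e) for the range reduction, and exp_C1 + exp_C2 reconstruct ln(2)
     * in two pieces so x - n*ln(2) is computed with extra precision. */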
    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);
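        /* _mm256_cvttps_epi32 truncates toward zero; subtracting the
         * (tmp > fx) mask converts the truncated value into floor(fx),
         * i.e. bVal * log2(e) rounded to the nearest integer. */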

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
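        /* (n + 127) << 23 writes n into the biased exponent field, so the
         * bit pattern reinterpreted as float below is exactly 2^n. */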

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)),
                23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */

#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x2_pow_32f_generic(float* cVector,
                                               const float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
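
/* For reference: a scalar sketch (not part of VOLK's API; the name is made up
 * for illustration) of the approximation the SIMD kernels in this file
 * implement, since the intrinsics obscure the math. It computes c = a^b as
 * 2^(b * log2(a)): the exponent field of a supplies the integer part of
 * log2(a), a minimax polynomial in the mantissa supplies the fractional part,
 * and the exp() stage is sketched here with libm's expf() instead of the
 * kernels' Cephes-style polynomial. Assumes a > 0, as the kernels do. */
static inline float pow_32f_scalar_model(float a, float b)
{
    union {
        float f;
        uint32_t i;
    } u = { a };
    int exponent = (int)((u.i >> 23) & 0xff) - 127; /* unbiased exponent */
    u.i = (u.i & 0x7fffff) | 0x3f800000;            /* mantissa in [1, 2) */
    float frac = u.f;
    /* POW_POLY_DEGREE == 3 coefficients, as in the kernels below */
    float mantissa = 2.28330284476918490682f +
                     frac * (-1.04913055217340124191f +
                             frac * 0.204446009836232697516f);
    float log2a = (float)exponent + mantissa * (frac - 1.0f);
    float lna = log2a * 0.6931471805f; /* ln(a) = log2(a) * ln(2) */
    return expf(b * lna);              /* kernels approximate this polynomially */
}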


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)),
                23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */
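
/* Example usage (a hypothetical sketch, not part of this header): the kernel
 * is normally invoked through VOLK's public dispatcher, which selects the
 * best protokernel above for the running CPU. Buffers from volk_malloc()
 * with volk_get_alignment() satisfy the aligned (_a) variants. Kept in
 * "#if 0" so it is never compiled as part of the header. */
#if 0 /* illustrative only; link against libvolk */
#include <volk/volk.h>

int main(void)
{
    unsigned int N = 32;
    size_t alignment = volk_get_alignment();
    float* base = (float*)volk_malloc(N * sizeof(float), alignment);
    float* exponent = (float*)volk_malloc(N * sizeof(float), alignment);
    float* result = (float*)volk_malloc(N * sizeof(float), alignment);

    for (unsigned int i = 0; i < N; i++) {
        base[i] = 2.0f;
        exponent[i] = (float)i * 0.25f;
    }

    /* result[i] = pow(base[i], exponent[i]); note the argument order:
     * (output, exponent vector, base vector, length) */
    volk_32f_x2_pow_32f(result, exponent, base, N);

    volk_free(base);
    volk_free(exponent);
    volk_free(result);
    return 0;
}
#endif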