#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H
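/*
 * Illustrative usage sketch (not part of the original header): the kernels
 * below are normally reached through the volk_32f_log2_32f() dispatcher,
 * which picks the best implementation available at runtime, e.g.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   // ... fill in[] with positive values ...
 *   volk_32f_log2_32f(out, in, N); // out[i] = log2(in[i])
 *   volk_free(in);
 *   volk_free(out);
 */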
#include <math.h>

#define LOG_POLY_DEGREE 6
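/*
 * Common idea of the kernels below: write x = 2^e * m with m in [1, 2), so
 * log2(x) = e + log2(m), and approximate log2(m) by a polynomial in m of
 * degree LOG_POLY_DEGREE - 1 (the x86 kernels evaluate poly(m) * (m - 1);
 * the NEON kernel uses a direct degree-6 fit). Illustrative scalar sketch of
 * the bit manipulation only, not part of the original kernels:
 *
 *   union { float f; uint32_t i; } u = { x };
 *   int e = (int)((u.i & 0x7f800000u) >> 23) - 127; // unbiased exponent
 *   u.i = (u.i & 0x007fffffu) | 0x3f800000u;        // mantissa m in [1, 2)
 *   float m = u.f;
 *   float result = (float)e + poly(m) * (m - 1.0f); // poly() = fit used below
 */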

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    // scalar libm fallback, also used for the tails of the SIMD kernels
    for (number = 0; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
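/*
 * POLYn_FMAAVX2(x, c0, ..., cn) expands to the Horner evaluation
 * c0 + x*(c1 + x*(... + x*cn)) using one fused multiply-add per degree.
 */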
static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        // unbiased exponent: shift out the 23 mantissa bits, subtract the bias of 127
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // keep the mantissa bits and OR in the implicit leading one -> frac in [1, 2)
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
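/*
 * Same Horner recursion as the *_FMAAVX2 macros above, written with separate
 * multiply and add for AVX2 targets built without FMA.
 */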
static inline void volk_32f_log2_32f_a_avx2(float* bVector,
                                            const float* aVector,
                                            unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // keep the mantissa bits and OR in the implicit leading one -> frac in [1, 2)
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
static inline void volk_32f_log2_32f_a_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;
    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
                                         _mm_set1_epi32(0x7f800000)),
                           23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // keep the mantissa bits and OR in the implicit leading one -> frac in [1, 2)
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(_mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)), bVal);
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

#define VLOG2Q_NEON_PREAMBLE() \
    int32x4_t one = vdupq_n_s32(0x000800000); \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
    int32x4_t exp_bias = vdupq_n_s32(127);
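/*
 * The NEON path differs from the x86 kernels above: VLOG2Q_NEON_F32() adds
 * the unbiased exponent to a degree-6 polynomial p0 + p1*m + ... + p6*m^6
 * evaluated directly on the significand m in [1, 2). `one` holds the implicit
 * leading-one bit (1 << 23) that is OR-ed back into the significand before
 * the fixed-point conversion.
 */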
#define VLOG2Q_NEON_F32(log2_approx, aval) \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
    int32x4_t significand_i = vandq_s32(aval, sig_mask); \
    exponent_i = vshrq_n_s32(exponent_i, 23); \
    significand_i = vorrq_s32(one, significand_i); \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
    exponent_i = vsubq_s32(exponent_i, exp_bias); \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
    log2_approx = vaddq_f32(exponent_f, p0); \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1); \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \
    tmp1 = vmulq_f32(sig_2, p2); \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \
    tmp1 = vmulq_f32(sig_3, p3); \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \
    tmp1 = vmulq_f32(sig_4, p4); \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \
    tmp1 = vmulq_f32(sig_5, p5); \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \
    tmp1 = vmulq_f32(sig_6, p6); \
    log2_approx = vaddq_f32(log2_approx, tmp1);
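/*
 * VLOG2Q_NEON_F32() expects `aval` to hold the raw IEEE-754 bit patterns of
 * four floats (loaded with vld1q_s32) and leaves the approximation in
 * `log2_approx`; vcvtq_n_f32_s32(significand_i, 23) treats the 32-bit
 * significand as a Q23 fixed-point value, yielding a float in [1, 2).
 */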
static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;

    VLOG2Q_NEON_PREAMBLE()

    for (number = 0; number < quarterPoints; ++number) {
        // load the raw float bits into an integer register (no conversion)
        aval = vld1q_s32((int*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32f_log2_32f_a_H */

#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
static inline void volk_32f_log2_32f_u_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;
    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
                                         _mm_set1_epi32(0x7f800000)),
                           23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(_mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)), bVal);
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
static inline void volk_32f_log2_32f_u_avx2(float* bVector,
                                            const float* aVector,
                                            unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;