Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

79#ifndef INCLUDED_volk_32f_log2_32f_a_H
80#define INCLUDED_volk_32f_log2_32f_a_H
81
82#include <inttypes.h>
83#include <math.h>
84#include <stdio.h>
85#include <stdlib.h>
86
87#define LOG_POLY_DEGREE 6
88
89#ifdef LV_HAVE_GENERIC
90
91static inline void
92volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
93{
94 float* bPtr = bVector;
95 const float* aPtr = aVector;
96 unsigned int number = 0;
97
98 for (number = 0; number < num_points; number++)
99 *bPtr++ = log2f_non_ieee(*aPtr++);
100}
101#endif /* LV_HAVE_GENERIC */
102
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme polynomial evaluation with fused multiply-add:
 * POLYn_FMAAVX2(x, c0..cn) = c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * AVX2+FMA kernel for 32-byte-aligned buffers.
 * Decomposes each float into exponent and mantissa, converts the debiased
 * exponent to float, and adds a polynomial approximation of log2 of the
 * mantissa in [1, 2).  The tail (num_points % 8) is handled by the
 * generic kernel.
 */
static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        /* isolate the exponent bits, shift them down, and remove the bias */
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        /* log2(x) = exponent + poly(mantissa) * (mantissa - 1) */
        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner-scheme polynomial evaluation (mul + add, no FMA):
 * POLYn_AVX2(x, c0..cn) = c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * AVX2 kernel (no FMA) for 32-byte-aligned buffers.
 * Same exponent/mantissa decomposition as the FMA variant; the tail
 * (num_points % 8) falls back to the generic kernel.
 */
static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        /* isolate the exponent bits, shift them down, and remove the bias */
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        /* log2(x) = exponent + poly(mantissa) * (mantissa - 1) */
        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme polynomial evaluation:
 * POLYn(x, c0..cn) = c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * SSE4.1 kernel for 16-byte-aligned buffers.
 * Same exponent/mantissa decomposition as the AVX2 variants, 4 floats
 * per iteration; the tail (num_points % 4) falls back to the generic
 * kernel.
 */
static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        /* isolate the exponent bits, shift them down, and remove the bias */
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        /* log2(x) = exponent + poly(mantissa) * (mantissa - 1) */
        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* these macros allow us to embed logs in other kernels */
#define VLOG2Q_NEON_PREAMBLE()                             \
    int32x4_t one = vdupq_n_s32(0x000800000);              \
    /* minimax polynomial */                               \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);     \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);      \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);     \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);      \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);     \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);      \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);     \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);          \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);          \
    int32x4_t exp_bias = vdupq_n_s32(127);


#define VLOG2Q_NEON_F32(log2_approx, aval)                                  \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask);                       \
    int32x4_t significand_i = vandq_s32(aval, sig_mask);                    \
    exponent_i = vshrq_n_s32(exponent_i, 23);                               \
                                                                            \
    /* extract the exponent and significand                                 \
       we can treat this as fixed point to save ~9% on the                  \
       conversion + float add */                                            \
    significand_i = vorrq_s32(one, significand_i);                          \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);         \
    /* debias the exponent and convert to float */                          \
    exponent_i = vsubq_s32(exponent_i, exp_bias);                           \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                     \
                                                                            \
    /* put the significand through a polynomial fit of log2(x) [1,2]        \
       add the result to the exponent */                                    \
    log2_approx = vaddq_f32(exponent_f, p0); /* p0 */                       \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */           \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */  \
    tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
                                                                            \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */          \
    tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */                  \
    tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */                  \
    tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);                             \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */                  \
    tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */                             \
    log2_approx = vaddq_f32(log2_approx, tmp1);

/*!
 * NEON kernel: 4 floats per iteration using the VLOG2Q macros above;
 * the tail (num_points % 4) falls back to the generic kernel.
 */
static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;

    VLOG2Q_NEON_PREAMBLE()
    // lms
    // p0 = vdupq_n_f32(-1.649132280361871);
    // p1 = vdupq_n_f32(1.995047138579499);
    // p2 = vdupq_n_f32(-0.336914839219728);

    // keep in mind a single precision float is represented as
    // (-1)^sign * 2^exp * 1.significand, so the log2 is
    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
    for (number = 0; number < quarterPoints; ++number) {
        // load float in to an int register without conversion
        aval = vld1q_s32((int*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_NEON */

458#ifndef INCLUDED_volk_32f_log2_32f_u_H
459#define INCLUDED_volk_32f_log2_32f_u_H
460
461
462#ifdef LV_HAVE_SSE4_1
463#include <smmintrin.h>
464
465#define POLY0(x, c0) _mm_set1_ps(c0)
466#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
467#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
468#define POLY3(x, c0, c1, c2, c3) \
469 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
470#define POLY4(x, c0, c1, c2, c3, c4) \
471 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
472#define POLY5(x, c0, c1, c2, c3, c4, c5) \
473 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
474
475static inline void
476volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
477{
478 float* bPtr = bVector;
479 const float* aPtr = aVector;
480
481 unsigned int number = 0;
482 const unsigned int quarterPoints = num_points / 4;
483
484 __m128 aVal, bVal, mantissa, frac, leadingOne;
485 __m128i bias, exp;
486
487 for (; number < quarterPoints; number++) {
488
489 aVal = _mm_loadu_ps(aPtr);
490 bias = _mm_set1_epi32(127);
491 leadingOne = _mm_set1_ps(1.0f);
492 exp = _mm_sub_epi32(
494 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
495 bias);
496 bVal = _mm_cvtepi32_ps(exp);
497
498 // Now to extract mantissa
499 frac = _mm_or_ps(leadingOne,
500 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
501
502#if LOG_POLY_DEGREE == 6
503 mantissa = POLY5(frac,
504 3.1157899f,
505 -3.3241990f,
506 2.5988452f,
507 -1.2315303f,
508 3.1821337e-1f,
509 -3.4436006e-2f);
510#elif LOG_POLY_DEGREE == 5
511 mantissa = POLY4(frac,
512 2.8882704548164776201f,
513 -2.52074962577807006663f,
514 1.48116647521213171641f,
515 -0.465725644288844778798f,
516 0.0596515482674574969533f);
517#elif LOG_POLY_DEGREE == 4
518 mantissa = POLY3(frac,
519 2.61761038894603480148f,
520 -1.75647175389045657003f,
521 0.688243882994381274313f,
522 -0.107254423828329604454f);
523#elif LOG_POLY_DEGREE == 3
524 mantissa = POLY2(frac,
525 2.28330284476918490682f,
526 -1.04913055217340124191f,
527 0.204446009836232697516f);
528#else
529#error
530#endif
531
532 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
533 _mm_storeu_ps(bPtr, bVal);
534
535 aPtr += 4;
536 bPtr += 4;
537 }
538
539 number = quarterPoints * 4;
540 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
541}
542
543#endif /* LV_HAVE_SSE4_1 for unaligned */
544
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme polynomial evaluation with fused multiply-add:
 * POLYn_FMAAVX2(x, c0..cn) = c0 + x*(c1 + x*(... + x*cn)).
 * (Identical redefinition of the aligned-section macros.) */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * AVX2+FMA kernel for unaligned buffers (unaligned load/store).
 * Same exponent/mantissa decomposition as the aligned variant; the tail
 * (num_points % 8) falls back to the generic kernel.
 */
static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        /* isolate the exponent bits, shift them down, and remove the bias */
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        /* log2(x) = exponent + poly(mantissa) * (mantissa - 1) */
        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

633#ifdef LV_HAVE_AVX2
634#include <immintrin.h>
635
636#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
637#define POLY1_AVX2(x, c0, c1) \
638 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
639#define POLY2_AVX2(x, c0, c1, c2) \
640 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
641#define POLY3_AVX2(x, c0, c1, c2, c3) \
642 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
643#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
644 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
645#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
646 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
647
648static inline void
649volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
650{
651 float* bPtr = bVector;
652 const float* aPtr = aVector;
653
654 unsigned int number = 0;
655 const unsigned int eighthPoints = num_points / 8;
656
657 __m256 aVal, bVal, mantissa, frac, leadingOne;
658 __m256i bias, exp;
659
660 for (; number < eighthPoints; number++) {
661
662 aVal = _mm256_loadu_ps(aPtr);
663 bias = _mm256_set1_epi32(127);
664 leadingOne = _mm256_set1_ps(1.0f);
665 exp = _mm256_sub_epi32(
666 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
667 _mm256_set1_epi32(0x7f800000)),
668 23),
669 bias);
670 bVal = _mm256_cvtepi32_ps(exp);
671
672 // Now to extract mantissa
673 frac = _mm256_or_ps(
674 leadingOne,
675 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
676
677#if LOG_POLY_DEGREE == 6
678 mantissa = POLY5_AVX2(frac,
679 3.1157899f,
680 -3.3241990f,
681 2.5988452f,
682 -1.2315303f,
683 3.1821337e-1f,
684 -3.4436006e-2f);
685#elif LOG_POLY_DEGREE == 5
686 mantissa = POLY4_AVX2(frac,
687 2.8882704548164776201f,
688 -2.52074962577807006663f,
689 1.48116647521213171641f,
690 -0.465725644288844778798f,
691 0.0596515482674574969533f);
692#elif LOG_POLY_DEGREE == 4
693 mantissa = POLY3_AVX2(frac,
694 2.61761038894603480148f,
695 -1.75647175389045657003f,
696 0.688243882994381274313f,
697 -0.107254423828329604454f);
698#elif LOG_POLY_DEGREE == 3
699 mantissa = POLY2_AVX2(frac,
700 2.28330284476918490682f,
701 -1.04913055217340124191f,
702 0.204446009836232697516f);
703#else
704#error
705#endif
706
707 bVal =
708 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
709 _mm256_storeu_ps(bPtr, bVal);
710
711 aPtr += 8;
712 bPtr += 8;
713 }
714
715 number = eighthPoints * 8;
716 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
717}
718
719#endif /* LV_HAVE_AVX2 for unaligned */
720
721
722#endif /* INCLUDED_volk_32f_log2_32f_u_H */