Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_x3_sum_of_poly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

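/*
 * All kernels below compute the same scalar result. With
 * x_i = MAX(src0[i], *cutoff) and c = center_point_array:
 *
 *   *target = sum_{i=0}^{num_points-1} (c[0]*x_i + c[1]*x_i^2 + c[2]*x_i^3 + c[3]*x_i^4)
 *             + num_points * c[4]
 */
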
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
    float result = 0.0f;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

    xmm9 = _mm_setzero_ps();
    xmm1 = _mm_setzero_ps();
    xmm0 = _mm_load1_ps(&center_point_array[0]);
    xmm6 = _mm_load1_ps(&center_point_array[1]);
    xmm7 = _mm_load1_ps(&center_point_array[2]);
    xmm8 = _mm_load1_ps(&center_point_array[3]);
    xmm10 = _mm_load1_ps(cutoff);

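    // The main loop consumes eight floats per iteration in two 4-wide halves,
    // keeping separate accumulators (xmm9 and xmm1); the horizontal adds after
    // the loop fold both partial sums into a single scalar.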
    int bound = num_points / 8;
    int leftovers = num_points - 8 * bound;
    int i = 0;
    for (; i < bound; ++i) {
        // 1st
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm9 = _mm_add_ps(xmm2, xmm9);
        xmm9 = _mm_add_ps(xmm3, xmm9);

        // 2nd
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm1 = _mm_add_ps(xmm2, xmm1);
        xmm1 = _mm_add_ps(xmm3, xmm1);
    }
    xmm2 = _mm_hadd_ps(xmm9, xmm1);
    xmm3 = _mm_hadd_ps(xmm2, xmm2);
    xmm4 = _mm_hadd_ps(xmm3, xmm3);
    _mm_store_ss(&result, xmm4);

    for (i = 0; i < leftovers; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        result += (center_point_array[0] * fst + center_point_array[1] * sq +
                   center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    result += (float)(num_points)*center_point_array[4];
    *target = result;
}


#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                                           float* src0,
                                                           float* center_point_array,
                                                           float* cutoff,
                                                           unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                        float* src0,
                                                        float* center_point_array,
                                                        float* cutoff,
                                                        unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    unsigned int i = 0;
    unsigned int k = 0;
    for (i = 0; i < eighth_points; ++i) {
        for (k = 0; k < 8; ++k) {
            fst = *src0++;
            fst = MAX(fst, *cutoff);
            sq = fst * fst;
            thrd = fst * sq;
            frth = fst * thrd;
            result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
            result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
        }
    }
    for (k = 0; k < 8; k += 2)
        result[k] = result[k] + result[k + 1];

    *target = result[0] + result[2] + result[4] + result[6];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = fst * thrd;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points)*center_point_array[4];
}

#endif /*LV_HAVE_GENERIC*/

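/*
 * Minimal usage sketch (illustrative only; the buffer values below are made up):
 * evaluate the clamped polynomial sum over a small buffer with the generic kernel.
 *
 *   float src[5] = { 0.5f, 1.0f, 1.5f, 2.0f, 2.5f };
 *   float coeffs[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 0.0f }; // c[0] .. c[4]
 *   float cutoff = 1.0f; // samples below this are clamped up to it
 *   float result = 0.0f;
 *   volk_32f_x3_sum_of_poly_32f_generic(&result, src, coeffs, &cutoff, 5);
 */
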
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x2_t cutoff_vector;
    float32x2x2_t x_low, x_high;
    float32x4_t x_qvector, c_qvector, cpa_qvector;
    float accumulator;
    float res_accumulators[4];

    c_qvector = vld1q_f32(zero);
    // load the cutoff in to a vector
    cutoff_vector = vdup_n_f32(*cutoff);
    // ... center point array
    cpa_qvector = vld1q_f32(center_point_array);

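    // One sample per iteration: broadcast x, build the quad {x, x^2, x^3, x^4}
    // via two zips and a combine, then multiply-accumulate it against
    // {c[0], c[1], c[2], c[3]} held in cpa_qvector.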
    for (i = 0; i < num_points; ++i) {
        // load x (src0)
        x_to_1 = vdup_n_f32(*src0++);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
        // zip up doubles to interleave
        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
        // now we finally have [x^4 | x^3 | x^2 | x] !

        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
    }
    // there should be better vector reduction techniques
    vst1q_f32(res_accumulators, c_qvector);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON

static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float accumulator;

    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

    // load the cutoff in to a vector
    cutoff_vector = vdupq_n_f32(*cutoff);
    // ... center point array
    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);

    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
    for (i = 0; i < num_points / 4; ++i) {
        // load x
        x_to_1 = vld1q_f32(src0);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
        x_to_1 = vmulq_f32(x_to_1, cpa_0);
        x_to_2 = vmulq_f32(x_to_2, cpa_1);
        x_to_3 = vmulq_f32(x_to_3, cpa_2);
        x_to_4 = vmulq_f32(x_to_4, cpa_3);
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

        src0 += 4;
    }
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    for (i = 4 * (num_points / 4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);

        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        // fith = sq * thrd;

        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
                        center_point_array[2] * thrd + center_point_array[3] * frth); //+
    }

    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                                          float* src0,
                                                          float* center_point_array,
                                                          float* cutoff,
                                                          unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
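
/*
 * Dispatcher usage sketch. This assumes the usual VOLK-generated dispatcher
 * volk_32f_x3_sum_of_poly_32f(), declared in <volk/volk.h>, which mirrors the
 * protokernel signatures above; buffer sizes and values here are illustrative.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* center_points = (float*)volk_malloc(sizeof(float) * 5, alignment);
 *   float* cutoff = (float*)volk_malloc(sizeof(float), alignment);
 *   float* out = (float*)volk_malloc(sizeof(float), alignment);
 *
 *   // ... fill in[], center_points[0..4] and *cutoff ...
 *
 *   volk_32f_x3_sum_of_poly_32f(out, in, center_points, cutoff, N);
 *
 *   volk_free(in);
 *   volk_free(center_points);
 *   volk_free(cutoff);
 *   volk_free(out);
 */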