50#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
/*
 * Scalar accumulation loop: one term is added to `returnValue` per point and
 * the total is written through `result`.
 *
 * NOTE(review): this view is a fragment. The function name, the start of the
 * parameter list, the declaration of `returnValue`, the braces and any
 * pointer advancement inside the loop are not visible here.
 */
61 unsigned int num_points)
/* Treat the input buffer as a flat float array; the loop reads aPtr[0] and
 * aPtr[1] for each point, i.e. two floats per input element.
 * NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
65 const float* aPtr = (
float*)input;
66 const float* bPtr = taps;
67 unsigned int number = 0;
/* Both floats of the current input element are scaled by the same tap
 * value bPtr[0] and passed to lv_cmake as its two arguments. */
69 for (number = 0; number < num_points; number++) {
70 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
/* Publish the accumulated sum. */
75 *result = returnValue;
80#if LV_HAVE_AVX2 && LV_HAVE_FMA
/*
 * Dot product of an interleaved float-pair input with float taps, using
 * 256-bit aligned loads and fused multiply-add.
 *
 * result     - receives the accumulated sum (written once, at the end).
 * num_points - number of points; the vector loop runs num_points / 16 times
 *              and a scalar loop covers the remaining points.
 *
 * NOTE(review): this view is a fragment. The middle of the parameter list,
 * the declarations of `returnValue` and `dotProductVector`, the braces and
 * the pointer advancement inside both loops are not visible here.
 */
84static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
87 unsigned int num_points)
90 unsigned int number = 0;
91 const unsigned int sixteenthPoints = num_points / 16;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
94 const float* aPtr = (
float*)input;
95 const float* bPtr = taps;
97 __m256 a0Val, a1Val, a2Val, a3Val;
98 __m256 b0Val, b1Val, b2Val, b3Val;
99 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
/* Four independent accumulators, one per 8-float slice of the input. */
101 __m256 dotProdVal0 = _mm256_setzero_ps();
102 __m256 dotProdVal1 = _mm256_setzero_ps();
103 __m256 dotProdVal2 = _mm256_setzero_ps();
104 __m256 dotProdVal3 = _mm256_setzero_ps();
106 for (; number < sixteenthPoints; number++) {
/* Aligned loads (32-byte alignment required): 32 input floats. */
108 a0Val = _mm256_load_ps(aPtr);
109 a1Val = _mm256_load_ps(aPtr + 8);
110 a2Val = _mm256_load_ps(aPtr + 16);
111 a3Val = _mm256_load_ps(aPtr + 24);
/* Aligned loads: 16 taps. */
113 x0Val = _mm256_load_ps(bPtr);
114 x1Val = _mm256_load_ps(bPtr + 8);
/* Duplicate each tap within its 128-bit lane:
 *   lo = t0 t0 t1 t1 | t4 t4 t5 t5,  hi = t2 t2 t3 t3 | t6 t6 t7 t7. */
115 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
116 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
117 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
118 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* Recombine lanes so duplicated taps are in order: 0x20 selects the two
 * low lanes (t0 t0 .. t3 t3), 0x31 the two high lanes (t4 t4 .. t7 t7).
 * Each tap now lines up with two consecutive input floats. */
121 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
122 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
123 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
124 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* acc = a * b + acc, per accumulator. */
126 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
127 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
128 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
129 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
135 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
136 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
137 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): the second argument of this store sits on a line that is
 * not visible in this view. */
141 _mm256_store_ps(dotProductVector,
/* Add the eight stored floats to the result as four (first, second) pairs. */
144 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
145 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
146 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
147 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail: points not covered by the 16-point vector loop. */
149 number = sixteenthPoints * 16;
150 for (; number < num_points; number++) {
151 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
/* Publish the accumulated sum. */
156 *result = returnValue;
163#include <immintrin.h>
/*
 * Dot product of an interleaved float-pair input with float taps, using
 * 256-bit aligned loads and separate multiply and add steps.
 *
 * NOTE(review): this view is a fragment. The function name, the start of the
 * parameter list, the declarations of `returnValue` and `dotProductVector`,
 * the braces and the pointer advancement inside both loops are not visible.
 */
168 unsigned int num_points)
171 unsigned int number = 0;
/* The vector loop consumes 16 points per iteration. */
172 const unsigned int sixteenthPoints = num_points / 16;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
175 const float* aPtr = (
float*)input;
176 const float* bPtr = taps;
178 __m256 a0Val, a1Val, a2Val, a3Val;
179 __m256 b0Val, b1Val, b2Val, b3Val;
180 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
181 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, one per 8-float slice of the input. */
183 __m256 dotProdVal0 = _mm256_setzero_ps();
184 __m256 dotProdVal1 = _mm256_setzero_ps();
185 __m256 dotProdVal2 = _mm256_setzero_ps();
186 __m256 dotProdVal3 = _mm256_setzero_ps();
188 for (; number < sixteenthPoints; number++) {
/* Aligned loads (32-byte alignment required): 32 input floats. */
190 a0Val = _mm256_load_ps(aPtr);
191 a1Val = _mm256_load_ps(aPtr + 8);
192 a2Val = _mm256_load_ps(aPtr + 16);
193 a3Val = _mm256_load_ps(aPtr + 24);
/* Aligned loads: 16 taps. */
195 x0Val = _mm256_load_ps(bPtr);
196 x1Val = _mm256_load_ps(bPtr + 8);
/* Duplicate each tap within its 128-bit lane:
 *   lo = t0 t0 t1 t1 | t4 t4 t5 t5,  hi = t2 t2 t3 t3 | t6 t6 t7 t7. */
197 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
198 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
199 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
200 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* Recombine lanes so duplicated taps are in order: 0x20 selects the two
 * low lanes, 0x31 the two high lanes. */
203 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
204 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
205 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
206 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products of input floats and their matching taps. */
208 c0Val = _mm256_mul_ps(a0Val, b0Val);
209 c1Val = _mm256_mul_ps(a1Val, b1Val);
210 c2Val = _mm256_mul_ps(a2Val, b2Val);
211 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add the products into the running accumulators. */
213 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
214 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
215 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
216 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
222 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
223 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
224 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): the second argument of this store sits on a line that is
 * not visible in this view. */
228 _mm256_store_ps(dotProductVector,
/* Add the eight stored floats to the result as four (first, second) pairs. */
231 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
232 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
233 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
234 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail: points not covered by the 16-point vector loop. */
236 number = sixteenthPoints * 16;
237 for (; number < num_points; number++) {
238 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
/* Publish the accumulated sum. */
243 *result = returnValue;
/*
 * 128-bit (__m128) dot-product fragment: a vector loop over num_points / 8
 * iterations, a reduction of four accumulators, and a scalar tail.
 *
 * NOTE(review): this view is a fragment. The function name, the start of the
 * parameter list, the declarations of `returnValue`, `dotProdVal0..3` and
 * `dotProductVector`, the entire body of the vector loop, and the store that
 * fills `dotProductVector` are not visible here.
 */
255 unsigned int num_points)
258 unsigned int number = 0;
259 const unsigned int eighthPoints = num_points / 8;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
262 const float* aPtr = (
float*)input;
263 const float* bPtr = taps;
265 __m128 a0Val, a1Val, a2Val, a3Val;
266 __m128 b0Val, b1Val, b2Val, b3Val;
267 __m128 x0Val, x1Val, x2Val, x3Val;
268 __m128 c0Val, c1Val, c2Val, c3Val;
/* Vector loop header; its body is not visible in this view. */
275 for (; number < eighthPoints; number++) {
/* Fold the four accumulators into dotProdVal0. */
305 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
306 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
307 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Add the four stored floats to the result as two (first, second) pairs. */
314 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
315 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
/* Scalar tail: points not covered by the 8-point vector loop. */
317 number = eighthPoints * 8;
318 for (; number < num_points; number++) {
319 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
/* Publish the accumulated sum. */
324 *result = returnValue;
329#if LV_HAVE_AVX2 && LV_HAVE_FMA
331#include <immintrin.h>
/*
 * Dot product of an interleaved float-pair input with float taps, using
 * 256-bit unaligned loads and fused multiply-add.
 *
 * result     - receives the accumulated sum (written once, at the end).
 * num_points - number of points; the vector loop runs num_points / 16 times
 *              and a scalar loop covers the remaining points.
 *
 * NOTE(review): this view is a fragment. The middle of the parameter list,
 * the declarations of `returnValue` and `dotProductVector`, the braces and
 * the pointer advancement inside both loops are not visible here.
 */
333static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
336 unsigned int num_points)
339 unsigned int number = 0;
340 const unsigned int sixteenthPoints = num_points / 16;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
343 const float* aPtr = (
float*)input;
344 const float* bPtr = taps;
346 __m256 a0Val, a1Val, a2Val, a3Val;
347 __m256 b0Val, b1Val, b2Val, b3Val;
348 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
/* Four independent accumulators, one per 8-float slice of the input. */
350 __m256 dotProdVal0 = _mm256_setzero_ps();
351 __m256 dotProdVal1 = _mm256_setzero_ps();
352 __m256 dotProdVal2 = _mm256_setzero_ps();
353 __m256 dotProdVal3 = _mm256_setzero_ps();
355 for (; number < sixteenthPoints; number++) {
/* Unaligned loads: 32 input floats, no alignment requirement. */
357 a0Val = _mm256_loadu_ps(aPtr);
358 a1Val = _mm256_loadu_ps(aPtr + 8);
359 a2Val = _mm256_loadu_ps(aPtr + 16);
360 a3Val = _mm256_loadu_ps(aPtr + 24);
/* Unaligned loads: 16 taps. */
362 x0Val = _mm256_loadu_ps(bPtr);
363 x1Val = _mm256_loadu_ps(bPtr + 8);
/* Duplicate each tap within its 128-bit lane:
 *   lo = t0 t0 t1 t1 | t4 t4 t5 t5,  hi = t2 t2 t3 t3 | t6 t6 t7 t7. */
364 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
365 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
366 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
367 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* Recombine lanes so duplicated taps are in order: 0x20 selects the two
 * low lanes (t0 t0 .. t3 t3), 0x31 the two high lanes (t4 t4 .. t7 t7).
 * Each tap now lines up with two consecutive input floats. */
370 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
371 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
372 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
373 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* acc = a * b + acc, per accumulator. */
375 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
376 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
377 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
378 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
384 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
385 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
386 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): this is the aligned store, which needs a 32-byte aligned
 * destination. The declaration of `dotProductVector` and the second argument
 * of this call are not visible here — confirm the buffer is declared with
 * 32-byte alignment. */
390 _mm256_store_ps(dotProductVector,
/* Add the eight stored floats to the result as four (first, second) pairs. */
393 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
394 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
395 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
396 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail: points not covered by the 16-point vector loop. */
398 number = sixteenthPoints * 16;
399 for (; number < num_points; number++) {
400 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
/* Publish the accumulated sum. */
405 *result = returnValue;
412#include <immintrin.h>
/*
 * Dot product of an interleaved float-pair input with float taps, using
 * 256-bit unaligned loads and separate multiply and add steps.
 *
 * NOTE(review): this view is a fragment. The function name, the start of the
 * parameter list, the declarations of `returnValue` and `dotProductVector`,
 * the braces and the pointer advancement inside both loops are not visible.
 */
417 unsigned int num_points)
420 unsigned int number = 0;
/* The vector loop consumes 16 points per iteration. */
421 const unsigned int sixteenthPoints = num_points / 16;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
424 const float* aPtr = (
float*)input;
425 const float* bPtr = taps;
427 __m256 a0Val, a1Val, a2Val, a3Val;
428 __m256 b0Val, b1Val, b2Val, b3Val;
429 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
430 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, one per 8-float slice of the input. */
432 __m256 dotProdVal0 = _mm256_setzero_ps();
433 __m256 dotProdVal1 = _mm256_setzero_ps();
434 __m256 dotProdVal2 = _mm256_setzero_ps();
435 __m256 dotProdVal3 = _mm256_setzero_ps();
437 for (; number < sixteenthPoints; number++) {
/* Unaligned loads: 32 input floats, no alignment requirement. */
439 a0Val = _mm256_loadu_ps(aPtr);
440 a1Val = _mm256_loadu_ps(aPtr + 8);
441 a2Val = _mm256_loadu_ps(aPtr + 16);
442 a3Val = _mm256_loadu_ps(aPtr + 24);
/* Unaligned loads: 16 taps. */
444 x0Val = _mm256_loadu_ps(bPtr);
445 x1Val = _mm256_loadu_ps(bPtr + 8);
/* Duplicate each tap within its 128-bit lane:
 *   lo = t0 t0 t1 t1 | t4 t4 t5 t5,  hi = t2 t2 t3 t3 | t6 t6 t7 t7. */
446 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
447 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
448 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
449 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* Recombine lanes so duplicated taps are in order: 0x20 selects the two
 * low lanes, 0x31 the two high lanes. */
452 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
453 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
454 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
455 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products of input floats and their matching taps. */
457 c0Val = _mm256_mul_ps(a0Val, b0Val);
458 c1Val = _mm256_mul_ps(a1Val, b1Val);
459 c2Val = _mm256_mul_ps(a2Val, b2Val);
460 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add the products into the running accumulators. */
462 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
463 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
464 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
465 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
471 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
472 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
473 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): this is the aligned store, which needs a 32-byte aligned
 * destination. The declaration of `dotProductVector` and the second argument
 * of this call are not visible here — confirm the buffer is declared with
 * 32-byte alignment. */
477 _mm256_store_ps(dotProductVector,
/* Add the eight stored floats to the result as four (first, second) pairs. */
480 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
481 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
482 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
483 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail: points not covered by the 16-point vector loop. */
485 number = sixteenthPoints * 16;
486 for (; number < num_points; number++) {
487 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
/* Publish the accumulated sum. */
492 *result = returnValue;
/*
 * NEON dot-product fragment that handles two 4-tap vectors (8 points) per
 * loop iteration, keeping separate real and imaginary accumulators.
 *
 * NOTE(review): this view is a fragment. The function name, the start of the
 * parameter list, the declarations of `number` and `returnValue`, the braces
 * and the pointer advancement inside both loops are not visible here.
 */
502 const float* __restrict taps,
503 unsigned int num_points)
/* NOTE(review): despite its name, `quarterPoints` is num_points / 8 here;
 * the tail loop below is consistent with that (it resumes at
 * quarterPoints * 8). */
507 const unsigned int quarterPoints = num_points / 8;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
510 const float* inputPtr = (
float*)input;
511 const float* tapsPtr = taps;
/* Zero source used to initialise the accumulators via vld1q_f32. */
512 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch arrays the accumulators are stored to for the final lane sums. */
513 float accVector_real[4];
514 float accVector_imag[4];
516 float32x4x2_t inputVector0, inputVector1;
517 float32x4_t tapsVector0, tapsVector1;
518 float32x4_t tmp_real0, tmp_imag0;
519 float32x4_t tmp_real1, tmp_imag1;
520 float32x4_t real_accumulator0, imag_accumulator0;
521 float32x4_t real_accumulator1, imag_accumulator1;
/* Start all four accumulators at zero. */
525 real_accumulator0 = vld1q_f32(zero);
526 imag_accumulator0 = vld1q_f32(zero);
527 real_accumulator1 = vld1q_f32(zero);
528 imag_accumulator1 = vld1q_f32(zero);
530 for (number = 0; number < quarterPoints; number++) {
/* Load eight taps as two 4-float vectors. */
532 tapsVector0 = vld1q_f32(tapsPtr);
533 tapsVector1 = vld1q_f32(tapsPtr + 4);
/* De-interleaving loads: each reads 8 floats, placing even-indexed floats
 * in val[0] and odd-indexed floats in val[1]. */
536 inputVector0 = vld2q_f32(inputPtr);
537 inputVector1 = vld2q_f32(inputPtr + 8);
/* Scale both halves of each input vector by the matching taps. */
540 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
541 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
543 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
544 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
/* Accumulate the products. */
546 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
547 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
549 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
550 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
/* Merge the second pair of accumulators into the first. */
556 real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
557 imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
/* Spill the accumulators to memory so their lanes can be summed. */
560 vst1q_f32(accVector_real, real_accumulator0);
561 vst1q_f32(accVector_imag, imag_accumulator0);
/* NOTE(review): the next two lines are the lane sums of accVector_real and
 * accVector_imag; the statement they are arguments of is not visible here. */
563 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
564 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
/* Scalar tail: points not covered by the 8-point vector loop. */
567 for (number = quarterPoints * 8; number < num_points; number++) {
568 returnValue +=
lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
/* Publish the accumulated sum. */
573 *result = returnValue;
/*
 * NEON dot-product fragment that handles one 4-tap vector (4 points) per
 * loop iteration, keeping separate real and imaginary accumulators.
 *
 * NOTE(review): this view is a fragment. The function name, the start of the
 * parameter list, the declarations of `number` and `returnValue`, the braces
 * and the pointer advancement inside both loops are not visible here.
 */
583 const float* __restrict taps,
584 unsigned int num_points)
588 const unsigned int quarterPoints = num_points / 4;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
591 const float* inputPtr = (
float*)input;
592 const float* tapsPtr = taps;
/* Zero source used to initialise the accumulators via vld1q_f32. */
593 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch arrays the accumulators are stored to for the final lane sums. */
594 float accVector_real[4];
595 float accVector_imag[4];
597 float32x4x2_t inputVector;
598 float32x4_t tapsVector;
599 float32x4_t tmp_real, tmp_imag;
600 float32x4_t real_accumulator, imag_accumulator;
/* Start both accumulators at zero. */
605 real_accumulator = vld1q_f32(zero);
606 imag_accumulator = vld1q_f32(zero);
608 for (number = 0; number < quarterPoints; number++) {
/* Load four taps. */
611 tapsVector = vld1q_f32(tapsPtr);
/* De-interleaving load: reads 8 floats, placing even-indexed floats in
 * val[0] and odd-indexed floats in val[1]. */
614 inputVector = vld2q_f32(inputPtr);
/* Scale both halves of the input vector by the taps. */
616 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
617 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
/* Accumulate the products. */
619 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
620 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
/* Spill the accumulators to memory so their lanes can be summed. */
628 vst1q_f32(accVector_real, real_accumulator);
629 vst1q_f32(accVector_imag, imag_accumulator);
/* NOTE(review): the next two lines are the lane sums of accVector_real and
 * accVector_imag; the statement they are arguments of is not visible here. */
631 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
632 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
/* Scalar tail: points not covered by the 4-point vector loop. */
635 for (number = quarterPoints * 4; number < num_points; number++) {
636 returnValue +=
lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
/* Publish the accumulated sum. */
641 *result = returnValue;
/*
 * Declarations of three externally defined variants; their definitions are
 * not in this view. Each takes `result` first and `num_points` last.
 *
 * NOTE(review): the parameters between `result` and `num_points` sit on
 * lines not visible here. The names suggest NEON assembly implementations,
 * but that cannot be confirmed from this fragment.
 */
647extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(
lv_32fc_t* result,
650 unsigned int num_points);
654extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(
lv_32fc_t* result,
657 unsigned int num_points);
661extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(
lv_32fc_t* result,
664 unsigned int num_points);
/*
 * 128-bit (__m128) dot-product fragment: a vector loop over num_points / 8
 * iterations, a reduction of four accumulators, and a scalar tail.
 *
 * NOTE(review): this view is a fragment. The function name, the start of the
 * parameter list, the declarations of `returnValue`, `dotProdVal0..3` and
 * `dotProductVector`, the entire body of the vector loop, and the store that
 * fills `dotProductVector` are not visible here.
 */
672 unsigned int num_points)
675 unsigned int number = 0;
676 const unsigned int eighthPoints = num_points / 8;
/* NOTE(review): the cast spells `float*` while the destination is
 * `const float*`; confirm whether `input` is const-qualified. */
679 const float* aPtr = (
float*)input;
680 const float* bPtr = taps;
682 __m128 a0Val, a1Val, a2Val, a3Val;
683 __m128 b0Val, b1Val, b2Val, b3Val;
684 __m128 x0Val, x1Val, x2Val, x3Val;
685 __m128 c0Val, c1Val, c2Val, c3Val;
/* Vector loop header; its body is not visible in this view. */
692 for (; number < eighthPoints; number++) {
/* Fold the four accumulators into dotProdVal0. */
722 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
723 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
724 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Add the four stored floats to the result as two (first, second) pairs. */
731 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
732 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
/* Scalar tail: points not covered by the 8-point vector loop. */
734 number = eighthPoints * 8;
735 for (; number < num_points; number++) {
736 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
/* Publish the accumulated sum. */
741 *result = returnValue;