45#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
57 unsigned int num_points)
/* Unroll factor: the main loop below updates four separate accumulators
 * (acc0..acc3) per pass. */
60 static const int N_UNROLL = 4;
/* n is num_points rounded down to a multiple of N_UNROLL; the main loop
 * covers indices [0, n). */
68 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
/* Main loop: each accumulator receives one tap times the corresponding
 * input sample, with the input converted to float first. */
70 for (i = 0; i < n; i += N_UNROLL) {
71 acc0 += taps[i + 0] * (float)input[i + 0];
72 acc1 += taps[i + 1] * (float)input[i + 1];
73 acc2 += taps[i + 2] * (float)input[i + 2];
74 acc3 += taps[i + 3] * (float)input[i + 3];
/* Tail loop: the remaining indices [n, num_points) are folded into acc0. */
77 for (; i < num_points; i++) {
78 acc0 += taps[i] * (float)input[i];
/* Combine the four partial sums and write the single output value. */
81 *result = acc0 + acc1 + acc2 + acc3;
91 unsigned int num_points)
/* Number of iterations of the NEON loop; each iteration consumes four
 * input samples. */
95 unsigned quarter_points = num_points / 4;
/* Cast to a non-const short pointer; the visible code only reads through
 * inputPtr. */
97 short* inputPtr = (
short*)input;
100 float32x4x2_t tapsVal, accumulator_val;
103 float32x4_t input_float, prod_re, prod_im;
/* Both accumulator halves start at zero. */
105 accumulator_val.val[0] = vdupq_n_f32(0.0);
106 accumulator_val.val[1] = vdupq_n_f32(0.0);
108 for (ii = 0; ii < quarter_points; ++ii) {
/* vld2q_f32 is a de-interleaving load of eight floats: even-indexed
 * floats go to val[0], odd-indexed floats to val[1]. Presumably these are
 * the real and imaginary parts of four taps (matches the prod_re/prod_im
 * naming); confirm against the lv_32fc_t layout. */
109 tapsVal = vld2q_f32((
float*)tapsPtr);
/* Load four 16-bit samples, widen them to 32-bit integers, then convert
 * to float. */
110 input16 = vld1_s16(inputPtr);
112 input32 = vmovl_s16(input16);
114 input_float = vcvtq_f32_s32(input32);
/* Scale both tap halves by the same four input samples. */
116 prod_re = vmulq_f32(input_float, tapsVal.val[0]);
117 prod_im = vmulq_f32(input_float, tapsVal.val[1]);
/* Accumulate the products lane by lane. */
119 accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
120 accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
/* vst2q_f32 re-interleaves the two accumulator halves into
 * accumulator_vec, which is then reduced to its element 0. */
125 vst2q_f32((
float*)accumulator_vec, accumulator_val);
126 accumulator_vec[0] += accumulator_vec[1];
127 accumulator_vec[2] += accumulator_vec[3];
128 accumulator_vec[0] += accumulator_vec[2];
/* Scalar tail: handle the samples left over after the groups of four,
 * advancing both pointers by one element per step. */
130 for (ii = quarter_points * 4; ii < num_points; ++ii) {
131 accumulator_vec[0] += *(tapsPtr++) * (
float)(*(inputPtr++));
/* Write the reduced sum to the output. */
134 *result = accumulator_vec[0];
139#if LV_HAVE_SSE && LV_HAVE_MMX
/* SSE/MMX kernel (u_ variant): accumulates the products of 16-bit input
 * samples with the tap values and writes one sum to *result. The vector
 * loop handles eight samples per iteration; a scalar loop handles the
 * remainder. */
141static inline void volk_16i_32fc_dot_prod_32fc_u_sse(
lv_32fc_t* result,
144 unsigned int num_points)
147 unsigned int number = 0;
/* Number of vector-loop iterations (eight input samples each). */
148 const unsigned int eighthPoints = num_points / 8;
/* aPtr walks the 16-bit input; bPtr views the taps as a flat float array
 * (presumably interleaved real/imaginary pairs; confirm against the
 * lv_32fc_t layout). */
151 const short* aPtr = input;
152 const float* bPtr = (
float*)taps;
156 __m128 a0Val, a1Val, a2Val, a3Val;
157 __m128 b0Val, b1Val, b2Val, b3Val;
158 __m128 c0Val, c1Val, c2Val, c3Val;
165 for (; number < eighthPoints; number++) {
/* _mm_set_pi16 takes its arguments from the highest lane to the lowest,
 * so aPtr[0] (resp. aPtr[4]) ends up in the lowest 16-bit lane. */
167 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
168 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
/* Fold the four partial accumulators into dotProdVal0. */
200 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
201 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
202 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Reduce the four floats of dotProductVector as two (first, second)
 * pairs passed to lv_cmake. */
209 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
210 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
/* Scalar tail for the samples not covered by the groups of eight: one
 * input sample scales two consecutive floats from bPtr. */
212 number = eighthPoints * 8;
213 for (; number < num_points; number++) {
214 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
219 *result = returnValue;
225#if LV_HAVE_AVX2 && LV_HAVE_FMA
/* AVX2 + FMA kernel using unaligned tap loads: accumulates the products
 * of 16-bit input samples with the tap values and writes one sum to
 * *result. The vector loop handles sixteen samples per iteration; a
 * scalar loop handles the remainder. */
227static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
230 unsigned int num_points)
233 unsigned int number = 0;
/* Number of vector-loop iterations (sixteen input samples each). */
234 const unsigned int sixteenthPoints = num_points / 16;
/* aPtr walks the 16-bit input; bPtr views the taps as a flat float array
 * (presumably interleaved real/imaginary pairs; confirm against the
 * lv_32fc_t layout). */
237 const short* aPtr = input;
238 const float* bPtr = (
float*)taps;
242 __m256 g0, g1, h0, h1, h2, h3;
243 __m256 a0Val, a1Val, a2Val, a3Val;
244 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent accumulators, all starting at zero. */
246 __m256 dotProdVal0 = _mm256_setzero_ps();
247 __m256 dotProdVal1 = _mm256_setzero_ps();
248 __m256 dotProdVal2 = _mm256_setzero_ps();
249 __m256 dotProdVal3 = _mm256_setzero_ps();
251 for (; number < sixteenthPoints; number++) {
/* Sign-extend eight 16-bit values to 32-bit integers, then convert them
 * to floats (done once for m0 and once for m1). */
256 f0 = _mm256_cvtepi16_epi32(m0);
257 g0 = _mm256_cvtepi32_ps(f0);
258 f1 = _mm256_cvtepi16_epi32(m1);
259 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a register with itself duplicates each element inside every
 * 128-bit lane: lo yields elements 0,0,1,1 | 4,4,5,5 and hi yields
 * 2,2,3,3 | 6,6,7,7. */
261 h0 = _mm256_unpacklo_ps(g0, g0);
262 h1 = _mm256_unpackhi_ps(g0, g0);
263 h2 = _mm256_unpacklo_ps(g1, g1);
264 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes of both operands and 0x31 joins the high
 * lanes, restoring sample order: a0Val = 0,0,1,1,2,2,3,3 and
 * a1Val = 4,4,5,5,6,6,7,7 (likewise a2Val/a3Val from g1). */
266 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
267 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
268 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
269 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* Unaligned loads of 32 consecutive floats from the taps. */
271 b0Val = _mm256_loadu_ps(bPtr);
272 b1Val = _mm256_loadu_ps(bPtr + 8);
273 b2Val = _mm256_loadu_ps(bPtr + 16);
274 b3Val = _mm256_loadu_ps(bPtr + 24);
/* Fused multiply-add: accumulator = sample * tap + accumulator. */
276 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
277 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
278 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
279 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four partial accumulators into dotProdVal0. */
285 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
286 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
287 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store into dotProductVector, whose eight floats are then
 * reduced as four (first, second) pairs passed to lv_cmake. */
291 _mm256_store_ps(dotProductVector,
294 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
295 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
296 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
297 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail for the samples not covered by the groups of sixteen: one
 * input sample scales two consecutive floats from bPtr. */
299 number = sixteenthPoints * 16;
300 for (; number < num_points; number++) {
301 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
306 *result = returnValue;
/* AVX2 kernel using unaligned tap loads and separate multiply/add steps:
 * accumulates the products of 16-bit input samples with the tap values
 * and writes one sum to *result. The vector loop handles sixteen samples
 * per iteration; a scalar loop handles the remainder. */
314static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(
lv_32fc_t* result,
317 unsigned int num_points)
320 unsigned int number = 0;
/* Number of vector-loop iterations (sixteen input samples each). */
321 const unsigned int sixteenthPoints = num_points / 16;
/* aPtr walks the 16-bit input; bPtr views the taps as a flat float array
 * (presumably interleaved real/imaginary pairs; confirm against the
 * lv_32fc_t layout). */
324 const short* aPtr = input;
325 const float* bPtr = (
float*)taps;
329 __m256 g0, g1, h0, h1, h2, h3;
330 __m256 a0Val, a1Val, a2Val, a3Val;
331 __m256 b0Val, b1Val, b2Val, b3Val;
332 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, all starting at zero. */
334 __m256 dotProdVal0 = _mm256_setzero_ps();
335 __m256 dotProdVal1 = _mm256_setzero_ps();
336 __m256 dotProdVal2 = _mm256_setzero_ps();
337 __m256 dotProdVal3 = _mm256_setzero_ps();
339 for (; number < sixteenthPoints; number++) {
/* Sign-extend eight 16-bit values to 32-bit integers, then convert them
 * to floats (done once for m0 and once for m1). */
344 f0 = _mm256_cvtepi16_epi32(m0);
345 g0 = _mm256_cvtepi32_ps(f0);
346 f1 = _mm256_cvtepi16_epi32(m1);
347 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a register with itself duplicates each element inside every
 * 128-bit lane: lo yields elements 0,0,1,1 | 4,4,5,5 and hi yields
 * 2,2,3,3 | 6,6,7,7. */
349 h0 = _mm256_unpacklo_ps(g0, g0);
350 h1 = _mm256_unpackhi_ps(g0, g0);
351 h2 = _mm256_unpacklo_ps(g1, g1);
352 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes of both operands and 0x31 joins the high
 * lanes, restoring sample order: a0Val = 0,0,1,1,2,2,3,3 and
 * a1Val = 4,4,5,5,6,6,7,7 (likewise a2Val/a3Val from g1). */
354 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
355 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
356 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
357 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* Unaligned loads of 32 consecutive floats from the taps. */
359 b0Val = _mm256_loadu_ps(bPtr);
360 b1Val = _mm256_loadu_ps(bPtr + 8);
361 b2Val = _mm256_loadu_ps(bPtr + 16);
362 b3Val = _mm256_loadu_ps(bPtr + 24);
/* Element-wise products of the duplicated samples with the taps. */
364 c0Val = _mm256_mul_ps(a0Val, b0Val);
365 c1Val = _mm256_mul_ps(a1Val, b1Val);
366 c2Val = _mm256_mul_ps(a2Val, b2Val);
367 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add each product vector to its own accumulator. */
369 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
370 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
371 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
372 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four partial accumulators into dotProdVal0. */
378 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
379 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
380 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store into dotProductVector, whose eight floats are then
 * reduced as four (first, second) pairs passed to lv_cmake. */
384 _mm256_store_ps(dotProductVector,
387 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
388 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
389 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
390 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail for the samples not covered by the groups of sixteen: one
 * input sample scales two consecutive floats from bPtr. */
392 number = sixteenthPoints * 16;
393 for (; number < num_points; number++) {
394 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
399 *result = returnValue;
405#if LV_HAVE_SSE && LV_HAVE_MMX
/* SSE/MMX kernel (a_ variant): accumulates the products of 16-bit input
 * samples with the tap values and writes one sum to *result. The vector
 * loop handles eight samples per iteration; a scalar loop handles the
 * remainder. */
408static inline void volk_16i_32fc_dot_prod_32fc_a_sse(
lv_32fc_t* result,
411 unsigned int num_points)
414 unsigned int number = 0;
/* Number of vector-loop iterations (eight input samples each). */
415 const unsigned int eighthPoints = num_points / 8;
/* aPtr walks the 16-bit input; bPtr views the taps as a flat float array
 * (presumably interleaved real/imaginary pairs; confirm against the
 * lv_32fc_t layout). */
418 const short* aPtr = input;
419 const float* bPtr = (
float*)taps;
423 __m128 a0Val, a1Val, a2Val, a3Val;
424 __m128 b0Val, b1Val, b2Val, b3Val;
425 __m128 c0Val, c1Val, c2Val, c3Val;
432 for (; number < eighthPoints; number++) {
/* _mm_set_pi16 takes its arguments from the highest lane to the lowest,
 * so aPtr[0] (resp. aPtr[4]) ends up in the lowest 16-bit lane. */
434 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
435 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
/* Fold the four partial accumulators into dotProdVal0. */
467 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
468 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
469 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Reduce the four floats of dotProductVector as two (first, second)
 * pairs passed to lv_cmake. */
476 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
477 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
/* Scalar tail for the samples not covered by the groups of eight: one
 * input sample scales two consecutive floats from bPtr. */
479 number = eighthPoints * 8;
480 for (; number < num_points; number++) {
481 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
486 *result = returnValue;
/* AVX2 kernel using aligned tap loads and separate multiply/add steps:
 * accumulates the products of 16-bit input samples with the tap values
 * and writes one sum to *result. The vector loop handles sixteen samples
 * per iteration; a scalar loop handles the remainder. */
493static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(
lv_32fc_t* result,
496 unsigned int num_points)
499 unsigned int number = 0;
/* Number of vector-loop iterations (sixteen input samples each). */
500 const unsigned int sixteenthPoints = num_points / 16;
/* aPtr walks the 16-bit input; bPtr views the taps as a flat float array
 * (presumably interleaved real/imaginary pairs; confirm against the
 * lv_32fc_t layout). */
503 const short* aPtr = input;
504 const float* bPtr = (
float*)taps;
508 __m256 g0, g1, h0, h1, h2, h3;
509 __m256 a0Val, a1Val, a2Val, a3Val;
510 __m256 b0Val, b1Val, b2Val, b3Val;
511 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, all starting at zero. */
513 __m256 dotProdVal0 = _mm256_setzero_ps();
514 __m256 dotProdVal1 = _mm256_setzero_ps();
515 __m256 dotProdVal2 = _mm256_setzero_ps();
516 __m256 dotProdVal3 = _mm256_setzero_ps();
518 for (; number < sixteenthPoints; number++) {
/* Sign-extend eight 16-bit values to 32-bit integers, then convert them
 * to floats (done once for m0 and once for m1). */
523 f0 = _mm256_cvtepi16_epi32(m0);
524 g0 = _mm256_cvtepi32_ps(f0);
525 f1 = _mm256_cvtepi16_epi32(m1);
526 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a register with itself duplicates each element inside every
 * 128-bit lane: lo yields elements 0,0,1,1 | 4,4,5,5 and hi yields
 * 2,2,3,3 | 6,6,7,7. */
528 h0 = _mm256_unpacklo_ps(g0, g0);
529 h1 = _mm256_unpackhi_ps(g0, g0);
530 h2 = _mm256_unpacklo_ps(g1, g1);
531 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes of both operands and 0x31 joins the high
 * lanes, restoring sample order: a0Val = 0,0,1,1,2,2,3,3 and
 * a1Val = 4,4,5,5,6,6,7,7 (likewise a2Val/a3Val from g1). */
533 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
534 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
535 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
536 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* Aligned loads of 32 consecutive floats from the taps;
 * _mm256_load_ps requires each address to be 32-byte aligned. */
538 b0Val = _mm256_load_ps(bPtr);
539 b1Val = _mm256_load_ps(bPtr + 8);
540 b2Val = _mm256_load_ps(bPtr + 16);
541 b3Val = _mm256_load_ps(bPtr + 24);
/* Element-wise products of the duplicated samples with the taps. */
543 c0Val = _mm256_mul_ps(a0Val, b0Val);
544 c1Val = _mm256_mul_ps(a1Val, b1Val);
545 c2Val = _mm256_mul_ps(a2Val, b2Val);
546 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add each product vector to its own accumulator. */
548 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
549 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
550 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
551 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four partial accumulators into dotProdVal0. */
557 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
558 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
559 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store into dotProductVector, whose eight floats are then
 * reduced as four (first, second) pairs passed to lv_cmake. */
563 _mm256_store_ps(dotProductVector,
566 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
567 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
568 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
569 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail for the samples not covered by the groups of sixteen: one
 * input sample scales two consecutive floats from bPtr. */
571 number = sixteenthPoints * 16;
572 for (; number < num_points; number++) {
573 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
578 *result = returnValue;
584#if LV_HAVE_AVX2 && LV_HAVE_FMA
/* AVX2 + FMA kernel using aligned tap loads: accumulates the products of
 * 16-bit input samples with the tap values and writes one sum to
 * *result. The vector loop handles sixteen samples per iteration; a
 * scalar loop handles the remainder. */
586static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
589 unsigned int num_points)
592 unsigned int number = 0;
/* Number of vector-loop iterations (sixteen input samples each). */
593 const unsigned int sixteenthPoints = num_points / 16;
/* aPtr walks the 16-bit input; bPtr views the taps as a flat float array
 * (presumably interleaved real/imaginary pairs; confirm against the
 * lv_32fc_t layout). */
596 const short* aPtr = input;
597 const float* bPtr = (
float*)taps;
601 __m256 g0, g1, h0, h1, h2, h3;
602 __m256 a0Val, a1Val, a2Val, a3Val;
603 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent accumulators, all starting at zero. */
605 __m256 dotProdVal0 = _mm256_setzero_ps();
606 __m256 dotProdVal1 = _mm256_setzero_ps();
607 __m256 dotProdVal2 = _mm256_setzero_ps();
608 __m256 dotProdVal3 = _mm256_setzero_ps();
610 for (; number < sixteenthPoints; number++) {
/* Sign-extend eight 16-bit values to 32-bit integers, then convert them
 * to floats (done once for m0 and once for m1). */
615 f0 = _mm256_cvtepi16_epi32(m0);
616 g0 = _mm256_cvtepi32_ps(f0);
617 f1 = _mm256_cvtepi16_epi32(m1);
618 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a register with itself duplicates each element inside every
 * 128-bit lane: lo yields elements 0,0,1,1 | 4,4,5,5 and hi yields
 * 2,2,3,3 | 6,6,7,7. */
620 h0 = _mm256_unpacklo_ps(g0, g0);
621 h1 = _mm256_unpackhi_ps(g0, g0);
622 h2 = _mm256_unpacklo_ps(g1, g1);
623 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes of both operands and 0x31 joins the high
 * lanes, restoring sample order: a0Val = 0,0,1,1,2,2,3,3 and
 * a1Val = 4,4,5,5,6,6,7,7 (likewise a2Val/a3Val from g1). */
625 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
626 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
627 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
628 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* Aligned loads of 32 consecutive floats from the taps;
 * _mm256_load_ps requires each address to be 32-byte aligned. */
630 b0Val = _mm256_load_ps(bPtr);
631 b1Val = _mm256_load_ps(bPtr + 8);
632 b2Val = _mm256_load_ps(bPtr + 16);
633 b3Val = _mm256_load_ps(bPtr + 24);
/* Fused multiply-add: accumulator = sample * tap + accumulator. */
635 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
636 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
637 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
638 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four partial accumulators into dotProdVal0. */
644 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
645 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
646 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store into dotProductVector, whose eight floats are then
 * reduced as four (first, second) pairs passed to lv_cmake. */
650 _mm256_store_ps(dotProductVector,
653 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
654 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
655 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
656 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail for the samples not covered by the groups of sixteen: one
 * input sample scales two consecutive floats from bPtr. */
658 number = sixteenthPoints * 16;
659 for (; number < num_points; number++) {
660 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
665 *result = returnValue;