/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_RISCV64
extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points);
#endif

#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_points / 2;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/
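
/*
 * Usage sketch (illustrative only, with made-up 3-point buffers): the kernel
 * computes result = sum_i input[i] * taps[i] over complex floats.
 *
 *   lv_32fc_t input[3] = { lv_cmake(1.f, 2.f), lv_cmake(3.f, 4.f),
 *                          lv_cmake(5.f, 6.f) };
 *   lv_32fc_t taps[3] = { lv_cmake(1.f, 0.f), lv_cmake(0.f, 1.f),
 *                         lv_cmake(1.f, 1.f) };
 *   lv_32fc_t result;
 *   volk_32fc_x2_dot_prod_32fc_generic(&result, input, taps, 3);
 *   // result == (1+2i)*1 + (3+4i)*i + (5+6i)*(1+i) = -4 + 16i
 */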


#if LV_HAVE_SSE && LV_HAVE_64

static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    __VOLK_ASM __VOLK_VOLATILE(
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
114 "# const float *taps, unsigned num_bytes)\n\t"
115 "# float sum0 = 0;\n\t"
116 "# float sum1 = 0;\n\t"
117 "# float sum2 = 0;\n\t"
118 "# float sum3 = 0;\n\t"
119 "# do {\n\t"
120 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
121 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
122 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
123 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
124 "# input += 4;\n\t"
125 "# taps += 4; \n\t"
126 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
127 "# result[0] = sum0 + sum2;\n\t"
128 "# result[1] = sum1 + sum3;\n\t"
129 "# TODO: prefetch and better scheduling\n\t"
130 " xor %%r9, %%r9\n\t"
131 " xor %%r10, %%r10\n\t"
132 " movq %%rcx, %%rax\n\t"
133 " movq %%rcx, %%r8\n\t"
134 " movq %[rsi], %%r9\n\t"
135 " movq %[rdx], %%r10\n\t"
136 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
137 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
138 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
139 " shr $4, %%r8\n\t"
140 " jmp .%=L1_test\n\t"
141 " # 4 taps / loop\n\t"
142 " # something like ?? cycles / loop\n\t"
143 ".%=Loop1: \n\t"
144 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
145 "# movups (%%r9), %%xmmA\n\t"
146 "# movups (%%r10), %%xmmB\n\t"
147 "# movups %%xmmA, %%xmmZ\n\t"
148 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
149 "# mulps %%xmmB, %%xmmA\n\t"
150 "# mulps %%xmmZ, %%xmmB\n\t"
151 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
152 "# xorps %%xmmPN, %%xmmA\n\t"
153 "# movups %%xmmA, %%xmmZ\n\t"
154 "# unpcklps %%xmmB, %%xmmA\n\t"
155 "# unpckhps %%xmmB, %%xmmZ\n\t"
156 "# movups %%xmmZ, %%xmmY\n\t"
157 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
158 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
159 "# addps %%xmmZ, %%xmmA\n\t"
160 "# addps %%xmmA, %%xmmC\n\t"
161 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
162 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
163 " movups 0(%%r9), %%xmm0\n\t"
164 " movups 16(%%r9), %%xmm1\n\t"
165 " movups %%xmm0, %%xmm4\n\t"
166 " movups 0(%%r10), %%xmm2\n\t"
167 " mulps %%xmm2, %%xmm0\n\t"
168 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
169 " movups 16(%%r10), %%xmm3\n\t"
170 " movups %%xmm1, %%xmm5\n\t"
171 " addps %%xmm0, %%xmm6\n\t"
172 " mulps %%xmm3, %%xmm1\n\t"
173 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
174 " addps %%xmm1, %%xmm6\n\t"
175 " mulps %%xmm4, %%xmm2\n\t"
176 " addps %%xmm2, %%xmm7\n\t"
177 " mulps %%xmm5, %%xmm3\n\t"
178 " add $32, %%r9\n\t"
179 " addps %%xmm3, %%xmm7\n\t"
180 " add $32, %%r10\n\t"
181 ".%=L1_test:\n\t"
182 " dec %%rax\n\t"
183 " jge .%=Loop1\n\t"
184 " # We've handled the bulk of multiplies up to here.\n\t"
185 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
186 " # If so, we've got 2 more taps to do.\n\t"
187 " and $1, %%r8\n\t"
188 " je .%=Leven\n\t"
189 " # The count was odd, do 2 more taps.\n\t"
190 " movups 0(%%r9), %%xmm0\n\t"
191 " movups %%xmm0, %%xmm4\n\t"
192 " movups 0(%%r10), %%xmm2\n\t"
193 " mulps %%xmm2, %%xmm0\n\t"
194 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
195 " addps %%xmm0, %%xmm6\n\t"
196 " mulps %%xmm4, %%xmm2\n\t"
197 " addps %%xmm2, %%xmm7\n\t"
198 ".%=Leven:\n\t"
199 " # neg inversor\n\t"
200 " xorps %%xmm1, %%xmm1\n\t"
201 " mov $0x80000000, %%r9\n\t"
202 " movd %%r9, %%xmm1\n\t"
203 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
204 " # pfpnacc\n\t"
205 " xorps %%xmm1, %%xmm6\n\t"
206 " movups %%xmm6, %%xmm2\n\t"
207 " unpcklps %%xmm7, %%xmm6\n\t"
208 " unpckhps %%xmm7, %%xmm2\n\t"
209 " movups %%xmm2, %%xmm3\n\t"
210 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
211 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
212 " addps %%xmm2, %%xmm6\n\t"
213 " # xmm6 = r1 i2 r3 i4\n\t"
214 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
215 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
216 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
217 "to memory\n\t"
218 :
219 : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
220 : "rax", "r8", "r9", "r10");
221
222
223 if (isodd) {
224 *result += input[num_points - 1] * taps[num_points - 1];
225 }
226
227 return;
228}
229
230#endif /* LV_HAVE_SSE && LV_HAVE_64 */
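
/*
 * Intrinsics view of the inline assembly above (a readers' sketch, not a
 * drop-in replacement; acc_direct/acc_swap are hypothetical names standing
 * in for xmm6/xmm7). Each step multiplies input by taps directly, giving the
 * r*r and i*i products, and by a real/imag-swapped copy of the input, giving
 * the cross products:
 *
 *   __m128 a = _mm_loadu_ps(in);               // a0r a0i a1r a1i
 *   __m128 b = _mm_loadu_ps(tp);               // b0r b0i b1r b1i
 *   __m128 a_sw = _mm_shuffle_ps(a, a, 0xb1);  // a0i a0r a1i a1r
 *   acc_direct = _mm_add_ps(acc_direct, _mm_mul_ps(a, b));
 *   acc_swap = _mm_add_ps(acc_swap, _mm_mul_ps(a_sw, b));
 *
 * The "# pfpnacc" epilogue then negates the i*i lanes of acc_direct and
 * pairwise-adds against acc_swap to form the final (real, imag) sum.
 */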


#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    unsigned int isodd = num_points & 1;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_storeu_ps((float*)dotProductVector,
                  dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
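
/*
 * Why the SSE3 kernel works, expanded for one complex pair
 * (ar + ai*i) * (cr + ci*i):
 *
 *   tmp1 = [ar*cr, ai*cr]   // x times the duplicated real part of y
 *   tmp2 = [ai*ci, ar*ci]   // swapped x times the duplicated imag part
 *   addsub(tmp1, tmp2) = [ar*cr - ai*ci, ai*cr + ar*ci]
 *                      = [real part, imag part]
 *
 * _mm_addsub_ps subtracts in even lanes and adds in odd lanes, which is
 * exactly the sign pattern a complex multiply needs.
 */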

// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
//                                                        const lv_32fc_t* input,
//                                                        const lv_32fc_t* taps,
//                                                        unsigned int num_points)
// {

//     unsigned int i = 0;
//     const unsigned int qtr_points = num_points / 4;
//     const unsigned int isodd = num_points & 3;

//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
//     float *p_input, *p_taps;
//     __m64* p_result;

//     p_result = (__m64*)result;
//     p_input = (float*)input;
//     p_taps = (float*)taps;

//     static const __m128i neg = { 0x000000000000000080000000 };

//     real0 = _mm_setzero_ps();
//     real1 = _mm_setzero_ps();
//     im0 = _mm_setzero_ps();
//     im1 = _mm_setzero_ps();

//     for (; i < qtr_points; ++i) {
//         xmm0 = _mm_loadu_ps(p_input);
//         xmm1 = _mm_loadu_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm2 = _mm_loadu_ps(p_input);
//         xmm3 = _mm_loadu_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

//         // imaginary vector from input
//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
//         // real vector from input
//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
//         // imaginary vector from taps
//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
//         // real vector from taps
//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

//         real0 = _mm_add_ps(xmm4, real0);
//         real1 = _mm_add_ps(xmm5, real1);
//         im0 = _mm_add_ps(xmm6, im0);
//         im1 = _mm_add_ps(xmm7, im1);
//     }

//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

//     im0 = _mm_add_ps(im0, im1);
//     real0 = _mm_add_ps(real0, real1);

//     im0 = _mm_add_ps(im0, real0);

//     _mm_storel_pi(p_result, im0);

//     for (i = num_points - isodd; i < num_points; i++) {
//         *result += input[i] * taps[i];
//     }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/
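
/*
 * Tail handling, worked through for num_points = 10 (a value chosen purely
 * for illustration): quarterPoints = 10 / 4 = 2, so the vector loop consumes
 * 8 complex points; isodd = 10 & 3 = 2, so the scalar loop runs for
 * i = 8, 9 and picks up the remainder. The same pattern (mask with the
 * unroll factor minus one) appears in every four-at-a-time kernel below.
 */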

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
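
/*
 * _mm256_fmaddsub_ps(a, b, c) computes a*b - c in even lanes and a*b + c in
 * odd lanes with a single rounding, so the FMA variant fuses the
 * multiply-by-real and the add/subtract of the cross terms that the plain
 * AVX kernel does in two steps. Scalar sketch of one lane pair:
 *
 *   even: z = ar * cr - (ai * ci)   // real part
 *   odd:  z = ai * cr + (ar * ci)   // imag part
 *
 * Results can differ from the non-FMA kernel in the last ulp because the
 * intermediate product is not rounded.
 */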

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#if LV_HAVE_SSE && LV_HAVE_64


static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    __VOLK_ASM __VOLK_VOLATILE(
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
536 "# const float *taps, unsigned num_bytes)\n\t"
537 "# float sum0 = 0;\n\t"
538 "# float sum1 = 0;\n\t"
539 "# float sum2 = 0;\n\t"
540 "# float sum3 = 0;\n\t"
541 "# do {\n\t"
542 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
543 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
544 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
545 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
546 "# input += 4;\n\t"
547 "# taps += 4; \n\t"
548 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
549 "# result[0] = sum0 + sum2;\n\t"
550 "# result[1] = sum1 + sum3;\n\t"
551 "# TODO: prefetch and better scheduling\n\t"
552 " xor %%r9, %%r9\n\t"
553 " xor %%r10, %%r10\n\t"
554 " movq %%rcx, %%rax\n\t"
555 " movq %%rcx, %%r8\n\t"
556 " movq %[rsi], %%r9\n\t"
557 " movq %[rdx], %%r10\n\t"
558 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
559 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
560 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
561 " shr $4, %%r8\n\t"
562 " jmp .%=L1_test\n\t"
563 " # 4 taps / loop\n\t"
564 " # something like ?? cycles / loop\n\t"
565 ".%=Loop1: \n\t"
566 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
567 "# movaps (%%r9), %%xmmA\n\t"
568 "# movaps (%%r10), %%xmmB\n\t"
569 "# movaps %%xmmA, %%xmmZ\n\t"
570 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
571 "# mulps %%xmmB, %%xmmA\n\t"
572 "# mulps %%xmmZ, %%xmmB\n\t"
573 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
574 "# xorps %%xmmPN, %%xmmA\n\t"
575 "# movaps %%xmmA, %%xmmZ\n\t"
576 "# unpcklps %%xmmB, %%xmmA\n\t"
577 "# unpckhps %%xmmB, %%xmmZ\n\t"
578 "# movaps %%xmmZ, %%xmmY\n\t"
579 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
580 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
581 "# addps %%xmmZ, %%xmmA\n\t"
582 "# addps %%xmmA, %%xmmC\n\t"
583 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
584 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
585 " movaps 0(%%r9), %%xmm0\n\t"
586 " movaps 16(%%r9), %%xmm1\n\t"
587 " movaps %%xmm0, %%xmm4\n\t"
588 " movaps 0(%%r10), %%xmm2\n\t"
589 " mulps %%xmm2, %%xmm0\n\t"
590 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
591 " movaps 16(%%r10), %%xmm3\n\t"
592 " movaps %%xmm1, %%xmm5\n\t"
593 " addps %%xmm0, %%xmm6\n\t"
594 " mulps %%xmm3, %%xmm1\n\t"
595 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
596 " addps %%xmm1, %%xmm6\n\t"
597 " mulps %%xmm4, %%xmm2\n\t"
598 " addps %%xmm2, %%xmm7\n\t"
599 " mulps %%xmm5, %%xmm3\n\t"
600 " add $32, %%r9\n\t"
601 " addps %%xmm3, %%xmm7\n\t"
602 " add $32, %%r10\n\t"
603 ".%=L1_test:\n\t"
604 " dec %%rax\n\t"
605 " jge .%=Loop1\n\t"
606 " # We've handled the bulk of multiplies up to here.\n\t"
607 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
608 " # If so, we've got 2 more taps to do.\n\t"
609 " and $1, %%r8\n\t"
610 " je .%=Leven\n\t"
611 " # The count was odd, do 2 more taps.\n\t"
612 " movaps 0(%%r9), %%xmm0\n\t"
613 " movaps %%xmm0, %%xmm4\n\t"
614 " movaps 0(%%r10), %%xmm2\n\t"
615 " mulps %%xmm2, %%xmm0\n\t"
616 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
617 " addps %%xmm0, %%xmm6\n\t"
618 " mulps %%xmm4, %%xmm2\n\t"
619 " addps %%xmm2, %%xmm7\n\t"
620 ".%=Leven:\n\t"
621 " # neg inversor\n\t"
622 " xorps %%xmm1, %%xmm1\n\t"
623 " mov $0x80000000, %%r9\n\t"
624 " movd %%r9, %%xmm1\n\t"
625 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
626 " # pfpnacc\n\t"
627 " xorps %%xmm1, %%xmm6\n\t"
628 " movaps %%xmm6, %%xmm2\n\t"
629 " unpcklps %%xmm7, %%xmm6\n\t"
630 " unpckhps %%xmm7, %%xmm2\n\t"
631 " movaps %%xmm2, %%xmm3\n\t"
632 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
633 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
634 " addps %%xmm2, %%xmm6\n\t"
635 " # xmm6 = r1 i2 r3 i4\n\t"
636 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
637 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
638 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
639 "to memory\n\t"
640 :
641 : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
642 : "rax", "r8", "r9", "r10");
643
644
645 if (isodd) {
646 *result += input[num_points - 1] * taps[num_points - 1];
647 }
648
649 return;
650}
651
652#endif

#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_bytes >> 4;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_store_ps((float*)dotProductVector,
                 dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
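
/*
 * The _a_ kernels use aligned loads (_mm_load_ps / _mm256_load_ps), which
 * fault if a buffer is not 16- or 32-byte aligned. Callers normally obtain
 * suitable buffers from volk_malloc; a minimal sketch (the 1024-point size
 * is an arbitrary example):
 *
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* buf = (lv_32fc_t*)volk_malloc(1024 * sizeof(lv_32fc_t),
 *                                            alignment);
 *   // ... fill and use buf with the _a_ kernels ...
 *   volk_free(buf);
 */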


// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
//                                                        const lv_32fc_t* input,
//                                                        const lv_32fc_t* taps,
//                                                        unsigned int num_points)
// {

//     unsigned int i = 0;
//     const unsigned int qtr_points = num_points / 4;
//     const unsigned int isodd = num_points & 3;

//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
//     float *p_input, *p_taps;
//     __m64* p_result;

//     static const __m128i neg = { 0x000000000000000080000000 };

//     p_result = (__m64*)result;
//     p_input = (float*)input;
//     p_taps = (float*)taps;

//     real0 = _mm_setzero_ps();
//     real1 = _mm_setzero_ps();
//     im0 = _mm_setzero_ps();
//     im1 = _mm_setzero_ps();

//     for (; i < qtr_points; ++i) {
//         xmm0 = _mm_load_ps(p_input);
//         xmm1 = _mm_load_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm2 = _mm_load_ps(p_input);
//         xmm3 = _mm_load_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

//         // imaginary vector from input
//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
//         // real vector from input
//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
//         // imaginary vector from taps
//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
//         // real vector from taps
//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

//         real0 = _mm_add_ps(xmm4, real0);
//         real1 = _mm_add_ps(xmm5, real1);
//         im0 = _mm_add_ps(xmm6, im0);
//         im1 = _mm_add_ps(xmm7, im1);
//     }

//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

//     im0 = _mm_add_ps(im0, im1);
//     real0 = _mm_add_ps(real0, real1);

//     im0 = _mm_add_ps(im0, real0);

//     _mm_storel_pi(p_result, im0);

//     for (i = num_points - isodd; i < num_points; i++) {
//         *result += input[i] * taps[i];
//     }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
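
/*
 * vld2q_f32 deinterleaves as it loads: from memory holding
 * r0 i0 r1 i1 r2 i2 r3 i3 it fills val[0] = {r0, r1, r2, r3} and
 * val[1] = {i0, i1, i2, i3}. That is why the NEON kernels can operate on a
 * whole vector of real parts and a whole vector of imaginary parts at once,
 * instead of shuffling within registers the way the SSE/AVX kernels do.
 */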

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
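
/*
 * Design note: with a single accumulator, each vmla/vmls must wait for the
 * previous one to retire, so the loop is limited by multiply-accumulate
 * latency rather than throughput. Splitting the sum into accumulator1 and
 * accumulator2 (folded together only after the loop) gives the pipeline two
 * independent dependency chains; this is ordinary loop-level software
 * pipelining, not a change to the math (floating-point reassociation aside).
 */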

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
    // fast

    unsigned int quarter_points = num_points / 8;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);

    // 8 input regs, 8 accumulators -> 16/16 neon regs are used
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);

        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
        // increment pointers
        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    // now reduce accumulators to scalars
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
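
/*
 * Reduction layout for the unrolled kernel above: the loop keeps eight
 * partial sums (four quadwords of direct products, four of cross products).
 * The first four vaddq_f32 calls fold the two vld4q "streams" together, the
 * next two fold direct and cross products into one real and one imaginary
 * vector, and the final vst2q_f32 plus scalar adds collapse those four
 * complex lanes into the single complex result.
 */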


#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/


#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
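
/*
 * End-to-end usage sketch (illustrative; application code would normally
 * call the volk_32fc_x2_dot_prod_32fc dispatcher from <volk/volk.h>, which
 * selects the best kernel above at runtime; the 1024-point size is an
 * arbitrary example):
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t result;
 *   // ... fill in[] and taps[] ...
 *   volk_32fc_x2_dot_prod_32fc(&result, in, taps, N);
 *   volk_free(taps);
 *   volk_free(in);
 */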