volk_16ic_x2_dot_prod_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H

#include <volk/saturation_arithmetic.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        lv_16sc_t tmp = in_a[n] * in_b[n];
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /*LV_HAVE_GENERIC*/
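
/* Usage sketch: the protokernels in this file are normally reached through the
 * generated VOLK dispatcher volk_16ic_x2_dot_prod_16ic(), which selects the
 * fastest implementation available on the running CPU. A minimal caller,
 * assuming VOLK's volk_malloc()/volk_free() and volk_get_alignment() helpers,
 * might look like this:
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* a = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
 *   lv_16sc_t* b = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
 *   lv_16sc_t result;
 *   // ... fill a and b with 16-bit complex samples ...
 *   volk_16ic_x2_dot_prod_16ic(&result, a, b, N);
 *   volk_free(a);
 *   volk_free(b);
 */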


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_load_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector,
                        a); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
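
/* Explanatory sketch of the SSE2/AVX2 lane arithmetic used above and below:
 * samples stay interleaved as [re, im, re, im, ...] 16-bit lanes, and for one
 * pair (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br).
 *   - _mm_mullo_epi16(a, b) places ar*br and ai*bi in adjacent lanes; shifting
 *     that product right by 2 bytes and subtracting with _mm_subs_epi16 leaves
 *     the real part (ar*br - ai*bi) in the even lanes.
 *   - multiplying each input by a 2-byte left-shifted copy of the other yields
 *     the cross terms ar*bi and ai*br, which _mm_adds_epi16 sums into the odd
 *     lanes as the imaginary part.
 *   - mask_real/mask_imag keep only the valid lanes before both halves are
 *     OR-ed back into interleaved form.
 * Worked example for a single sample: a = 1 + 2j, b = 3 + 4j gives
 * real = 1*3 - 2*4 = -5 and imag = 1*4 + 2*3 = 10, i.e. -5 + 10j.
 */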


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_loadu_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_loadu_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        _mm_storeu_si128((__m128i*)dotProductVector,
                         result); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_loadu_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_loadu_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_storeu_si256((__m256i*)dotProductVector,
                            result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_load_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_load_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_store_si256((__m256i*)dotProductVector,
                           result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    *out = lv_cmake((int16_t)0, (int16_t)0);

    if (quarter_points > 0) {
        // for 2-lane vectors, 1st lane holds the real part,
        // 2nd lane holds the imaginary part
        int16x4x2_t a_val, b_val, c_val, accumulator;
        int16x4x2_t tmp_real, tmp_imag;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
        accumulator.val[0] = vdup_n_s16(0);
        accumulator.val[1] = vdup_n_s16(0);
        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

        for (number = 0; number < quarter_points; ++number) {
            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
            __VOLK_PREFETCH(a_ptr + 8);
            __VOLK_PREFETCH(b_ptr + 8);

            // multiply the real*real and imag*imag to get real result
            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

            // Multiply cross terms to get the imaginary result
            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);

            a_ptr += 4;
            b_ptr += 4;
        }

        vst2_s16((int16_t*)accum_result, accumulator);
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
        }

        *out = dotProduct;
    }

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
                                                       const lv_16sc_t* in_a,
                                                       const lv_16sc_t* in_b,
                                                       unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator;
    int16x4x2_t tmp;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator.val[0] = vdup_n_s16(0);
    accumulator.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);

        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    vst2_s16((int16_t*)accum_result, accumulator);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
                                                          const lv_16sc_t* in_a,
                                                          const lv_16sc_t* in_b,
                                                          unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator1, accumulator2;

    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator1.val[0] = vdup_n_s16(0);
    accumulator1.val[1] = vdup_n_s16(0);
    accumulator2.val[0] = vdup_n_s16(0);
    accumulator2.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/