34#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
35#define INCLUDED_volk_16ic_x2_multiply_16ic_H
45 unsigned int num_points)
48 for (n = 0; n < num_points; n++) {
49 result[n] = in_a[n] * in_b[n];
62 unsigned int num_points)
64 const unsigned int sse_iters = num_points / 4;
65 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
69 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
71 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
78 for (number = 0; number < sse_iters; number++) {
108 for (number = sse_iters * 4; number < num_points; ++number) {
109 *_out++ = (*_in_a++) * (*_in_b++);
116#include <emmintrin.h>
121 unsigned int num_points)
123 const unsigned int sse_iters = num_points / 4;
124 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
128 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
130 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
137 for (number = 0; number < sse_iters; number++) {
167 for (number = sse_iters * 4; number < num_points; ++number) {
168 *_out++ = (*_in_a++) * (*_in_b++);
175#include <immintrin.h>
177static inline void volk_16ic_x2_multiply_16ic_u_avx2(
lv_16sc_t* out,
180 unsigned int num_points)
182 unsigned int number = 0;
183 const unsigned int avx2_points = num_points / 8;
189 __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
191 const __m256i mask_imag = _mm256_set_epi8(0xFF,
223 const __m256i mask_real = _mm256_set_epi8(0,
256 for (; number < avx2_points; number++) {
257 a = _mm256_loadu_si256(
259 b = _mm256_loadu_si256(
261 c = _mm256_mullo_epi16(a, b);
263 c_sr = _mm256_srli_si256(c, 2);
265 real = _mm256_subs_epi16(c, c_sr);
266 real = _mm256_and_si256(
269 b_sl = _mm256_slli_si256(b, 2);
270 a_sl = _mm256_slli_si256(a, 2);
272 imag1 = _mm256_mullo_epi16(a, b_sl);
273 imag2 = _mm256_mullo_epi16(b, a_sl);
275 imag = _mm256_adds_epi16(imag1, imag2);
276 imag = _mm256_and_si256(imag, mask_imag);
278 result = _mm256_or_si256(real, imag);
280 _mm256_storeu_si256((__m256i*)_out, result);
287 number = avx2_points * 8;
288 for (; number < num_points; number++) {
289 *_out++ = (*_in_a++) * (*_in_b++);
296#include <immintrin.h>
298static inline void volk_16ic_x2_multiply_16ic_a_avx2(
lv_16sc_t* out,
301 unsigned int num_points)
303 unsigned int number = 0;
304 const unsigned int avx2_points = num_points / 8;
310 __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
312 const __m256i mask_imag = _mm256_set_epi8(0xFF,
344 const __m256i mask_real = _mm256_set_epi8(0,
377 for (; number < avx2_points; number++) {
378 a = _mm256_load_si256(
380 b = _mm256_load_si256(
382 c = _mm256_mullo_epi16(a, b);
384 c_sr = _mm256_srli_si256(c, 2);
386 real = _mm256_subs_epi16(c, c_sr);
387 real = _mm256_and_si256(
390 b_sl = _mm256_slli_si256(b, 2);
391 a_sl = _mm256_slli_si256(a, 2);
393 imag1 = _mm256_mullo_epi16(a, b_sl);
394 imag2 = _mm256_mullo_epi16(b, a_sl);
396 imag = _mm256_adds_epi16(imag1, imag2);
397 imag = _mm256_and_si256(imag, mask_imag);
399 result = _mm256_or_si256(real, imag);
401 _mm256_store_si256((__m256i*)_out, result);
408 number = avx2_points * 8;
409 for (; number < num_points; number++) {
410 *_out++ = (*_in_a++) * (*_in_b++);
/*!
 * \brief NEON complex multiply of two 16-bit integer complex vectors.
 *
 * Uses vld2_s16/vst2_s16 to deinterleave 4 complex samples into separate
 * real (val[0]) and imaginary (val[1]) 4-lane registers, so the complex
 * product is computed directly:
 *   real = a.r*b.r - a.i*b.i
 *   imag = a.r*b.i + a.i*b.r
 * A scalar tail loop handles the remaining num_points % 4 samples.
 *
 * NOTE(review): vmul_s16/vadd_s16/vsub_s16 wrap on 16-bit overflow, matching
 * the scalar path's behavior — confirm callers scale inputs.
 */
static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        // Deinterleaved load: val[0] = 4 reals, val[1] = 4 imags.
        a_val = vld2_s16((int16_t*)a_ptr);
        b_val = vld2_s16((int16_t*)b_ptr);

        // Real part: a.r*b.r - a.i*b.i
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

        // Imaginary part: a.r*b.i + a.i*b.r
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

        // Interleaving store writes 4 complex results.
        vst2_s16((int16_t*)out, c_val);

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    // Scalar tail: remaining num_points % 4 samples.
    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}