Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_16ic_x2_multiply_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        result[n] = in_a[n] * in_b[n];
    }
}

#endif /*LV_HAVE_GENERIC*/


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_load_si128(
            (__m128i*)_in_a); // load four complex samples (2-byte real, 2-byte imag each)
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // shift right by one int16 (2 bytes), filling with zeros
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a3.r*b3.r - a3.i*b3.i, ...

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_store_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
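/*
 * Illustration (not part of this VOLK header): the SSE2 kernel above packs four
 * complex samples per 128-bit register as interleaved (real, imag) int16 pairs and
 * builds real = a.r*b.r - a.i*b.i and imag = a.r*b.i + a.i*b.r per pair with the
 * mullo/shift/mask sequence. The hypothetical scalar sketch below mirrors that data
 * flow for a single pair: the four products truncate to 16 bits (as _mm_mullo_epi16
 * does) while the final combine saturates (as _mm_subs_epi16 / _mm_adds_epi16 do),
 * which is where the SIMD paths can differ from the generic kernel on overflow.
 * The example_* names are made up for this sketch.
 */
#include <stdint.h>

static inline int16_t example_sat16(int32_t v) /* hypothetical helper */
{
    if (v > INT16_MAX) return INT16_MAX;
    if (v < INT16_MIN) return INT16_MIN;
    return (int16_t)v;
}

static inline lv_16sc_t example_multiply_one_pair(lv_16sc_t x, lv_16sc_t y)
{
    /* 16-bit truncating products, matching _mm_mullo_epi16 */
    int16_t rr = (int16_t)(lv_creal(x) * lv_creal(y));
    int16_t ii = (int16_t)(lv_cimag(x) * lv_cimag(y));
    int16_t ri = (int16_t)(lv_creal(x) * lv_cimag(y));
    int16_t ir = (int16_t)(lv_cimag(x) * lv_creal(y));
    /* saturating combine, matching _mm_subs_epi16 / _mm_adds_epi16 */
    return lv_cmake(example_sat16((int32_t)rr - ii), example_sat16((int32_t)ri + ir));
}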

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_loadu_si128(
            (__m128i*)_in_a); // load four complex samples (2-byte real, 2-byte imag each)
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // shift right by one int16 (2 bytes), filling with zeros
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a3.r*b3.r - a3.i*b3.i, ...

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_storeu_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_loadu_si256(
            (__m256i*)_in_a); // load eight complex samples (2-byte real, 2-byte imag each)
        b = _mm256_loadu_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        // Byte shifts on 256-bit registers operate within each 128-bit lane, which is
        // fine here because the (real, imag) pairs never straddle a lane boundary.
        c_sr = _mm256_srli_si256(c, 2); // shift right by one int16 (2 bytes), filling with zeros
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a3.r*b3.r - a3.i*b3.i, ...

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_storeu_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_load_si256(
            (__m256i*)_in_a); // load eight complex samples (2-byte real, 2-byte imag each)
        b = _mm256_load_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift right by one int16 (2 bytes), filling with zeros
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a3.r*b3.r - a3.i*b3.i, ...

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_store_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        // vld2 deinterleaves: val[0] holds the four reals, val[1] the four imaginaries
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply real*real and imag*imag to build the real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

        // multiply the cross terms to build the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // combine and store the interleaved results
        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
        vst2_s16((int16_t*)out, c_val);

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/
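
For context, a minimal usage sketch follows; it is not part of this header. It assumes the usual VOLK entry points: the runtime dispatcher volk_16ic_x2_multiply_16ic() generated for this kernel, volk_malloc()/volk_free() with volk_get_alignment() for aligned buffers (so an aligned _a_ variant can be selected), and the lv_cmake()/lv_creal()/lv_cimag() helpers from volk_complex.h.

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 1024;
    const size_t alignment = volk_get_alignment();

    /* Aligned buffers let the dispatcher pick an aligned (_a_) implementation. */
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* out = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);

    for (unsigned int n = 0; n < num_points; n++) {
        in_a[n] = lv_cmake((int16_t)(n & 0x7F), (int16_t)1);
        in_b[n] = lv_cmake((int16_t)2, (int16_t)-1);
    }

    /* Element-wise complex multiply; VOLK dispatches to the best available kernel. */
    volk_16ic_x2_multiply_16ic(out, in_a, in_b, num_points);

    printf("out[1] = %d %+dj\n", (int)lv_creal(out[1]), (int)lv_cimag(out[1]));

    volk_free(in_a);
    volk_free(in_b);
    volk_free(out);
    return 0;
}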