Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_multiply_conjugate_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
11#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
12
13#include <inttypes.h>
14#include <limits.h>
15#include <stdio.h>
16#include <volk/volk_complex.h>
17
18#ifdef LV_HAVE_AVX2
19#include <immintrin.h>
28static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
29 const lv_8sc_t* aVector,
30 const lv_8sc_t* bVector,
31 unsigned int num_points)
32{
33 unsigned int number = 0;
34 const unsigned int quarterPoints = num_points / 8;
35
36 __m256i x, y, realz, imagz;
37 lv_16sc_t* c = cVector;
38 const lv_8sc_t* a = aVector;
39 const lv_8sc_t* b = bVector;
40 __m256i conjugateSign =
41 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
42
43 for (; number < quarterPoints; number++) {
44 // Convert 8 bit values into 16 bit values
45 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
46 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
47
48 // Calculate the ar*cr - ai*(-ci) portions
49 realz = _mm256_madd_epi16(x, y);
50
51 // Calculate the complex conjugate of the cr + ci j values
52 y = _mm256_sign_epi16(y, conjugateSign);
53
54 // Shift the order of the cr and ci values
55 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
56 _MM_SHUFFLE(2, 3, 0, 1));
57
58 // Calculate the ar*(-ci) + cr*(ai)
59 imagz = _mm256_madd_epi16(x, y);
60
61 // Perform the addition of products
62
63 _mm256_store_si256((__m256i*)c,
64 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
65 _mm256_unpackhi_epi32(realz, imagz)));
66
67 a += 8;
68 b += 8;
69 c += 8;
70 }
71
72 number = quarterPoints * 8;
73 int16_t* c16Ptr = (int16_t*)&cVector[number];
74 int8_t* a8Ptr = (int8_t*)&aVector[number];
75 int8_t* b8Ptr = (int8_t*)&bVector[number];
76 for (; number < num_points; number++) {
77 float aReal = (float)*a8Ptr++;
78 float aImag = (float)*a8Ptr++;
79 lv_32fc_t aVal = lv_cmake(aReal, aImag);
80 float bReal = (float)*b8Ptr++;
81 float bImag = (float)*b8Ptr++;
82 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
83 lv_32fc_t temp = aVal * bVal;
84
85 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
86 *c16Ptr++ = (int16_t)lv_cimag(temp);
87 }
88}
89#endif /* LV_HAVE_AVX2 */
90
91
92#ifdef LV_HAVE_SSE4_1
93#include <smmintrin.h>
102static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
103 const lv_8sc_t* aVector,
104 const lv_8sc_t* bVector,
105 unsigned int num_points)
106{
107 unsigned int number = 0;
108 const unsigned int quarterPoints = num_points / 4;
109
110 __m128i x, y, realz, imagz;
111 lv_16sc_t* c = cVector;
112 const lv_8sc_t* a = aVector;
113 const lv_8sc_t* b = bVector;
114 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
115
116 for (; number < quarterPoints; number++) {
117 // Convert into 8 bit values into 16 bit values
120
121 // Calculate the ar*cr - ai*(-ci) portions
122 realz = _mm_madd_epi16(x, y);
123
124 // Calculate the complex conjugate of the cr + ci j values
125 y = _mm_sign_epi16(y, conjugateSign);
126
127 // Shift the order of the cr and ci values
129 _MM_SHUFFLE(2, 3, 0, 1));
130
131 // Calculate the ar*(-ci) + cr*(ai)
132 imagz = _mm_madd_epi16(x, y);
133
136 _mm_unpackhi_epi32(realz, imagz)));
137
138 a += 4;
139 b += 4;
140 c += 4;
141 }
142
143 number = quarterPoints * 4;
144 int16_t* c16Ptr = (int16_t*)&cVector[number];
145 int8_t* a8Ptr = (int8_t*)&aVector[number];
146 int8_t* b8Ptr = (int8_t*)&bVector[number];
147 for (; number < num_points; number++) {
148 float aReal = (float)*a8Ptr++;
149 float aImag = (float)*a8Ptr++;
150 lv_32fc_t aVal = lv_cmake(aReal, aImag);
151 float bReal = (float)*b8Ptr++;
152 float bImag = (float)*b8Ptr++;
153 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
154 lv_32fc_t temp = aVal * bVal;
155
156 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
157 *c16Ptr++ = (int16_t)lv_cimag(temp);
158 }
159}
160#endif /* LV_HAVE_SSE4_1 */
161
162#ifdef LV_HAVE_GENERIC
172 const lv_8sc_t* aVector,
173 const lv_8sc_t* bVector,
174 unsigned int num_points)
175{
176 unsigned int number = 0;
177 int16_t* c16Ptr = (int16_t*)cVector;
178 int8_t* a8Ptr = (int8_t*)aVector;
179 int8_t* b8Ptr = (int8_t*)bVector;
180 for (number = 0; number < num_points; number++) {
181 float aReal = (float)*a8Ptr++;
182 float aImag = (float)*a8Ptr++;
183 lv_32fc_t aVal = lv_cmake(aReal, aImag);
184 float bReal = (float)*b8Ptr++;
185 float bImag = (float)*b8Ptr++;
186 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
187 lv_32fc_t temp = aVal * bVal;
188
189 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
190 *c16Ptr++ = (int16_t)lv_cimag(temp);
191 }
192}
193#endif /* LV_HAVE_GENERIC */
194
195#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
196
197#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
198#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
199
200#include <inttypes.h>
201#include <stdio.h>
202#include <volk/volk_complex.h>
203
204#ifdef LV_HAVE_AVX2
205#include <immintrin.h>
214static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
215 const lv_8sc_t* aVector,
216 const lv_8sc_t* bVector,
217 unsigned int num_points)
218{
219 unsigned int number = 0;
220 const unsigned int oneEigthPoints = num_points / 8;
221
222 __m256i x, y, realz, imagz;
223 lv_16sc_t* c = cVector;
224 const lv_8sc_t* a = aVector;
225 const lv_8sc_t* b = bVector;
226 __m256i conjugateSign =
227 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
228
229 for (; number < oneEigthPoints; number++) {
230 // Convert 8 bit values into 16 bit values
231 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
232 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
233
234 // Calculate the ar*cr - ai*(-ci) portions
235 realz = _mm256_madd_epi16(x, y);
236
237 // Calculate the complex conjugate of the cr + ci j values
238 y = _mm256_sign_epi16(y, conjugateSign);
239
240 // Shift the order of the cr and ci values
241 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
242 _MM_SHUFFLE(2, 3, 0, 1));
243
244 // Calculate the ar*(-ci) + cr*(ai)
245 imagz = _mm256_madd_epi16(x, y);
246
247 // Perform the addition of products
248
249 _mm256_storeu_si256((__m256i*)c,
250 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
251 _mm256_unpackhi_epi32(realz, imagz)));
252
253 a += 8;
254 b += 8;
255 c += 8;
256 }
257
258 number = oneEigthPoints * 8;
259 int16_t* c16Ptr = (int16_t*)&cVector[number];
260 int8_t* a8Ptr = (int8_t*)&aVector[number];
261 int8_t* b8Ptr = (int8_t*)&bVector[number];
262 for (; number < num_points; number++) {
263 float aReal = (float)*a8Ptr++;
264 float aImag = (float)*a8Ptr++;
265 lv_32fc_t aVal = lv_cmake(aReal, aImag);
266 float bReal = (float)*b8Ptr++;
267 float bImag = (float)*b8Ptr++;
268 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
269 lv_32fc_t temp = aVal * bVal;
270
271 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
272 *c16Ptr++ = (int16_t)lv_cimag(temp);
273 }
274}
275#endif /* LV_HAVE_AVX2 */
276
277#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */