Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
44#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46
47#include <inttypes.h>
48#include <stdio.h>
49#include <volk/volk_complex.h>
50
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 kernel (aligned variant): c[i] = a[i] * conj(b[i]) * (1 / scalar).
 *
 * cVector    - output, num_points complex floats. The vector loop writes it
 *              with _mm256_store_ps, which requires 32-byte alignment.
 * aVector    - input, num_points complex 8-bit integers. The vector loop
 *              reads it with _mm_load_si128, which requires 16-byte alignment.
 * bVector    - input, num_points complex 8-bit integers; conjugated before
 *              the multiply. Same alignment requirement as aVector.
 * scalar     - every output component is multiplied by 1 / scalar.
 * num_points - number of complex samples to process.
 *
 * Eight samples are processed per vector iteration. The remaining
 * num_points % 8 samples go through a scalar tail that repeats the vector
 * arithmetic step for step (integer products, conversion to float, multiply
 * by the reciprocal), so a sample's value does not depend on whether it was
 * handled by the vector loop or by the tail.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEighthPoints = num_points / 8;

    __m256i x, y, realz, imagz;
    __m256 ret, retlo, rethi;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;

    // _mm256_set_epi16 takes its arguments from the highest lane down, so the
    // even lanes (real parts) receive +1 and the odd lanes (imaginary parts) -1.
    const __m256i conjugateSign =
        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);

    const float invScalar = 1.0 / scalar;
    const __m256 invScalarVec = _mm256_set1_ps(invScalar);

    for (; number < oneEighthPoints; number++) {
        // Sign-extend 8 complex samples (16 bytes) of each input to 16-bit lanes
        x = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)a));
        y = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)b));

        // Real part of a * conj(b): ar*br + ai*bi
        realz = _mm256_madd_epi16(x, y);

        // Conjugate b: (br, bi) -> (br, -bi)
        y = _mm256_sign_epi16(y, conjugateSign);

        // Swap every real/imaginary pair: (br, -bi) -> (-bi, br)
        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                   _MM_SHUFFLE(2, 3, 0, 1));

        // Imaginary part of a * conj(b): ar*(-bi) + ai*br
        imagz = _mm256_madd_epi16(x, y);

        // Interleave (re, im) pairs, convert to float and apply 1 / scalar.
        // The unpacks work per 128-bit half, so retlo holds samples 0,1,4,5
        // and rethi holds samples 2,3,6,7.
        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
        retlo = _mm256_mul_ps(retlo, invScalarVec);

        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
        rethi = _mm256_mul_ps(rethi, invScalarVec);

        // 0x20: low half of retlo + low half of rethi -> samples 0..3
        ret = _mm256_permute2f128_ps(retlo, rethi, 0x20);
        _mm256_store_ps((float*)c, ret);
        c += 4;

        // 0x31: high half of retlo + high half of rethi -> samples 4..7
        ret = _mm256_permute2f128_ps(retlo, rethi, 0x31);
        _mm256_store_ps((float*)c, ret);
        c += 4;

        a += 8;
        b += 8;
    }

    // Scalar tail: a, b and c already point at the first unprocessed sample.
    const int8_t* a8Ptr = (const int8_t*)a;
    const int8_t* b8Ptr = (const int8_t*)b;
    float* cFloatPtr = (float*)c;
    for (number = oneEighthPoints * 8; number < num_points; number++) {
        const int aReal = *a8Ptr++;
        const int aImag = *a8Ptr++;
        const int bReal = *b8Ptr++;
        const int bImag = *b8Ptr++;

        // Same integer sums the madd instructions produce in the vector loop
        *cFloatPtr++ = (float)(aReal * bReal + aImag * bImag) * invScalar;
        *cFloatPtr++ = (float)(aImag * bReal - aReal * bImag) * invScalar;
    }
}
#endif /* LV_HAVE_AVX2*/
134
135
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * SSE4.1 kernel (aligned variant): c[i] = a[i] * conj(b[i]) * (1 / scalar).
 *
 * cVector    - output, num_points complex floats. The vector loop writes it
 *              with _mm_store_ps, which requires 16-byte alignment.
 * aVector    - input, num_points complex 8-bit integers.
 * bVector    - input, num_points complex 8-bit integers; conjugated before
 *              the multiply.
 * scalar     - every output component is multiplied by 1 / scalar.
 * num_points - number of complex samples to process.
 *
 * Four samples are processed per vector iteration. The remaining
 * num_points % 4 samples go through a scalar tail that repeats the vector
 * arithmetic step for step (integer products, conversion to float, multiply
 * by the reciprocal), so a sample's value does not depend on whether it was
 * handled by the vector loop or by the tail.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
                                                  const lv_8sc_t* aVector,
                                                  const lv_8sc_t* bVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128i x, y, realz, imagz;
    __m128 ret;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;

    // _mm_set_epi16 takes its arguments from the highest lane down, so the
    // even lanes (real parts) receive +1 and the odd lanes (imaginary parts) -1.
    const __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);

    const float invScalar = 1.0 / scalar;
    const __m128 invScalarVec = _mm_set_ps1(invScalar);

    for (; number < quarterPoints; number++) {
        // Load 4 complex samples (8 bytes) of each input and sign-extend the
        // 8-bit values to 16-bit lanes
        x = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)a));
        y = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)b));

        // Real part of a * conj(b): ar*br + ai*bi
        realz = _mm_madd_epi16(x, y);

        // Conjugate b: (br, bi) -> (br, -bi)
        y = _mm_sign_epi16(y, conjugateSign);

        // Swap every real/imaginary pair: (br, -bi) -> (-bi, br)
        y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                _MM_SHUFFLE(2, 3, 0, 1));

        // Imaginary part of a * conj(b): ar*(-bi) + ai*br
        imagz = _mm_madd_epi16(x, y);

        // Samples 0..1: interleave (re, im), convert to float, apply 1 / scalar
        ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
        ret = _mm_mul_ps(ret, invScalarVec);
        _mm_store_ps((float*)c, ret);
        c += 2;

        // Samples 2..3: interleave (re, im), convert to float, apply 1 / scalar
        ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
        ret = _mm_mul_ps(ret, invScalarVec);
        _mm_store_ps((float*)c, ret);
        c += 2;

        a += 4;
        b += 4;
    }

    // Scalar tail: a, b and c already point at the first unprocessed sample.
    const int8_t* a8Ptr = (const int8_t*)a;
    const int8_t* b8Ptr = (const int8_t*)b;
    float* cFloatPtr = (float*)c;
    for (number = quarterPoints * 4; number < num_points; number++) {
        const int aReal = *a8Ptr++;
        const int aImag = *a8Ptr++;
        const int bReal = *b8Ptr++;
        const int bImag = *b8Ptr++;

        // Same integer sums the madd instructions produce in the vector loop
        *cFloatPtr++ = (float)(aReal * bReal + aImag * bImag) * invScalar;
        *cFloatPtr++ = (float)(aImag * bReal - aReal * bImag) * invScalar;
    }
}
#endif /* LV_HAVE_SSE4_1 */
218
219
#ifdef LV_HAVE_GENERIC

/*
 * Portable kernel: c[i] = a[i] * conj(b[i]) * (1 / scalar).
 *
 * cVector    - output, num_points complex floats, written as interleaved
 *              (real, imaginary) float pairs.
 * aVector    - input, num_points complex 8-bit integers, read as interleaved
 *              (real, imaginary) int8_t pairs.
 * bVector    - input, num_points complex 8-bit integers; its imaginary part
 *              is negated before the multiply.
 * scalar     - every output component is multiplied by 1 / scalar.
 * num_points - number of complex samples to process.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
                                                 const lv_8sc_t* aVector,
                                                 const lv_8sc_t* bVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    unsigned int number;
    float* cPtr = (float*)cVector;
    const float invScalar = 1.0 / scalar;
    const int8_t* a8Ptr = (const int8_t*)aVector;
    const int8_t* b8Ptr = (const int8_t*)bVector;

    for (number = 0; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        // Negating the imaginary part builds conj(b)
        lv_32fc_t bVal = lv_cmake(bReal, -bImag);
        lv_32fc_t temp = aVal * bVal;

        *cPtr++ = (lv_creal(temp) * invScalar);
        *cPtr++ = (lv_cimag(temp) * invScalar);
    }
}
#endif /* LV_HAVE_GENERIC */
248
249
250#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
251
252#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
253#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
254
255#include <inttypes.h>
256#include <stdio.h>
257#include <volk/volk_complex.h>
258
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 kernel (unaligned variant): c[i] = a[i] * conj(b[i]) * (1 / scalar).
 *
 * cVector    - output, num_points complex floats; written with
 *              _mm256_storeu_ps, so no particular alignment is needed.
 * aVector    - input, num_points complex 8-bit integers; read with
 *              _mm_loadu_si128, so no particular alignment is needed.
 * bVector    - input, num_points complex 8-bit integers; conjugated before
 *              the multiply.
 * scalar     - every output component is multiplied by 1 / scalar.
 * num_points - number of complex samples to process.
 *
 * Eight samples are processed per vector iteration. The remaining
 * num_points % 8 samples go through a scalar tail that repeats the vector
 * arithmetic step for step (integer products, conversion to float, multiply
 * by the reciprocal), so a sample's value does not depend on whether it was
 * handled by the vector loop or by the tail.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEighthPoints = num_points / 8;

    __m256i x, y, realz, imagz;
    __m256 ret, retlo, rethi;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;

    // _mm256_set_epi16 takes its arguments from the highest lane down, so the
    // even lanes (real parts) receive +1 and the odd lanes (imaginary parts) -1.
    const __m256i conjugateSign =
        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);

    const float invScalar = 1.0 / scalar;
    const __m256 invScalarVec = _mm256_set1_ps(invScalar);

    for (; number < oneEighthPoints; number++) {
        // Sign-extend 8 complex samples (16 bytes) of each input to 16-bit lanes
        x = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*)a));
        y = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*)b));

        // Real part of a * conj(b): ar*br + ai*bi
        realz = _mm256_madd_epi16(x, y);

        // Conjugate b: (br, bi) -> (br, -bi)
        y = _mm256_sign_epi16(y, conjugateSign);

        // Swap every real/imaginary pair: (br, -bi) -> (-bi, br)
        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                   _MM_SHUFFLE(2, 3, 0, 1));

        // Imaginary part of a * conj(b): ar*(-bi) + ai*br
        imagz = _mm256_madd_epi16(x, y);

        // Interleave (re, im) pairs, convert to float and apply 1 / scalar.
        // The unpacks work per 128-bit half, so retlo holds samples 0,1,4,5
        // and rethi holds samples 2,3,6,7.
        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
        retlo = _mm256_mul_ps(retlo, invScalarVec);

        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
        rethi = _mm256_mul_ps(rethi, invScalarVec);

        // 0x20: low half of retlo + low half of rethi -> samples 0..3
        ret = _mm256_permute2f128_ps(retlo, rethi, 0x20);
        _mm256_storeu_ps((float*)c, ret);
        c += 4;

        // 0x31: high half of retlo + high half of rethi -> samples 4..7
        ret = _mm256_permute2f128_ps(retlo, rethi, 0x31);
        _mm256_storeu_ps((float*)c, ret);
        c += 4;

        a += 8;
        b += 8;
    }

    // Scalar tail: a, b and c already point at the first unprocessed sample.
    const int8_t* a8Ptr = (const int8_t*)a;
    const int8_t* b8Ptr = (const int8_t*)b;
    float* cFloatPtr = (float*)c;
    for (number = oneEighthPoints * 8; number < num_points; number++) {
        const int aReal = *a8Ptr++;
        const int aImag = *a8Ptr++;
        const int bReal = *b8Ptr++;
        const int bImag = *b8Ptr++;

        // Same integer sums the madd instructions produce in the vector loop
        *cFloatPtr++ = (float)(aReal * bReal + aImag * bImag) * invScalar;
        *cFloatPtr++ = (float)(aImag * bReal - aReal * bImag) * invScalar;
    }
}
#endif /* LV_HAVE_AVX2*/
342
343
344#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */