Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_8ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
43#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
44#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
45
46#include <inttypes.h>
47#include <stdio.h>
48#include <volk/volk_common.h>
49
50
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * Aligned SSE4.1 kernel: splits interleaved 8-bit I/Q samples into two float
 * buffers and multiplies every sample by 1/scalar.
 *
 * iBuffer       - output, receives num_points in-phase values
 * qBuffer       - output, receives num_points quadrature values
 * complexVector - input, read as 2 * num_points interleaved int8 values (I, Q, ...)
 * scalar        - divisor applied to every sample (via its reciprocal)
 * num_points    - number of complex samples to convert
 *
 * The vector loop uses aligned loads/stores (_mm_load_si128 / _mm_store_ps),
 * so all three buffers must be 16-byte aligned.
 */
static inline void
volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
                                           float* qBuffer,
                                           const lv_8sc_t* complexVector,
                                           const float scalar,
                                           unsigned int num_points)
{
    float* iBufferPtr = iBuffer;
    float* qBufferPtr = qBuffer;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
    __m128 iFloatValue, qFloatValue;

    const float iScalar = 1.0 / scalar;
    __m128 invScalar = _mm_set_ps1(iScalar);
    __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;

    /* Byte-shuffle controls: gather the even (I) or odd (Q) bytes into the low
     * 8 bytes; a control byte of 0x80 zeroes the corresponding output byte. */
    __m128i iMoveMask = _mm_set_epi8(
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
    __m128i qMoveMask = _mm_set_epi8(
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);

    /* Each iteration consumes 16 bytes = 8 complex samples. */
    for (; number < eighthPoints; number++) {
        complexVal = _mm_load_si128((const __m128i*)complexVectorPtr);
        complexVectorPtr += 16;
        iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
        qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);

        /* I samples 0..3: sign-extend to 32 bit, convert, scale, store. */
        iIntVal = _mm_cvtepi8_epi32(iComplexVal);
        iFloatValue = _mm_cvtepi32_ps(iIntVal);
        iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
        _mm_store_ps(iBufferPtr, iFloatValue);
        iBufferPtr += 4;

        /* Shift I samples 4..7 down into the low four bytes. */
        iComplexVal = _mm_srli_si128(iComplexVal, 4);

        iIntVal = _mm_cvtepi8_epi32(iComplexVal);
        iFloatValue = _mm_cvtepi32_ps(iIntVal);
        iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
        _mm_store_ps(iBufferPtr, iFloatValue);
        iBufferPtr += 4;

        /* Q samples 0..3. */
        qIntVal = _mm_cvtepi8_epi32(qComplexVal);
        qFloatValue = _mm_cvtepi32_ps(qIntVal);
        qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
        _mm_store_ps(qBufferPtr, qFloatValue);
        qBufferPtr += 4;

        /* Shift Q samples 4..7 down into the low four bytes. */
        qComplexVal = _mm_srli_si128(qComplexVal, 4);

        qIntVal = _mm_cvtepi8_epi32(qComplexVal);
        qFloatValue = _mm_cvtepi32_ps(qIntVal);
        qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
        _mm_store_ps(qBufferPtr, qFloatValue);

        qBufferPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 8) samples; the input
     * pointer already sits just past the last vector-processed byte. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
        *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    }
}
#endif /* LV_HAVE_SSE4_1 */
121
122
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * Aligned SSE kernel: splits interleaved 8-bit I/Q samples into two float
 * buffers and multiplies every sample by 1/scalar.
 *
 * iBuffer       - output, receives num_points in-phase values
 * qBuffer       - output, receives num_points quadrature values
 * complexVector - input, read as 2 * num_points interleaved int8 values (I, Q, ...)
 * scalar        - divisor applied to every sample (via its reciprocal)
 * num_points    - number of complex samples to convert
 *
 * Outputs are written with _mm_store_ps, so iBuffer and qBuffer must be
 * 16-byte aligned. The input is read byte-wise through a small aligned
 * staging buffer.
 */
static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
                                                           float* qBuffer,
                                                           const lv_8sc_t* complexVector,
                                                           const float scalar,
                                                           unsigned int num_points)
{
    float* iBufferPtr = iBuffer;
    float* qBufferPtr = qBuffer;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    __m128 cplxValue1, cplxValue2, iValue, qValue;

    /* One rounded reciprocal shared by the vector body and the scalar tail so
     * that every output sample is computed with the same arithmetic. */
    const float iScalar = 1.0 / scalar;
    __m128 invScalar = _mm_set_ps1(iScalar);
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;

    __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];

    /* Each iteration converts 8 bytes = 4 complex samples. */
    for (; number < quarterPoints; number++) {
        /* Widen int8 -> float in scalar code and stage for aligned loads. */
        floatBuffer[0] = (float)(complexVectorPtr[0]);
        floatBuffer[1] = (float)(complexVectorPtr[1]);
        floatBuffer[2] = (float)(complexVectorPtr[2]);
        floatBuffer[3] = (float)(complexVectorPtr[3]);

        floatBuffer[4] = (float)(complexVectorPtr[4]);
        floatBuffer[5] = (float)(complexVectorPtr[5]);
        floatBuffer[6] = (float)(complexVectorPtr[6]);
        floatBuffer[7] = (float)(complexVectorPtr[7]);

        cplxValue1 = _mm_load_ps(&floatBuffer[0]);
        cplxValue2 = _mm_load_ps(&floatBuffer[4]);

        complexVectorPtr += 8;

        cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
        cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);

        // Arrange in i1i2i3i4 format
        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
        // Arrange in q1q2q3q4 format
        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));

        _mm_store_ps(iBufferPtr, iValue);
        _mm_store_ps(qBufferPtr, qValue);

        iBufferPtr += 4;
        qBufferPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) samples. */
    number = quarterPoints * 4;
    complexVectorPtr = (const int8_t*)&complexVector[number];
    for (; number < num_points; number++) {
        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
        *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    }
}
#endif /* LV_HAVE_SSE */
182
183
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Aligned AVX2 kernel: splits interleaved 8-bit I/Q samples into two float
 * buffers and multiplies every sample by 1/scalar.
 *
 * iBuffer       - output, receives num_points in-phase values
 * qBuffer       - output, receives num_points quadrature values
 * complexVector - input, read as 2 * num_points interleaved int8 values (I, Q, ...)
 * scalar        - divisor applied to every sample (via its reciprocal)
 * num_points    - number of complex samples to convert
 *
 * The vector loop uses aligned loads/stores (_mm256_load_si256 /
 * _mm256_store_ps), so all three buffers must be 32-byte aligned.
 */
static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
                                                            float* qBuffer,
                                                            const lv_8sc_t* complexVector,
                                                            const float scalar,
                                                            unsigned int num_points)
{
    float* iBufferPtr = iBuffer;
    float* qBufferPtr = qBuffer;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;
    __m256 iFloatValue, qFloatValue;

    const float iScalar = 1.0 / scalar;
    __m256 invScalar = _mm256_set1_ps(iScalar);
    __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;

    /* Byte-shuffle controls, applied independently to each 128-bit lane:
     * gather the even (I) or odd (Q) bytes of the lane into its low 8 bytes;
     * a control byte of 0x80 zeroes the corresponding output byte. */
    __m256i iMoveMask = _mm256_set_epi8(
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
    __m256i qMoveMask = _mm256_set_epi8(
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);

    /* Each iteration consumes 32 bytes = 16 complex samples. */
    for (; number < sixteenthPoints; number++) {
        complexVal = _mm256_load_si256((const __m256i*)complexVectorPtr);
        complexVectorPtr += 32;
        iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
        qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);

        /* I samples 0..7 sit in the low 8 bytes of the lower lane. */
        iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
        _mm256_store_ps(iBufferPtr, iFloatValue);
        iBufferPtr += 8;

        /* 0xC6 picks 64-bit elements (2, 1, 0, 3): moves I samples 8..15 from
         * the upper lane into element 0 (written in hex; 0b literals are not
         * standard C before C23). */
        iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0xC6);
        iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
        _mm256_store_ps(iBufferPtr, iFloatValue);
        iBufferPtr += 8;

        /* Q samples 0..7. */
        qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
        _mm256_store_ps(qBufferPtr, qFloatValue);
        qBufferPtr += 8;

        /* Same element swap to bring Q samples 8..15 into element 0. */
        qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0xC6);
        qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
        _mm256_store_ps(qBufferPtr, qFloatValue);
        qBufferPtr += 8;
    }

    /* Scalar tail for the remaining (num_points % 16) samples; the input
     * pointer already sits just past the last vector-processed byte. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
        *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    }
}
#endif /* LV_HAVE_AVX2 */
310
311
#ifdef LV_HAVE_GENERIC

/*
 * Portable reference kernel: splits interleaved 8-bit I/Q samples into two
 * float buffers and multiplies every sample by 1/scalar.
 *
 * iBuffer       - output, receives num_points in-phase values
 * qBuffer       - output, receives num_points quadrature values
 * complexVector - input, read as 2 * num_points interleaved int8 values (I, Q, ...)
 * scalar        - divisor applied to every sample (via its reciprocal)
 * num_points    - number of complex samples to convert
 */
static inline void
volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
                                          float* qBuffer,
                                          const lv_8sc_t* complexVector,
                                          const float scalar,
                                          unsigned int num_points)
{
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
    float* iBufferPtr = iBuffer;
    float* qBufferPtr = qBuffer;
    unsigned int number;
    /* Reciprocal computed once so the loop body is a single multiply. */
    const float invScalar = 1.0 / scalar;
    for (number = 0; number < num_points; number++) {
        *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
        *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
    }
}
#endif /* LV_HAVE_GENERIC */
332
333
334#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
335
336
337#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
338#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
339
340#include <inttypes.h>
341#include <stdio.h>
342#include <volk/volk_common.h>
343
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Unaligned AVX2 kernel: splits interleaved 8-bit I/Q samples into two float
 * buffers and multiplies every sample by 1/scalar.
 *
 * iBuffer       - output, receives num_points in-phase values
 * qBuffer       - output, receives num_points quadrature values
 * complexVector - input, read as 2 * num_points interleaved int8 values (I, Q, ...)
 * scalar        - divisor applied to every sample (via its reciprocal)
 * num_points    - number of complex samples to convert
 *
 * Only unaligned loads/stores (_mm256_loadu_si256 / _mm256_storeu_ps) are
 * used, so no buffer alignment is required.
 */
static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
                                                            float* qBuffer,
                                                            const lv_8sc_t* complexVector,
                                                            const float scalar,
                                                            unsigned int num_points)
{
    float* iBufferPtr = iBuffer;
    float* qBufferPtr = qBuffer;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;
    __m256 iFloatValue, qFloatValue;

    const float iScalar = 1.0 / scalar;
    __m256 invScalar = _mm256_set1_ps(iScalar);
    __m256i complexVal, iIntVal, qIntVal;
    __m128i iComplexVal, qComplexVal;
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;

    /* Byte-shuffle control, applied independently to each 128-bit lane: even
     * (I) bytes go to the low 8 bytes, odd (Q) bytes to the high 8 bytes. */
    __m256i MoveMask = _mm256_set_epi8(
        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);

    /* Each iteration consumes 32 bytes = 16 complex samples. */
    for (; number < sixteenthPoints; number++) {
        complexVal = _mm256_loadu_si256((const __m256i*)complexVectorPtr);
        complexVectorPtr += 32;
        complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
        /* 0xd8 picks 64-bit elements (0, 2, 1, 3): the lower lane becomes all
         * 16 I bytes, the upper lane all 16 Q bytes. */
        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
        iComplexVal = _mm256_extractf128_si256(complexVal, 0);
        qComplexVal = _mm256_extractf128_si256(complexVal, 1);

        /* Samples 0..7: sign-extend the low 8 bytes, convert, scale, store. */
        iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
        _mm256_storeu_ps(iBufferPtr, iFloatValue);
        iBufferPtr += 8;

        qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
        _mm256_storeu_ps(qBufferPtr, qFloatValue);
        qBufferPtr += 8;

        /* Per-lane 8-byte right shift brings samples 8..15 into the low half
         * of each lane. */
        complexVal = _mm256_srli_si256(complexVal, 8);
        iComplexVal = _mm256_extractf128_si256(complexVal, 0);
        qComplexVal = _mm256_extractf128_si256(complexVal, 1);

        iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
        iFloatValue = _mm256_cvtepi32_ps(iIntVal);
        iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
        _mm256_storeu_ps(iBufferPtr, iFloatValue);
        iBufferPtr += 8;

        qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
        qFloatValue = _mm256_cvtepi32_ps(qIntVal);
        qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
        _mm256_storeu_ps(qBufferPtr, qFloatValue);
        qBufferPtr += 8;
    }

    /* Scalar tail for the remaining (num_points % 16) samples; the input
     * pointer already sits just past the last vector-processed byte. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
        *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
    }
}
#endif /* LV_HAVE_AVX2 */
443
444#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */