Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
41#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
42#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Deinterleaves 8-bit complex samples into separate 16-bit I and Q
 * vectors, scaling every component by 256 (AVX2, aligned loads/stores).
 *
 * Sixteen complex samples are converted per SIMD iteration; any remaining
 * samples are handled by a scalar loop.
 *
 * \param iBuffer       Output: in-phase components, num_points values.
 * \param qBuffer       Output: quadrature components, num_points values.
 * \param complexVector Input: interleaved (I, Q) 8-bit pairs.
 * \param num_points    Number of complex samples to convert.
 *
 * The aligned load/store intrinsics used here require all three buffers to
 * be 32-byte aligned.
 */
static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
                                                       int16_t* qBuffer,
                                                       const lv_8sc_t* complexVector,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    /* Keep the input const-qualified; the kernel only reads from it. */
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;

    /* Within each 128-bit lane: even bytes (I) are gathered into the low
     * 8 bytes and odd bytes (Q) into the high 8 bytes. */
    const __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                             14, 12, 10, 8, 6, 4, 2, 0,
                                             15, 13, 11, 9, 7, 5, 3, 1,
                                             14, 12, 10, 8, 6, 4, 2, 0);

    const unsigned int sixteenthPoints = num_points / 16;

    for (number = 0; number < sixteenthPoints; number++) {
        __m256i complexVal, iOutputVal, qOutputVal;
        __m128i iOutputVal0, qOutputVal0;

        complexVal = _mm256_load_si256((const __m256i*)complexVectorPtr);
        complexVectorPtr += 32;

        /* After the in-lane shuffle the 64-bit quarters are [I0, Q0, I1, Q1];
         * 0xd8 swaps the middle two so the low half holds all I bytes and
         * the high half holds all Q bytes. */
        complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);

        iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
        qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);

        /* Sign-extend each byte to 16 bits, then shift left by 8 (x256). */
        iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
        iOutputVal = _mm256_slli_epi16(iOutputVal, 8);

        qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
        qOutputVal = _mm256_slli_epi16(qOutputVal, 8);

        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
        _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);

        iBufferPtr += 16;
        qBufferPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole SIMD block. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    }
}
#endif /* LV_HAVE_AVX2 */
128
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Deinterleaves 8-bit complex samples into separate 16-bit I and Q
 * vectors, scaling every component by 256 (SSE4.1, aligned loads/stores).
 *
 * Eight complex samples are converted per SIMD iteration; any remaining
 * samples are handled by a scalar loop.
 *
 * \param iBuffer       Output: in-phase components, num_points values.
 * \param qBuffer       Output: quadrature components, num_points values.
 * \param complexVector Input: interleaved (I, Q) 8-bit pairs.
 * \param num_points    Number of complex samples to convert.
 *
 * The aligned load/store intrinsics used here require all three buffers to
 * be 16-byte aligned.
 */
static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
                                                         int16_t* qBuffer,
                                                         const lv_8sc_t* complexVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    /* Keep the input const-qualified; the kernel only reads from it. */
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;

    /* Shuffle masks: gather the even (I) or odd (Q) bytes into the low
     * 8 bytes; 0x80 selectors zero the unused high 8 bytes. */
    const __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80,
                                           0x80, 0x80, 0x80, 0x80,
                                           14, 12, 10, 8, 6, 4, 2, 0);
    const __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80,
                                           0x80, 0x80, 0x80, 0x80,
                                           15, 13, 11, 9, 7, 5, 3, 1);

    const unsigned int eighthPoints = num_points / 8;

    for (number = 0; number < eighthPoints; number++) {
        __m128i complexVal, iOutputVal, qOutputVal;

        complexVal = _mm_load_si128((const __m128i*)complexVectorPtr); /* aligned load */
        complexVectorPtr += 16;

        iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
        qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);

        /* Sign-extend the low 8 bytes to 16 bits, then shift left by 8
         * (x256), shifting in zeros. */
        iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
        iOutputVal = _mm_slli_epi16(iOutputVal, 8);

        qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
        qOutputVal = _mm_slli_epi16(qOutputVal, 8);

        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); /* aligned store */
        _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);

        iBufferPtr += 8;
        qBufferPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole SIMD block. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    }
}
#endif /* LV_HAVE_SSE4_1 */
196
197
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Deinterleaves 8-bit complex samples into separate 16-bit I and Q
 * vectors, scaling every component by 256 (AVX, aligned loads/stores).
 *
 * Data is moved with 256-bit loads and stores, while the integer shuffling
 * and widening are done on the two 128-bit halves. Sixteen complex samples
 * are converted per SIMD iteration; any remaining samples are handled by a
 * scalar loop.
 *
 * \param iBuffer       Output: in-phase components, num_points values.
 * \param qBuffer       Output: quadrature components, num_points values.
 * \param complexVector Input: interleaved (I, Q) 8-bit pairs.
 * \param num_points    Number of complex samples to convert.
 *
 * The aligned load/store intrinsics used here require all three buffers to
 * be 32-byte aligned.
 */
static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
                                                      int16_t* qBuffer,
                                                      const lv_8sc_t* complexVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    /* Keep the input const-qualified; the kernel only reads from it. */
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;

    /* Shuffle masks: gather the even (I) or odd (Q) bytes into the low
     * 8 bytes; 0x80 selectors zero the unused high 8 bytes. */
    const __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80,
                                           0x80, 0x80, 0x80, 0x80,
                                           14, 12, 10, 8, 6, 4, 2, 0);
    const __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80,
                                           0x80, 0x80, 0x80, 0x80,
                                           15, 13, 11, 9, 7, 5, 3, 1);

    const unsigned int sixteenthPoints = num_points / 16;

    for (number = 0; number < sixteenthPoints; number++) {
        __m256i complexVal, iOutputVal, qOutputVal;
        __m128i complexVal1, complexVal0;
        __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;

        complexVal = _mm256_load_si256((const __m256i*)complexVectorPtr); /* aligned load */
        complexVectorPtr += 32;

        /* Split into 128-bit halves so the 128-bit integer ops can be used. */
        complexVal1 = _mm256_extractf128_si256(complexVal, 1);
        complexVal0 = _mm256_extractf128_si256(complexVal, 0);

        iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask);
        iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
        qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
        qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);

        /* Sign-extend the low 8 bytes to 16 bits, then shift left by 8
         * (x256), shifting in zeros. */
        iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1);
        iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8);
        iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
        iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);

        qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
        qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
        qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
        qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);

        /* Recombine the halves: the cast places half 0 in the low 128 bits
         * (upper bits unspecified) and the insert then overwrites the upper
         * 128 bits with half 1, so no zeroed temporary is needed. */
        iOutputVal = _mm256_insertf128_si256(
            _mm256_castsi128_si256(iOutputVal0), iOutputVal1, 1);
        qOutputVal = _mm256_insertf128_si256(
            _mm256_castsi128_si256(qOutputVal0), qOutputVal1, 1);

        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); /* aligned store */
        _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);

        iBufferPtr += 16;
        qBufferPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole SIMD block. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    }
}
#endif /* LV_HAVE_AVX */
285
286
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable reference kernel: splits interleaved 8-bit complex samples
 * into separate 16-bit I and Q vectors, multiplying each component by 256.
 *
 * \param iBuffer       Output: in-phase components, num_points values.
 * \param qBuffer       Output: quadrature components, num_points values.
 * \param complexVector Input: interleaved (I, Q) 8-bit pairs.
 * \param num_points    Number of complex samples to convert.
 */
static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
                                                        int16_t* qBuffer,
                                                        const lv_8sc_t* complexVector,
                                                        unsigned int num_points)
{
    const int8_t* src = (const int8_t*)complexVector;
    int16_t* iOut = iBuffer;
    int16_t* qOut = qBuffer;
    unsigned int remaining;

    /* One complex sample per pass: src[0] is I, src[1] is Q. */
    for (remaining = num_points; remaining != 0; --remaining) {
        *iOut++ = (int16_t)src[0] * 256;
        *qOut++ = (int16_t)src[1] * 256;
        src += 2;
    }
}
#endif /* LV_HAVE_GENERIC */
304
305
306#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
307
308#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
309#define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
310
311#include <inttypes.h>
312#include <stdio.h>
313
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Deinterleaves 8-bit complex samples into separate 16-bit I and Q
 * vectors, scaling every component by 256 (AVX2, unaligned loads/stores).
 *
 * Sixteen complex samples are converted per SIMD iteration; any remaining
 * samples are handled by a scalar loop. Unaligned load/store intrinsics are
 * used, so the buffers need no particular alignment.
 *
 * \param iBuffer       Output: in-phase components, num_points values.
 * \param qBuffer       Output: quadrature components, num_points values.
 * \param complexVector Input: interleaved (I, Q) 8-bit pairs.
 * \param num_points    Number of complex samples to convert.
 */
static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
                                                       int16_t* qBuffer,
                                                       const lv_8sc_t* complexVector,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    /* Keep the input const-qualified; the kernel only reads from it. */
    const int8_t* complexVectorPtr = (const int8_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;

    /* Within each 128-bit lane: even bytes (I) are gathered into the low
     * 8 bytes and odd bytes (Q) into the high 8 bytes. */
    const __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                             14, 12, 10, 8, 6, 4, 2, 0,
                                             15, 13, 11, 9, 7, 5, 3, 1,
                                             14, 12, 10, 8, 6, 4, 2, 0);

    const unsigned int sixteenthPoints = num_points / 16;

    for (number = 0; number < sixteenthPoints; number++) {
        __m256i complexVal, iOutputVal, qOutputVal;
        __m128i iOutputVal0, qOutputVal0;

        complexVal = _mm256_loadu_si256((const __m256i*)complexVectorPtr);
        complexVectorPtr += 32;

        /* After the in-lane shuffle the 64-bit quarters are [I0, Q0, I1, Q1];
         * 0xd8 swaps the middle two so the low half holds all I bytes and
         * the high half holds all Q bytes. */
        complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
        complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);

        iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
        qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);

        /* Sign-extend each byte to 16 bits, then shift left by 8 (x256). */
        iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
        iOutputVal = _mm256_slli_epi16(iOutputVal, 8);

        qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
        qOutputVal = _mm256_slli_epi16(qOutputVal, 8);

        _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
        _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);

        iBufferPtr += 16;
        qBufferPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole SIMD block. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
        *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
    }
}
#endif /* LV_HAVE_AVX2 */
395#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */