Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
41#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
42#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
47
48#ifdef LV_HAVE_AVX2
49#include <immintrin.h>
50
51static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
52 const lv_16sc_t* complexVector,
53 unsigned int num_points)
54{
55 unsigned int number = 0;
56 const int16_t* complexVectorPtr = (int16_t*)complexVector;
57 int16_t* iBufferPtr = iBuffer;
58
59 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
60 0x80,
61 0x80,
62 0x80,
63 0x80,
64 0x80,
65 0x80,
66 0x80,
67 13,
68 12,
69 9,
70 8,
71 5,
72 4,
73 1,
74 0,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 0x80,
81 0x80,
82 0x80,
83 13,
84 12,
85 9,
86 8,
87 5,
88 4,
89 1,
90 0);
91 __m256i iMoveMask2 = _mm256_set_epi8(13,
92 12,
93 9,
94 8,
95 5,
96 4,
97 1,
98 0,
99 0x80,
100 0x80,
101 0x80,
102 0x80,
103 0x80,
104 0x80,
105 0x80,
106 0x80,
107 13,
108 12,
109 9,
110 8,
111 5,
112 4,
113 1,
114 0,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80,
120 0x80,
121 0x80,
122 0x80);
123
124 __m256i complexVal1, complexVal2, iOutputVal;
125
126 unsigned int sixteenthPoints = num_points / 16;
127
128 for (number = 0; number < sixteenthPoints; number++) {
129 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
130 complexVectorPtr += 16;
131 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
132 complexVectorPtr += 16;
133
134 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
135 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
136
137 iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
138 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
139
140 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
141
142 iBufferPtr += 16;
143 }
144
145 number = sixteenthPoints * 16;
146 for (; number < num_points; number++) {
147 *iBufferPtr++ = *complexVectorPtr++;
148 complexVectorPtr++;
149 }
150}
151#endif /* LV_HAVE_AVX2 */
152
153#ifdef LV_HAVE_SSSE3
154#include <tmmintrin.h>
155
156static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
157 const lv_16sc_t* complexVector,
158 unsigned int num_points)
159{
160 unsigned int number = 0;
161 const int16_t* complexVectorPtr = (int16_t*)complexVector;
162 int16_t* iBufferPtr = iBuffer;
163
164 __m128i iMoveMask1 = _mm_set_epi8(
165 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
166 __m128i iMoveMask2 = _mm_set_epi8(
167 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
168
169 __m128i complexVal1, complexVal2, iOutputVal;
170
171 unsigned int eighthPoints = num_points / 8;
172
173 for (number = 0; number < eighthPoints; number++) {
174 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
175 complexVectorPtr += 8;
176 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
177 complexVectorPtr += 8;
178
179 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
180 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
181
182 iOutputVal = _mm_or_si128(complexVal1, complexVal2);
183
184 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
185
186 iBufferPtr += 8;
187 }
188
189 number = eighthPoints * 8;
190 for (; number < num_points; number++) {
191 *iBufferPtr++ = *complexVectorPtr++;
192 complexVectorPtr++;
193 }
194}
195#endif /* LV_HAVE_SSSE3 */
196
197
198#ifdef LV_HAVE_SSE2
199#include <emmintrin.h>
200
201static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
202 const lv_16sc_t* complexVector,
203 unsigned int num_points)
204{
205 unsigned int number = 0;
206 const int16_t* complexVectorPtr = (int16_t*)complexVector;
207 int16_t* iBufferPtr = iBuffer;
208 __m128i complexVal1, complexVal2, iOutputVal;
209 __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
210 __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
211
212 unsigned int eighthPoints = num_points / 8;
213
214 for (number = 0; number < eighthPoints; number++) {
215 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
216 complexVectorPtr += 8;
217 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
218 complexVectorPtr += 8;
219
220 complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
221
222 complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
223
224 complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
225
226 complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
227
228 complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
229
230 complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
231
232 iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
233 _mm_and_si128(complexVal2, highMask));
234
235 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
236
237 iBufferPtr += 8;
238 }
239
240 number = eighthPoints * 8;
241 for (; number < num_points; number++) {
242 *iBufferPtr++ = *complexVectorPtr++;
243 complexVectorPtr++;
244 }
245}
246#endif /* LV_HAVE_SSE2 */
247
248#ifdef LV_HAVE_GENERIC
249
250static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
251 const lv_16sc_t* complexVector,
252 unsigned int num_points)
253{
254 unsigned int number = 0;
255 const int16_t* complexVectorPtr = (int16_t*)complexVector;
256 int16_t* iBufferPtr = iBuffer;
257 for (number = 0; number < num_points; number++) {
258 *iBufferPtr++ = *complexVectorPtr++;
259 complexVectorPtr++;
260 }
261}
262#endif /* LV_HAVE_GENERIC */
263
264
265#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
266
267
268#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
269#define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
270
271#include <inttypes.h>
272#include <stdio.h>
273
274
275#ifdef LV_HAVE_AVX2
276#include <immintrin.h>
277
278static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
279 const lv_16sc_t* complexVector,
280 unsigned int num_points)
281{
282 unsigned int number = 0;
283 const int16_t* complexVectorPtr = (int16_t*)complexVector;
284 int16_t* iBufferPtr = iBuffer;
285
286 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
287 0x80,
288 0x80,
289 0x80,
290 0x80,
291 0x80,
292 0x80,
293 0x80,
294 13,
295 12,
296 9,
297 8,
298 5,
299 4,
300 1,
301 0,
302 0x80,
303 0x80,
304 0x80,
305 0x80,
306 0x80,
307 0x80,
308 0x80,
309 0x80,
310 13,
311 12,
312 9,
313 8,
314 5,
315 4,
316 1,
317 0);
318 __m256i iMoveMask2 = _mm256_set_epi8(13,
319 12,
320 9,
321 8,
322 5,
323 4,
324 1,
325 0,
326 0x80,
327 0x80,
328 0x80,
329 0x80,
330 0x80,
331 0x80,
332 0x80,
333 0x80,
334 13,
335 12,
336 9,
337 8,
338 5,
339 4,
340 1,
341 0,
342 0x80,
343 0x80,
344 0x80,
345 0x80,
346 0x80,
347 0x80,
348 0x80,
349 0x80);
350
351 __m256i complexVal1, complexVal2, iOutputVal;
352
353 unsigned int sixteenthPoints = num_points / 16;
354
355 for (number = 0; number < sixteenthPoints; number++) {
356 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
357 complexVectorPtr += 16;
358 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
359 complexVectorPtr += 16;
360
361 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
362 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
363
364 iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
365 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
366
367 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
368
369 iBufferPtr += 16;
370 }
371
372 number = sixteenthPoints * 16;
373 for (; number < num_points; number++) {
374 *iBufferPtr++ = *complexVectorPtr++;
375 complexVectorPtr++;
376 }
377}
378#endif /* LV_HAVE_AVX2 */
379
380#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */