Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_binary_slicer_8i.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
#define INCLUDED_volk_32f_binary_slicer_8i_H


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
                                                      const float* aVector,
                                                      unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
                                                                 const float* aVector,
                                                                 unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_GENERIC */
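
/*
 * Minimal usage sketch (an illustration only, not part of the documented API):
 * calling the generic kernel above directly on a small buffer. Real code would
 * normally go through the volk_32f_binary_slicer_8i() dispatcher from
 * <volk/volk.h> so the fastest available SIMD variant is selected at runtime.
 *
 *   float in[8] = { -1.0f, 0.0f, 0.5f, -0.5f, 3.0f, -3.0f, 2.5f, -2.5f };
 *   int8_t out[8];
 *
 *   volk_32f_binary_slicer_8i_generic(out, in, 8);
 *   // out now holds 0, 1, 1, 0, 1, 0, 1, 0
 */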


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);
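    // note: the same 16-byte pattern appears twice because _mm256_shuffle_epi8
    // shuffles bytes within each 128-bit lane independently, so each lane
    // needs its own copy of the control bytes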

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_load_ps(aPtr);
        a1_val = _mm256_load_ps(aPtr + 8);
        a2_val = _mm256_load_ps(aPtr + 16);
        a3_val = _mm256_load_ps(aPtr + 24);

        // compare >= 0; return float
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31
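        // (a true compare result is an all-ones bit pattern, which as a float
        // is a NaN; _mm256_cvtps_epi32 converts that to 0x80000000, and the
        // logical shift right by 31 then leaves exactly 1, while a false
        // result stays 0)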
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0: (after permute4x64_epi64)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        // after this shuffle the bytes are in natural order:
        // a0 .. a7, b0 .. b7, c0 .. c7, d0 .. d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_store_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_loadu_ps(aPtr);
        a1_val = _mm256_loadu_ps(aPtr + 8);
        a2_val = _mm256_loadu_ps(aPtr + 16);
        a3_val = _mm256_loadu_ps(aPtr + 24);

        // compare >= 0; return float
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0: (after permute4x64_epi64)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        // after this shuffle the bytes are in natural order:
        // a0 .. a7, b0 .. b7, c0 .. c7, d0 .. d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_storeu_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_load_ps(aPtr);
        a1_val = _mm_load_ps(aPtr + 4);
        a2_val = _mm_load_ps(aPtr + 8);
        a3_val = _mm_load_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);
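        // note: with a single 128-bit register, the two pack steps above
        // already leave the 16 bytes in memory order (a0..a3, b0..b3, c0..c3,
        // d0..d3), so no lane fix-up is needed here, unlike the AVX2 kernels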

        _mm_store_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_loadu_ps(aPtr);
        a1_val = _mm_loadu_ps(aPtr + 4);
        a2_val = _mm_loadu_ps(aPtr + 8);
        a3_val = _mm_loadu_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_storeu_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
                                                   const float* aVector,
                                                   unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    float32x4x2_t input_val0, input_val1;
    float32x4_t zero_val;
    uint32x4x2_t res0_u32, res1_u32;
    uint16x4x2_t res0_u16x4, res1_u16x4;
    uint16x8x2_t res_u16x8;
    uint8x8x2_t res_u8;
    uint8x8_t one;

    zero_val = vdupq_n_f32(0.0);
    one = vdup_n_u8(0x01);

    // TODO: this is a good candidate for asm because the vcombines
    // can be eliminated simply by picking dst registers that are
    // adjacent.
    for (number = 0; number < n16points; number++) {
        input_val0 = vld2q_f32(aPtr);
        input_val1 = vld2q_f32(aPtr + 8);
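        // (vld2q_f32 de-interleaves: even-indexed samples land in val[0] and
        // odd-indexed samples in val[1]; the matching vst2_u8 at the end of
        // the loop re-interleaves them, so the output order matches the input)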

        // test against 0; return uint32
        res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
        res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
        res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
        res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

        // narrow uint32 -> uint16 followed by combine to 8-element vectors
        res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
        res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
        res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
        res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

        res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
        res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

        // narrow uint16x8 -> uint8x8
        res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
        res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
        // We *could* load twice as much data and do another vcombine here to
        // get a uint8x16x2 vector, still doing only two vandq's and a single
        // store, but that turns out to be ~16% slower than this version on a
        // zc702. Register contention in the GCC scheduler possibly slows it
        // down, and a hand-written asm version with quad-word u8 registers is
        // much faster.

        res_u8.val[0] = vand_u8(one, res_u8.val[0]);
        res_u8.val[1] = vand_u8(one, res_u8.val[1]);

        vst2_u8((unsigned char*)cPtr, res_u8);
        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_NEON */


#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */