Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
55#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
56#define INCLUDED_volk_32f_s32f_convert_16i_u_H
57
58#include <inttypes.h>
59#include <limits.h>
60#include <stdio.h>
61
62#ifdef LV_HAVE_AVX2
63#include <immintrin.h>
64
65static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
66 const float* inputVector,
67 const float scalar,
68 unsigned int num_points)
69{
70 unsigned int number = 0;
71
72 const unsigned int sixteenthPoints = num_points / 16;
73
74 const float* inputVectorPtr = (const float*)inputVector;
75 int16_t* outputVectorPtr = outputVector;
76
77 float min_val = SHRT_MIN;
78 float max_val = SHRT_MAX;
79 float r;
80
81 __m256 vScalar = _mm256_set1_ps(scalar);
82 __m256 inputVal1, inputVal2;
83 __m256i intInputVal1, intInputVal2;
84 __m256 ret1, ret2;
85 __m256 vmin_val = _mm256_set1_ps(min_val);
86 __m256 vmax_val = _mm256_set1_ps(max_val);
87
88 for (; number < sixteenthPoints; number++) {
89 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
90 inputVectorPtr += 8;
91 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
92 inputVectorPtr += 8;
93
94 // Scale and clip
95 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
96 vmin_val);
97 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
98 vmin_val);
99
100 intInputVal1 = _mm256_cvtps_epi32(ret1);
101 intInputVal2 = _mm256_cvtps_epi32(ret2);
102
103 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
104 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
105
106 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
107 outputVectorPtr += 16;
108 }
109
110 number = sixteenthPoints * 16;
111 for (; number < num_points; number++) {
112 r = inputVector[number] * scalar;
113 if (r > max_val)
114 r = max_val;
115 else if (r < min_val)
116 r = min_val;
117 outputVector[number] = (int16_t)rintf(r);
118 }
119}
120#endif /* LV_HAVE_AVX2 */
121
122
123#ifdef LV_HAVE_AVX
124#include <immintrin.h>
125
126static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
127 const float* inputVector,
128 const float scalar,
129 unsigned int num_points)
130{
131 unsigned int number = 0;
132
133 const unsigned int eighthPoints = num_points / 8;
134
135 const float* inputVectorPtr = (const float*)inputVector;
136 int16_t* outputVectorPtr = outputVector;
137
138 float min_val = SHRT_MIN;
139 float max_val = SHRT_MAX;
140 float r;
141
142 __m256 vScalar = _mm256_set1_ps(scalar);
143 __m256 inputVal, ret;
144 __m256i intInputVal;
145 __m128i intInputVal1, intInputVal2;
146 __m256 vmin_val = _mm256_set1_ps(min_val);
147 __m256 vmax_val = _mm256_set1_ps(max_val);
148
149 for (; number < eighthPoints; number++) {
150 inputVal = _mm256_loadu_ps(inputVectorPtr);
151 inputVectorPtr += 8;
152
153 // Scale and clip
154 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
155 vmin_val);
156
157 intInputVal = _mm256_cvtps_epi32(ret);
158
159 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
160 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
161
162 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
163
164 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
165 outputVectorPtr += 8;
166 }
167
168 number = eighthPoints * 8;
169 for (; number < num_points; number++) {
170 r = inputVector[number] * scalar;
171 if (r > max_val)
172 r = max_val;
173 else if (r < min_val)
174 r = min_val;
175 outputVector[number] = (int16_t)rintf(r);
176 }
177}
178#endif /* LV_HAVE_AVX */
179
180
181#ifdef LV_HAVE_SSE2
182#include <emmintrin.h>
183
184static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
185 const float* inputVector,
186 const float scalar,
187 unsigned int num_points)
188{
189 unsigned int number = 0;
190
191 const unsigned int eighthPoints = num_points / 8;
192
193 const float* inputVectorPtr = (const float*)inputVector;
194 int16_t* outputVectorPtr = outputVector;
195
196 float min_val = SHRT_MIN;
197 float max_val = SHRT_MAX;
198 float r;
199
200 __m128 vScalar = _mm_set_ps1(scalar);
201 __m128 inputVal1, inputVal2;
202 __m128i intInputVal1, intInputVal2;
203 __m128 ret1, ret2;
204 __m128 vmin_val = _mm_set_ps1(min_val);
205 __m128 vmax_val = _mm_set_ps1(max_val);
206
207 for (; number < eighthPoints; number++) {
208 inputVal1 = _mm_loadu_ps(inputVectorPtr);
209 inputVectorPtr += 4;
210 inputVal2 = _mm_loadu_ps(inputVectorPtr);
211 inputVectorPtr += 4;
212
213 // Scale and clip
214 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
215 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
216
217 intInputVal1 = _mm_cvtps_epi32(ret1);
218 intInputVal2 = _mm_cvtps_epi32(ret2);
219
220 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
221
222 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
223 outputVectorPtr += 8;
224 }
225
226 number = eighthPoints * 8;
227 for (; number < num_points; number++) {
228 r = inputVector[number] * scalar;
229 if (r > max_val)
230 r = max_val;
231 else if (r < min_val)
232 r = min_val;
233 outputVector[number] = (int16_t)rintf(r);
234 }
235}
236#endif /* LV_HAVE_SSE2 */
237
238
239#ifdef LV_HAVE_SSE
240#include <xmmintrin.h>
241
242static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
243 const float* inputVector,
244 const float scalar,
245 unsigned int num_points)
246{
247 unsigned int number = 0;
248
249 const unsigned int quarterPoints = num_points / 4;
250
251 const float* inputVectorPtr = (const float*)inputVector;
252 int16_t* outputVectorPtr = outputVector;
253
254 float min_val = SHRT_MIN;
255 float max_val = SHRT_MAX;
256 float r;
257
258 __m128 vScalar = _mm_set_ps1(scalar);
259 __m128 ret;
260 __m128 vmin_val = _mm_set_ps1(min_val);
261 __m128 vmax_val = _mm_set_ps1(max_val);
262
263 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
264
265 for (; number < quarterPoints; number++) {
266 ret = _mm_loadu_ps(inputVectorPtr);
267 inputVectorPtr += 4;
268
269 // Scale and clip
270 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
271
272 _mm_store_ps(outputFloatBuffer, ret);
273 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
274 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
275 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
276 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
277 }
278
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 r = inputVector[number] * scalar;
282 if (r > max_val)
283 r = max_val;
284 else if (r < min_val)
285 r = min_val;
286 outputVector[number] = (int16_t)rintf(r);
287 }
288}
289#endif /* LV_HAVE_SSE */
290
291
292#ifdef LV_HAVE_GENERIC
293
294static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
295 const float* inputVector,
296 const float scalar,
297 unsigned int num_points)
298{
299 int16_t* outputVectorPtr = outputVector;
300 const float* inputVectorPtr = inputVector;
301 unsigned int number = 0;
302 float min_val = SHRT_MIN;
303 float max_val = SHRT_MAX;
304 float r;
305
306 for (number = 0; number < num_points; number++) {
307 r = *inputVectorPtr++ * scalar;
308 if (r > max_val)
309 r = max_val;
310 else if (r < min_val)
311 r = min_val;
312 *outputVectorPtr++ = (int16_t)rintf(r);
313 }
314}
315#endif /* LV_HAVE_GENERIC */
316
317
318#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
319#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
320#define INCLUDED_volk_32f_s32f_convert_16i_a_H
321
322#include <inttypes.h>
323#include <math.h>
324#include <stdio.h>
325#include <volk/volk_common.h>
326
327#ifdef LV_HAVE_AVX2
328#include <immintrin.h>
329
330static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
331 const float* inputVector,
332 const float scalar,
333 unsigned int num_points)
334{
335 unsigned int number = 0;
336
337 const unsigned int sixteenthPoints = num_points / 16;
338
339 const float* inputVectorPtr = (const float*)inputVector;
340 int16_t* outputVectorPtr = outputVector;
341
342 float min_val = SHRT_MIN;
343 float max_val = SHRT_MAX;
344 float r;
345
346 __m256 vScalar = _mm256_set1_ps(scalar);
347 __m256 inputVal1, inputVal2;
348 __m256i intInputVal1, intInputVal2;
349 __m256 ret1, ret2;
350 __m256 vmin_val = _mm256_set1_ps(min_val);
351 __m256 vmax_val = _mm256_set1_ps(max_val);
352
353 for (; number < sixteenthPoints; number++) {
354 inputVal1 = _mm256_load_ps(inputVectorPtr);
355 inputVectorPtr += 8;
356 inputVal2 = _mm256_load_ps(inputVectorPtr);
357 inputVectorPtr += 8;
358
359 // Scale and clip
360 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
361 vmin_val);
362 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
363 vmin_val);
364
365 intInputVal1 = _mm256_cvtps_epi32(ret1);
366 intInputVal2 = _mm256_cvtps_epi32(ret2);
367
368 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
369 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
370
371 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
372 outputVectorPtr += 16;
373 }
374
375 number = sixteenthPoints * 16;
376 for (; number < num_points; number++) {
377 r = inputVector[number] * scalar;
378 if (r > max_val)
379 r = max_val;
380 else if (r < min_val)
381 r = min_val;
382 outputVector[number] = (int16_t)rintf(r);
383 }
384}
385#endif /* LV_HAVE_AVX2 */
386
387
388#ifdef LV_HAVE_AVX
389#include <immintrin.h>
390
391static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
392 const float* inputVector,
393 const float scalar,
394 unsigned int num_points)
395{
396 unsigned int number = 0;
397
398 const unsigned int eighthPoints = num_points / 8;
399
400 const float* inputVectorPtr = (const float*)inputVector;
401 int16_t* outputVectorPtr = outputVector;
402
403 float min_val = SHRT_MIN;
404 float max_val = SHRT_MAX;
405 float r;
406
407 __m256 vScalar = _mm256_set1_ps(scalar);
408 __m256 inputVal, ret;
409 __m256i intInputVal;
410 __m128i intInputVal1, intInputVal2;
411 __m256 vmin_val = _mm256_set1_ps(min_val);
412 __m256 vmax_val = _mm256_set1_ps(max_val);
413
414 for (; number < eighthPoints; number++) {
415 inputVal = _mm256_load_ps(inputVectorPtr);
416 inputVectorPtr += 8;
417
418 // Scale and clip
419 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
420 vmin_val);
421
422 intInputVal = _mm256_cvtps_epi32(ret);
423
424 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
425 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
426
427 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
428
429 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
430 outputVectorPtr += 8;
431 }
432
433 number = eighthPoints * 8;
434 for (; number < num_points; number++) {
435 r = inputVector[number] * scalar;
436 if (r > max_val)
437 r = max_val;
438 else if (r < min_val)
439 r = min_val;
440 outputVector[number] = (int16_t)rintf(r);
441 }
442}
443#endif /* LV_HAVE_AVX */
444
445#ifdef LV_HAVE_SSE2
446#include <emmintrin.h>
447
448static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
449 const float* inputVector,
450 const float scalar,
451 unsigned int num_points)
452{
453 unsigned int number = 0;
454
455 const unsigned int eighthPoints = num_points / 8;
456
457 const float* inputVectorPtr = (const float*)inputVector;
458 int16_t* outputVectorPtr = outputVector;
459
460 float min_val = SHRT_MIN;
461 float max_val = SHRT_MAX;
462 float r;
463
464 __m128 vScalar = _mm_set_ps1(scalar);
465 __m128 inputVal1, inputVal2;
466 __m128i intInputVal1, intInputVal2;
467 __m128 ret1, ret2;
468 __m128 vmin_val = _mm_set_ps1(min_val);
469 __m128 vmax_val = _mm_set_ps1(max_val);
470
471 for (; number < eighthPoints; number++) {
472 inputVal1 = _mm_load_ps(inputVectorPtr);
473 inputVectorPtr += 4;
474 inputVal2 = _mm_load_ps(inputVectorPtr);
475 inputVectorPtr += 4;
476
477 // Scale and clip
478 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
479 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
480
481 intInputVal1 = _mm_cvtps_epi32(ret1);
482 intInputVal2 = _mm_cvtps_epi32(ret2);
483
484 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
485
486 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
487 outputVectorPtr += 8;
488 }
489
490 number = eighthPoints * 8;
491 for (; number < num_points; number++) {
492 r = inputVector[number] * scalar;
493 if (r > max_val)
494 r = max_val;
495 else if (r < min_val)
496 r = min_val;
497 outputVector[number] = (int16_t)rintf(r);
498 }
499}
500#endif /* LV_HAVE_SSE2 */
501
502
503#ifdef LV_HAVE_SSE
504#include <xmmintrin.h>
505
506static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
507 const float* inputVector,
508 const float scalar,
509 unsigned int num_points)
510{
511 unsigned int number = 0;
512
513 const unsigned int quarterPoints = num_points / 4;
514
515 const float* inputVectorPtr = (const float*)inputVector;
516 int16_t* outputVectorPtr = outputVector;
517
518 float min_val = SHRT_MIN;
519 float max_val = SHRT_MAX;
520 float r;
521
522 __m128 vScalar = _mm_set_ps1(scalar);
523 __m128 ret;
524 __m128 vmin_val = _mm_set_ps1(min_val);
525 __m128 vmax_val = _mm_set_ps1(max_val);
526
527 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
528
529 for (; number < quarterPoints; number++) {
530 ret = _mm_load_ps(inputVectorPtr);
531 inputVectorPtr += 4;
532
533 // Scale and clip
534 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
535
536 _mm_store_ps(outputFloatBuffer, ret);
537 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
538 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
539 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
540 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
541 }
542
543 number = quarterPoints * 4;
544 for (; number < num_points; number++) {
545 r = inputVector[number] * scalar;
546 if (r > max_val)
547 r = max_val;
548 else if (r < min_val)
549 r = min_val;
550 outputVector[number] = (int16_t)rintf(r);
551 }
552}
553#endif /* LV_HAVE_SSE */
554
555
556#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */