Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_x2_convert_8u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2023 Daniel Estevez <daniel@destevez.net>
4 * Copyright 2012, 2014 Free Software Foundation, Inc.
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
60#ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
61#define INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
62
63#include <inttypes.h>
64
65static inline void volk_32f_s32f_x2_convert_8u_single(uint8_t* out, const float in)
66{
67 const float min_val = 0.0f;
68 const float max_val = UINT8_MAX;
69 if (in > max_val) {
70 *out = (uint8_t)(max_val);
71 } else if (in < min_val) {
72 *out = (uint8_t)(min_val);
73 } else {
74 *out = (uint8_t)(rintf(in));
75 }
76}
77
78
79#ifdef LV_HAVE_GENERIC
80
/*
 * Portable scalar kernel.
 *
 * For each of the num_points input floats, computes in * scale + bias and
 * hands the result to volk_32f_s32f_x2_convert_8u_single(), which clamps it
 * to the uint8_t range, rounds it and stores it in the matching output slot.
 */
static inline void volk_32f_s32f_x2_convert_8u_generic(uint8_t* outputVector,
                                                       const float* inputVector,
                                                       const float scale,
                                                       const float bias,
                                                       unsigned int num_points)
{
    for (unsigned int i = 0; i != num_points; ++i) {
        const float scaled = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(outputVector + i, scaled);
    }
}
94
95#endif /* LV_HAVE_GENERIC */
96
97
98#if LV_HAVE_AVX2 && LV_HAVE_FMA
99#include <immintrin.h>
100
/*
 * AVX2 + FMA kernel using unaligned loads and stores.
 *
 * Handles 32 samples per iteration: fused multiply-add (in * scale + bias),
 * clamp to [0, UINT8_MAX], convert to 32-bit integers, then narrow to bytes.
 * The remaining (num_points % 32) samples go through the scalar helper.
 */
static inline void volk_32f_s32f_x2_convert_8u_u_avx2_fma(uint8_t* outputVector,
                                                          const float* inputVector,
                                                          const float scale,
                                                          const float bias,
                                                          unsigned int num_points)
{
    const unsigned int simdPoints = (num_points / 32) * 32;

    const __m256 lowerBound = _mm256_set1_ps(0.0f);
    const __m256 upperBound = _mm256_set1_ps((float)UINT8_MAX);
    const __m256 gain = _mm256_set1_ps(scale);
    const __m256 offset = _mm256_set1_ps(bias);

    for (unsigned int base = 0; base < simdPoints; base += 32) {
        const float* src = inputVector + base;

        /* in * scale + bias for four groups of eight floats */
        __m256 f0 = _mm256_fmadd_ps(_mm256_loadu_ps(src), gain, offset);
        __m256 f1 = _mm256_fmadd_ps(_mm256_loadu_ps(src + 8), gain, offset);
        __m256 f2 = _mm256_fmadd_ps(_mm256_loadu_ps(src + 16), gain, offset);
        __m256 f3 = _mm256_fmadd_ps(_mm256_loadu_ps(src + 24), gain, offset);

        /* clamp: upper bound first, then lower bound */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upperBound), lowerBound);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upperBound), lowerBound);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upperBound), lowerBound);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upperBound), lowerBound);

        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /* The 256-bit pack instructions operate per 128-bit lane, so after
         * each pack the 64-bit quarters are reordered to 0,2,1,3 (0xD8) to
         * put the samples back in sequence. */
        const __m256i w01 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i0, i1), 0xD8);
        const __m256i w23 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i2, i3), 0xD8);
        const __m256i bytes =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(w01, w23), 0xD8);

        _mm256_storeu_si256((__m256i*)(outputVector + base), bytes);
    }

    /* scalar tail */
    for (unsigned int i = simdPoints; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
161
162#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
163
164
165#ifdef LV_HAVE_AVX2
166#include <immintrin.h>
167
/*
 * AVX2 kernel (separate multiply and add) using unaligned loads and stores.
 *
 * Handles 32 samples per iteration: in * scale + bias, clamp to
 * [0, UINT8_MAX], convert to 32-bit integers, then narrow to bytes.
 * The remaining (num_points % 32) samples go through the scalar helper.
 */
static inline void volk_32f_s32f_x2_convert_8u_u_avx2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int simdPoints = (num_points / 32) * 32;

    const __m256 lowerBound = _mm256_set1_ps(0.0f);
    const __m256 upperBound = _mm256_set1_ps((float)UINT8_MAX);
    const __m256 gain = _mm256_set1_ps(scale);
    const __m256 offset = _mm256_set1_ps(bias);

    for (unsigned int base = 0; base < simdPoints; base += 32) {
        const float* src = inputVector + base;

        /* in * scale + bias for four groups of eight floats */
        __m256 f0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src), gain), offset);
        __m256 f1 =
            _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 8), gain), offset);
        __m256 f2 =
            _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 16), gain), offset);
        __m256 f3 =
            _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + 24), gain), offset);

        /* clamp: upper bound first, then lower bound */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upperBound), lowerBound);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upperBound), lowerBound);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upperBound), lowerBound);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upperBound), lowerBound);

        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /* The 256-bit pack instructions operate per 128-bit lane, so after
         * each pack the 64-bit quarters are reordered to 0,2,1,3 (0xD8) to
         * put the samples back in sequence. */
        const __m256i w01 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i0, i1), 0xD8);
        const __m256i w23 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i2, i3), 0xD8);
        const __m256i bytes =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(w01, w23), 0xD8);

        _mm256_storeu_si256((__m256i*)(outputVector + base), bytes);
    }

    /* scalar tail */
    for (unsigned int i = simdPoints; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
236
237#endif /* LV_HAVE_AVX2 */
238
239
240#ifdef LV_HAVE_SSE2
241#include <emmintrin.h>
242
/*
 * SSE2 kernel using unaligned loads and stores.
 *
 * Handles 16 samples per iteration: in * scale + bias, clamp to
 * [0, UINT8_MAX], convert to 32-bit integers, pack to 16-bit and then to
 * bytes. The remaining (num_points % 16) samples go through the scalar helper.
 */
static inline void volk_32f_s32f_x2_convert_8u_u_sse2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int simdPoints = (num_points / 16) * 16;

    const __m128 lowerBound = _mm_set_ps1(0.0f);
    const __m128 upperBound = _mm_set_ps1((float)UINT8_MAX);
    const __m128 gain = _mm_set_ps1(scale);
    const __m128 offset = _mm_set_ps1(bias);

    for (unsigned int base = 0; base < simdPoints; base += 16) {
        const float* src = inputVector + base;

        /* in * scale + bias for four groups of four floats */
        __m128 f0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src), gain), offset);
        __m128 f1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 4), gain), offset);
        __m128 f2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 8), gain), offset);
        __m128 f3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 12), gain), offset);

        /* clamp: upper bound first, then lower bound */
        f0 = _mm_max_ps(_mm_min_ps(f0, upperBound), lowerBound);
        f1 = _mm_max_ps(_mm_min_ps(f1, upperBound), lowerBound);
        f2 = _mm_max_ps(_mm_min_ps(f2, upperBound), lowerBound);
        f3 = _mm_max_ps(_mm_min_ps(f3, upperBound), lowerBound);

        /* 32-bit -> 16-bit -> 8-bit narrowing */
        const __m128i w01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        const __m128i w23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));
        const __m128i bytes = _mm_packus_epi16(w01, w23);

        _mm_storeu_si128((__m128i*)(outputVector + base), bytes);
    }

    /* scalar tail */
    for (unsigned int i = simdPoints; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
304
305#endif /* LV_HAVE_SSE2 */
306
307
308#ifdef LV_HAVE_SSE
309#include <xmmintrin.h>
310
311static inline void volk_32f_s32f_x2_convert_8u_u_sse(uint8_t* outputVector,
312 const float* inputVector,
313 const float scale,
314 const float bias,
315 unsigned int num_points)
316{
317 const unsigned int quarterPoints = num_points / 4;
318
319 const float* inputVectorPtr = (const float*)inputVector;
320 uint8_t* outputVectorPtr = outputVector;
321
322 const float min_val = 0.0f;
323 const float max_val = UINT8_MAX;
324 const __m128 vmin_val = _mm_set_ps1(min_val);
325 const __m128 vmax_val = _mm_set_ps1(max_val);
326
327 const __m128 vScale = _mm_set_ps1(scale);
328 const __m128 vBias = _mm_set_ps1(bias);
329
330 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
331
332 for (unsigned int number = 0; number < quarterPoints; number++) {
333 __m128 ret = _mm_loadu_ps(inputVectorPtr);
334 inputVectorPtr += 4;
335
336 ret = _mm_max_ps(_mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScale), vBias), vmax_val),
337 vmin_val);
338
339 _mm_store_ps(outputFloatBuffer, ret);
340 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
341 *outputVectorPtr++ = (uint8_t)(rintf(outputFloatBuffer[inner_loop]));
342 }
343 }
344
345 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
346 const float r = inputVector[number] * scale + bias;
347 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
348 }
349}
350
351#endif /* LV_HAVE_SSE */
352
353
354#endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_u_H */
355#ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
356#define INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
357
358#include <inttypes.h>
359#include <volk/volk_common.h>
360
361#if LV_HAVE_AVX2 && LV_HAVE_FMA
362#include <immintrin.h>
363
/*
 * AVX2 + FMA kernel using aligned loads and stores (_mm256_load_ps /
 * _mm256_store_si256), so inputVector and outputVector are expected to be
 * 32-byte aligned.
 *
 * Handles 32 samples per iteration: fused multiply-add (in * scale + bias),
 * clamp to [0, UINT8_MAX], convert to 32-bit integers, then narrow to bytes.
 * The remaining (num_points % 32) samples go through the scalar helper.
 */
static inline void volk_32f_s32f_x2_convert_8u_a_avx2_fma(uint8_t* outputVector,
                                                          const float* inputVector,
                                                          const float scale,
                                                          const float bias,
                                                          unsigned int num_points)
{
    const unsigned int simdPoints = (num_points / 32) * 32;

    const __m256 lowerBound = _mm256_set1_ps(0.0f);
    const __m256 upperBound = _mm256_set1_ps((float)UINT8_MAX);
    const __m256 gain = _mm256_set1_ps(scale);
    const __m256 offset = _mm256_set1_ps(bias);

    for (unsigned int base = 0; base < simdPoints; base += 32) {
        const float* src = inputVector + base;

        /* in * scale + bias for four groups of eight floats */
        __m256 f0 = _mm256_fmadd_ps(_mm256_load_ps(src), gain, offset);
        __m256 f1 = _mm256_fmadd_ps(_mm256_load_ps(src + 8), gain, offset);
        __m256 f2 = _mm256_fmadd_ps(_mm256_load_ps(src + 16), gain, offset);
        __m256 f3 = _mm256_fmadd_ps(_mm256_load_ps(src + 24), gain, offset);

        /* clamp: upper bound first, then lower bound */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upperBound), lowerBound);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upperBound), lowerBound);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upperBound), lowerBound);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upperBound), lowerBound);

        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /* The 256-bit pack instructions operate per 128-bit lane, so after
         * each pack the 64-bit quarters are reordered to 0,2,1,3 (0xD8) to
         * put the samples back in sequence. */
        const __m256i w01 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i0, i1), 0xD8);
        const __m256i w23 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i2, i3), 0xD8);
        const __m256i bytes =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(w01, w23), 0xD8);

        _mm256_store_si256((__m256i*)(outputVector + base), bytes);
    }

    /* scalar tail */
    for (unsigned int i = simdPoints; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
424
425#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
426
427
428#ifdef LV_HAVE_AVX2
429#include <immintrin.h>
430
/*
 * AVX2 kernel (separate multiply and add) using aligned loads and stores
 * (_mm256_load_ps / _mm256_store_si256), so inputVector and outputVector are
 * expected to be 32-byte aligned.
 *
 * Handles 32 samples per iteration: in * scale + bias, clamp to
 * [0, UINT8_MAX], convert to 32-bit integers, then narrow to bytes.
 * The remaining (num_points % 32) samples go through the scalar helper.
 */
static inline void volk_32f_s32f_x2_convert_8u_a_avx2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int simdPoints = (num_points / 32) * 32;

    const __m256 lowerBound = _mm256_set1_ps(0.0f);
    const __m256 upperBound = _mm256_set1_ps((float)UINT8_MAX);
    const __m256 gain = _mm256_set1_ps(scale);
    const __m256 offset = _mm256_set1_ps(bias);

    for (unsigned int base = 0; base < simdPoints; base += 32) {
        const float* src = inputVector + base;

        /* in * scale + bias for four groups of eight floats */
        __m256 f0 = _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(src), gain), offset);
        __m256 f1 =
            _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(src + 8), gain), offset);
        __m256 f2 =
            _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(src + 16), gain), offset);
        __m256 f3 =
            _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(src + 24), gain), offset);

        /* clamp: upper bound first, then lower bound */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upperBound), lowerBound);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upperBound), lowerBound);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upperBound), lowerBound);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upperBound), lowerBound);

        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /* The 256-bit pack instructions operate per 128-bit lane, so after
         * each pack the 64-bit quarters are reordered to 0,2,1,3 (0xD8) to
         * put the samples back in sequence. */
        const __m256i w01 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i0, i1), 0xD8);
        const __m256i w23 = _mm256_permute4x64_epi64(_mm256_packs_epi32(i2, i3), 0xD8);
        const __m256i bytes =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(w01, w23), 0xD8);

        _mm256_store_si256((__m256i*)(outputVector + base), bytes);
    }

    /* scalar tail */
    for (unsigned int i = simdPoints; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
499
500#endif /* LV_HAVE_AVX2 */
501
502
503#ifdef LV_HAVE_SSE2
504#include <emmintrin.h>
505
/*
 * SSE2 kernel using aligned loads and stores (_mm_load_ps / _mm_store_si128),
 * so inputVector and outputVector are expected to be 16-byte aligned.
 *
 * Handles 16 samples per iteration: in * scale + bias, clamp to
 * [0, UINT8_MAX], convert to 32-bit integers, pack to 16-bit and then to
 * bytes. The remaining (num_points % 16) samples go through the scalar helper.
 */
static inline void volk_32f_s32f_x2_convert_8u_a_sse2(uint8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scale,
                                                      const float bias,
                                                      unsigned int num_points)
{
    const unsigned int simdPoints = (num_points / 16) * 16;

    const __m128 lowerBound = _mm_set_ps1(0.0f);
    const __m128 upperBound = _mm_set_ps1((float)UINT8_MAX);
    const __m128 gain = _mm_set_ps1(scale);
    const __m128 offset = _mm_set_ps1(bias);

    for (unsigned int base = 0; base < simdPoints; base += 16) {
        const float* src = inputVector + base;

        /* in * scale + bias for four groups of four floats */
        __m128 f0 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src), gain), offset);
        __m128 f1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src + 4), gain), offset);
        __m128 f2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src + 8), gain), offset);
        __m128 f3 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(src + 12), gain), offset);

        /* clamp: upper bound first, then lower bound */
        f0 = _mm_max_ps(_mm_min_ps(f0, upperBound), lowerBound);
        f1 = _mm_max_ps(_mm_min_ps(f1, upperBound), lowerBound);
        f2 = _mm_max_ps(_mm_min_ps(f2, upperBound), lowerBound);
        f3 = _mm_max_ps(_mm_min_ps(f3, upperBound), lowerBound);

        /* 32-bit -> 16-bit -> 8-bit narrowing */
        const __m128i w01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        const __m128i w23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));
        const __m128i bytes = _mm_packus_epi16(w01, w23);

        _mm_store_si128((__m128i*)(outputVector + base), bytes);
    }

    /* scalar tail */
    for (unsigned int i = simdPoints; i < num_points; i++) {
        const float r = inputVector[i] * scale + bias;
        volk_32f_s32f_x2_convert_8u_single(&outputVector[i], r);
    }
}
567#endif /* LV_HAVE_SSE2 */
568
569
570#ifdef LV_HAVE_SSE
571#include <xmmintrin.h>
572
573static inline void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t* outputVector,
574 const float* inputVector,
575 const float scale,
576 const float bias,
577 unsigned int num_points)
578{
579 const unsigned int quarterPoints = num_points / 4;
580
581 const float* inputVectorPtr = (const float*)inputVector;
582 uint8_t* outputVectorPtr = outputVector;
583
584 const float min_val = 0.0f;
585 const float max_val = UINT8_MAX;
586 const __m128 vmin_val = _mm_set_ps1(min_val);
587 const __m128 vmax_val = _mm_set_ps1(max_val);
588
589 const __m128 vScalar = _mm_set_ps1(scale);
590 const __m128 vBias = _mm_set_ps1(bias);
591
592 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
593
594 for (unsigned int number = 0; number < quarterPoints; number++) {
595 __m128 ret = _mm_load_ps(inputVectorPtr);
596 inputVectorPtr += 4;
597
598 ret = _mm_max_ps(
599 _mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScalar), vBias), vmax_val), vmin_val);
600
601 _mm_store_ps(outputFloatBuffer, ret);
602 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
603 *outputVectorPtr++ = (uint8_t)(rintf(outputFloatBuffer[inner_loop]));
604 }
605 }
606
607 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
608 const float r = inputVector[number] * scale + bias;
609 volk_32f_s32f_x2_convert_8u_single(&outputVector[number], r);
610 }
611}
612
613#endif /* LV_HAVE_SSE */
614
615
616#endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_a_H */