33#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34#define INCLUDED_volk_32fc_convert_16ic_a_H
/* AVX2 kernel using aligned loads/stores (_mm256_load_ps / _mm256_store_si256):
 * clamps floats to [SHRT_MIN, SHRT_MAX], converts them to integers and packs
 * them into int16_t outputs. Each vector iteration produces 16 int16_t values
 * (avx_iters = num_points / 8, output stride 16).
 * NOTE(review): this view of the function is fragmentary -- braces, the
 * declaration of inputVector, and the declarations of i, aux, ret1 and ret2 are
 * not visible here; confirm against the full file. */
43static inline void volk_32fc_convert_16ic_a_avx2(
lv_16sc_t* outputVector,
45 unsigned int num_points)
47 const unsigned int avx_iters = num_points / 8;
/* Both buffers are walked as flat scalar arrays (float in, int16_t out). */
49 float* inputVectorPtr = (
float*)inputVector;
50 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Clamp bounds: the int16_t range expressed as floats. */
53 const float min_val = (float)SHRT_MIN;
54 const float max_val = (float)SHRT_MAX;
56 __m256 inputVal1, inputVal2;
57 __m256i intInputVal1, intInputVal2;
59 const __m256 vmin_val = _mm256_set1_ps(min_val);
60 const __m256 vmax_val = _mm256_set1_ps(max_val);
63 for (i = 0; i < avx_iters; i++) {
/* NOTE(review): no advance of inputVectorPtr is visible between or after these
 * two loads in this view; confirm the pointer moves by 8 floats after each
 * load, otherwise both registers hold the same data. */
64 inputVal1 = _mm256_load_ps((
float*)inputVectorPtr);
66 inputVal2 = _mm256_load_ps((
float*)inputVectorPtr);
/* min-then-max clamps every lane into [min_val, max_val]. */
71 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
/* float -> int32 conversion; rounding follows the current MXCSR rounding mode. */
74 intInputVal1 = _mm256_cvtps_epi32(ret1);
75 intInputVal2 = _mm256_cvtps_epi32(ret2);
/* packs_epi32 narrows to int16 with signed saturation, but works per 128-bit
 * lane, so the 64-bit quarters come out ordered a.lo, b.lo, a.hi, b.hi.
 * The 0xd8 permute (quarter order 0, 2, 1, 3) restores a.lo, a.hi, b.lo, b.hi. */
77 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
80 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81 outputVectorPtr += 16;
/* Scalar tail: covers the floats left over after the vector loop, from index
 * avx_iters * 16 up to num_points * 2. */
84 for (i = avx_iters * 16; i < num_points * 2; i++) {
85 aux = *inputVectorPtr++;
/* NOTE(review): the clamp branches surrounding this else-if are not visible
 * in this view. */
88 else if (aux < min_val)
/* rintf rounds to an integral value in the current rounding mode before the
 * narrowing cast. */
90 *outputVectorPtr++ = (int16_t)rintf(aux);
/* NOTE(review): the head of this definition (name, return type, earlier
 * parameters) is not visible in this view; sse_iters and the __m128 locals
 * suggest an SSE variant of the conversion kernel -- confirm against the full
 * file. */
100 unsigned int num_points)
/* Each vector iteration accounts for 4 points (output stride 8 below). */
102 const unsigned int sse_iters = num_points / 4;
/* Both buffers are walked as flat scalar arrays (float in, int16_t out). */
104 float* inputVectorPtr = (
float*)inputVector;
105 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Clamp bounds: the int16_t range expressed as floats. */
108 const float min_val = (float)SHRT_MIN;
109 const float max_val = (float)SHRT_MAX;
111 __m128 inputVal1, inputVal2;
112 __m128i intInputVal1, intInputVal2;
/* NOTE(review): the body of the vector loop is not visible in this view apart
 * from the output pointer advance. */
118 for (i = 0; i < sse_iters; i++) {
135 outputVectorPtr += 8;
/* Scalar tail: covers the floats left over after the vector loop, from index
 * sse_iters * 8 up to num_points * 2. */
138 for (i = sse_iters * 8; i < num_points * 2; i++) {
139 aux = *inputVectorPtr++;
/* NOTE(review): the clamp branches surrounding this else-if are not visible
 * in this view. */
142 else if (aux < min_val)
/* rintf rounds to an integral value in the current rounding mode before the
 * narrowing cast. */
144 *outputVectorPtr++ = (int16_t)rintf(aux);
/*
 * VCVTRQ_S32_F32(result, value)
 *
 * Converts the four float lanes of `value` into signed 32-bit integers stored
 * in the matching lanes of `result`, issuing one VCVTR.S32.F32 instruction per
 * lane through __VOLK_ASM. Per the Arm ISA reference, VCVTR rounds using the
 * rounding mode currently selected in FPSCR.
 *
 * result: lvalue indexable with [0]..[3]; every lane is overwritten.
 * value:  expression indexable with [0]..[3]; every lane is read.
 *
 * Both arguments are expanded four times, so they must be free of side
 * effects. The expansion is wrapped in do { ... } while (0) so the macro acts
 * as a single statement: it is safe as the body of an unbraced if/else and
 * must be followed by a semicolon at the call site.
 */
#define VCVTRQ_S32_F32(result, value)                                               \
    do {                                                                            \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[0]) : "t"((value)[0]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[1]) : "t"((value)[1]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[2]) : "t"((value)[2]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[3]) : "t"((value)[3]) :); \
    } while (0)
/* NEON kernel: clamps floats to [SHRT_MIN, SHRT_MAX], converts them to int32
 * with the VCVTRQ_S32_F32 macro, then narrows to int16_t. Each vector
 * iteration produces 8 int16_t values (neon_iters = num_points / 4, output
 * stride 8).
 * NOTE(review): this view of the function is fragmentary -- braces, the
 * declaration of inputVector, and the declarations of i, aux and res are not
 * visible here; confirm against the full file. */
159static inline void volk_32fc_convert_16ic_neon(
lv_16sc_t* outputVector,
161 unsigned int num_points)
164 const unsigned int neon_iters = num_points / 4;
/* Both buffers are walked as flat scalar arrays (float32_t in, int16_t out). */
166 float32_t* inputVectorPtr = (float32_t*)inputVector;
167 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Clamp bounds: the int16_t range expressed as floats, then broadcast to
 * all four lanes. */
169 const float min_val_f = (float)SHRT_MIN;
170 const float max_val_f = (float)SHRT_MAX;
174 const float32x4_t min_val = vmovq_n_f32(min_val_f);
175 const float32x4_t max_val = vmovq_n_f32(max_val_f);
176 float32x4_t ret1, ret2, a, b;
/* Conversion targets are zero-initialized; the macro below overwrites every
 * lane. */
178 int32x4_t toint_a = { 0, 0, 0, 0 };
179 int32x4_t toint_b = { 0, 0, 0, 0 };
180 int16x4_t intInputVal1, intInputVal2;
183 for (i = 0; i < neon_iters; i++) {
/* NOTE(review): no advance of inputVectorPtr is visible between or after these
 * two loads in this view; confirm the pointer moves by 4 floats after each
 * load, otherwise a and b hold the same data. */
184 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
186 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* min-then-max clamps every lane into [min_val, max_val]. */
190 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
191 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
/* Lane-by-lane float -> int32 conversion through the VCVTR.S32.F32 asm macro. */
194 VCVTRQ_S32_F32(toint_a, ret1);
195 VCVTRQ_S32_F32(toint_b, ret2);
/* Saturating narrow from int32 to int16, four lanes each. */
197 intInputVal1 = vqmovn_s32(toint_a);
198 intInputVal2 = vqmovn_s32(toint_b);
/* Join the two 4-lane halves into one 8-lane vector and store it. */
200 res = vcombine_s16(intInputVal1, intInputVal2);
201 vst1q_s16((int16_t*)outputVectorPtr, res);
202 outputVectorPtr += 8;
/* Scalar tail: covers the floats left over after the vector loop, from index
 * neon_iters * 8 up to num_points * 2. */
205 for (i = neon_iters * 8; i < num_points * 2; i++) {
206 aux = *inputVectorPtr++;
/* NOTE(review): the clamp branches surrounding this else-if are not visible
 * in this view. */
209 else if (aux < min_val_f)
/* rintf rounds to an integral value in the current rounding mode before the
 * narrowing cast. */
211 *outputVectorPtr++ = (int16_t)rintf(aux);
/* NEON kernel that converts with intrinsics only (vrndiq_f32 + vcvtq_s32_f32)
 * instead of the inline-asm macro: clamps floats to [SHRT_MIN, SHRT_MAX],
 * rounds, converts to int32 and narrows to int16_t. Each vector iteration
 * produces 8 int16_t values (neon_iters = num_points / 4, output stride 8).
 * NOTE(review): this view of the function is fragmentary -- braces, the
 * declaration of inputVector, and the declarations of i, aux and res are not
 * visible here; confirm against the full file. */
221static inline void volk_32fc_convert_16ic_neonv8(
lv_16sc_t* outputVector,
223 unsigned int num_points)
225 const unsigned int neon_iters = num_points / 4;
/* Both buffers are walked as flat scalar arrays (float32_t in, int16_t out). */
227 float32_t* inputVectorPtr = (float32_t*)inputVector;
228 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Clamp bounds: the int16_t range expressed as floats, then broadcast to
 * all four lanes. */
230 const float min_val_f = (float)SHRT_MIN;
231 const float max_val_f = (float)SHRT_MAX;
235 const float32x4_t min_val = vmovq_n_f32(min_val_f);
236 const float32x4_t max_val = vmovq_n_f32(max_val_f);
237 float32x4_t ret1, ret2, a, b;
239 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
240 int16x4_t intInputVal1, intInputVal2;
243 for (i = 0; i < neon_iters; i++) {
/* NOTE(review): no advance of inputVectorPtr is visible between or after these
 * two loads in this view; confirm the pointer moves by 4 floats after each
 * load, otherwise a and b hold the same data. */
244 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
246 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* min-then-max clamps every lane into [min_val, max_val]. */
250 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
251 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
/* vrndiq_f32 rounds to an integral float using the current rounding mode, so
 * the following conversion to int32 receives an already-integral value. */
254 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
255 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
/* Saturating narrow from int32 to int16, four lanes each. */
257 intInputVal1 = vqmovn_s32(toint_a);
258 intInputVal2 = vqmovn_s32(toint_b);
/* Join the two 4-lane halves into one 8-lane vector and store it. */
260 res = vcombine_s16(intInputVal1, intInputVal2);
261 vst1q_s16((int16_t*)outputVectorPtr, res);
262 outputVectorPtr += 8;
/* Scalar tail: covers the floats left over after the vector loop, from index
 * neon_iters * 8 up to num_points * 2. */
265 for (i = neon_iters * 8; i < num_points * 2; i++) {
266 aux = *inputVectorPtr++;
/* NOTE(review): the clamp branches surrounding this else-if are not visible
 * in this view. */
269 else if (aux < min_val_f)
/* rintf rounds to an integral value in the current rounding mode before the
 * narrowing cast. */
271 *outputVectorPtr++ = (int16_t)rintf(aux);
277#ifdef LV_HAVE_GENERIC
/* NOTE(review): the head of this definition (name, return type, earlier
 * parameters) is not visible in this view; it sits under LV_HAVE_GENERIC and
 * contains only a scalar loop, so it is presumably the portable reference
 * implementation -- confirm against the full file. */
281 unsigned int num_points)
/* Both buffers are walked as flat scalar arrays (float in, int16_t out). */
283 float* inputVectorPtr = (
float*)inputVector;
284 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Clamp bounds: the int16_t range expressed as floats. */
285 const float min_val = (float)SHRT_MIN;
286 const float max_val = (float)SHRT_MAX;
/* One pass over all num_points * 2 floats. */
289 for (i = 0; i < num_points * 2; i++) {
290 aux = *inputVectorPtr++;
/* NOTE(review): the clamp branches surrounding this else-if are not visible
 * in this view. */
293 else if (aux < min_val)
/* rintf rounds to an integral value in the current rounding mode before the
 * narrowing cast. */
295 *outputVectorPtr++ = (int16_t)rintf(aux);
302#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
303#define INCLUDED_volk_32fc_convert_16ic_u_H
311#include <immintrin.h>
/* AVX2 kernel using unaligned loads/stores (_mm256_loadu_ps /
 * _mm256_storeu_si256): clamps floats to [SHRT_MIN, SHRT_MAX], converts them to
 * integers and packs them into int16_t outputs. Each vector iteration produces
 * 16 int16_t values (avx_iters = num_points / 8, output stride 16).
 * NOTE(review): this view of the function is fragmentary -- braces, the
 * declaration of inputVector, and the declarations of i, aux, ret1 and ret2 are
 * not visible here; confirm against the full file. */
313static inline void volk_32fc_convert_16ic_u_avx2(
lv_16sc_t* outputVector,
315 unsigned int num_points)
317 const unsigned int avx_iters = num_points / 8;
/* Both buffers are walked as flat scalar arrays (float in, int16_t out). */
319 float* inputVectorPtr = (
float*)inputVector;
320 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Clamp bounds: the int16_t range expressed as floats. */
323 const float min_val = (float)SHRT_MIN;
324 const float max_val = (float)SHRT_MAX;
326 __m256 inputVal1, inputVal2;
327 __m256i intInputVal1, intInputVal2;
329 const __m256 vmin_val = _mm256_set1_ps(min_val);
330 const __m256 vmax_val = _mm256_set1_ps(max_val);
333 for (i = 0; i < avx_iters; i++) {
/* NOTE(review): no advance of inputVectorPtr is visible between or after these
 * two loads in this view; confirm the pointer moves by 8 floats after each
 * load, otherwise both registers hold the same data. */
334 inputVal1 = _mm256_loadu_ps((
float*)inputVectorPtr);
336 inputVal2 = _mm256_loadu_ps((
float*)inputVectorPtr);
/* min-then-max clamps every lane into [min_val, max_val]. */
341 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
342 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
/* float -> int32 conversion; rounding follows the current MXCSR rounding mode. */
344 intInputVal1 = _mm256_cvtps_epi32(ret1);
345 intInputVal2 = _mm256_cvtps_epi32(ret2);
/* packs_epi32 narrows to int16 with signed saturation, but works per 128-bit
 * lane, so the 64-bit quarters come out ordered a.lo, b.lo, a.hi, b.hi.
 * The 0xd8 permute (quarter order 0, 2, 1, 3) restores a.lo, a.hi, b.lo, b.hi. */
347 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
348 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
350 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
351 outputVectorPtr += 16;
/* Scalar tail: covers the floats left over after the vector loop, from index
 * avx_iters * 16 up to num_points * 2. */
354 for (i = avx_iters * 16; i < num_points * 2; i++) {
355 aux = *inputVectorPtr++;
/* NOTE(review): the clamp branches surrounding this else-if are not visible
 * in this view. */
358 else if (aux < min_val)
/* rintf rounds to an integral value in the current rounding mode before the
 * narrowing cast. */
360 *outputVectorPtr++ = (int16_t)rintf(aux);
367#include <emmintrin.h>
/* NOTE(review): the head of this definition (name, return type, earlier
 * parameters) is not visible in this view; sse_iters and the __m128 locals
 * suggest an SSE variant of the conversion kernel -- confirm against the full
 * file. */
371 unsigned int num_points)
/* Each vector iteration accounts for 4 points (output stride 8 below). */
373 const unsigned int sse_iters = num_points / 4;
/* Both buffers are walked as flat scalar arrays (float in, int16_t out). */
375 float* inputVectorPtr = (
float*)inputVector;
376 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Clamp bounds: the int16_t range expressed as floats. */
379 const float min_val = (float)SHRT_MIN;
380 const float max_val = (float)SHRT_MAX;
382 __m128 inputVal1, inputVal2;
383 __m128i intInputVal1, intInputVal2;
/* NOTE(review): the body of the vector loop is not visible in this view apart
 * from the output pointer advance. */
389 for (i = 0; i < sse_iters; i++) {
406 outputVectorPtr += 8;
/* Scalar tail: covers the floats left over after the vector loop, from index
 * sse_iters * 8 up to num_points * 2. */
409 for (i = sse_iters * 8; i < num_points * 2; i++) {
410 aux = *inputVectorPtr++;
/* NOTE(review): the clamp branches surrounding this else-if are not visible
 * in this view. */
413 else if (aux < min_val)
/* rintf rounds to an integral value in the current rounding mode before the
 * narrowing cast. */
415 *outputVectorPtr++ = (int16_t)rintf(aux);