41#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42#define INCLUDED_volk_8i_s32f_convert_32f_u_H
/*
 * AVX2 kernel: converts signed 8-bit samples to float and multiplies each
 * result by 1/scalar. The vector stores use the unaligned store intrinsic.
 *
 * outputVector - destination for the float results
 * inputVector  - source of int8_t samples
 * num_points   - number of samples to convert
 *
 * NOTE(review): this excerpt does not show the declaration of `scalar`, the
 * declarations of inputVal128/interimVal/ret, the load into inputVal128, or
 * any advance of inputVectorPtr/outputVectorPtr inside the vector loop.
 * Confirm against the full file before changing this function.
 */
50static inline void volk_8i_s32f_convert_32f_u_avx2(
float* outputVector,
51 const int8_t* inputVector,
53 unsigned int num_points)
55 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
56 const unsigned int sixteenthPoints = num_points / 16;
58 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loops multiply instead of divide. */
59 const float iScalar = 1.0 / scalar;
/* Same reciprocal replicated into all 8 float lanes. */
60 __m256 invScalar = _mm256_set1_ps(iScalar);
61 const int8_t* inputVectorPtr = inputVector;
66 for (; number < sixteenthPoints; number++) {
/* First convert/scale/store sequence: sign-extend 8 bytes to 32-bit ints,
 * convert to float, scale, store 8 floats. */
69 interimVal = _mm256_cvtepi8_epi32(inputVal128);
70 ret = _mm256_cvtepi32_ps(interimVal);
71 ret = _mm256_mul_ps(ret, invScalar);
72 _mm256_storeu_ps(outputVectorPtr, ret);
/* Second convert/scale/store sequence of the same iteration.
 * NOTE(review): whatever updates inputVal128 and outputVectorPtr between
 * the two sequences is not visible here - confirm. */
76 interimVal = _mm256_cvtepi8_epi32(inputVal128);
77 ret = _mm256_cvtepi32_ps(interimVal);
78 ret = _mm256_mul_ps(ret, invScalar);
79 _mm256_storeu_ps(outputVectorPtr, ret);
/* Scalar tail: resume at the first sample not covered by a full group of 16. */
85 number = sixteenthPoints * 16;
86 for (; number < num_points; number++) {
87 outputVector[number] = (float)(inputVector[number]) * iScalar;
/*
 * SSE4.1 kernel (unaligned variant, per the `_u_` name): converts int8_t
 * samples to float scaled by 1/scalar.
 *
 * outputVector - destination for the float results
 * inputVector  - source of int8_t samples
 * num_points   - number of samples to convert
 *
 * NOTE(review): the load/convert/multiply/store statements of the vector
 * loop and the declaration of `scalar` are not visible in this excerpt; only
 * the pointer advances and the scalar tail are. Confirm against the full file.
 */
96static inline void volk_8i_s32f_convert_32f_u_sse4_1(
float* outputVector,
97 const int8_t* inputVector,
99 unsigned int num_points)
101 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
102 const unsigned int sixteenthPoints = num_points / 16;
104 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loops multiply instead of divide. */
105 const float iScalar = 1.0 / scalar;
107 const int8_t* inputVectorPtr = inputVector;
112 for (; number < sixteenthPoints; number++) {
/* The output pointer advances by 4 floats four times per iteration
 * (16 outputs), matching the 16 input bytes consumed below. */
119 outputVectorPtr += 4;
126 outputVectorPtr += 4;
133 outputVectorPtr += 4;
140 outputVectorPtr += 4;
142 inputVectorPtr += 16;
/* Scalar tail: resume at the first sample not covered by a full group of 16. */
145 number = sixteenthPoints * 16;
146 for (; number < num_points; number++) {
147 outputVector[number] = (float)(inputVector[number]) * iScalar;
152#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel, compiled only when LV_HAVE_GENERIC is defined:
 * converts each int8_t sample to float and multiplies it by 1/scalar.
 *
 * NOTE(review): the line carrying the function name, the outputVector
 * parameter, and the `scalar` parameter is not visible in this excerpt.
 */
155 const int8_t* inputVector,
157 unsigned int num_points)
159 float* outputVectorPtr = outputVector;
160 const int8_t* inputVectorPtr = inputVector;
161 unsigned int number = 0;
/* Reciprocal computed once so the loop multiplies instead of divides. */
162 const float iScalar = 1.0 / scalar;
/* One sample per iteration; both pointers post-increment in the statement. */
164 for (number = 0; number < num_points; number++) {
165 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
173#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174#define INCLUDED_volk_8i_s32f_convert_32f_a_H
180#include <immintrin.h>
/*
 * AVX2 kernel (aligned variant): converts signed 8-bit samples to float and
 * multiplies each result by 1/scalar. The vector stores use
 * _mm256_store_ps, which requires a 32-byte-aligned destination address.
 *
 * outputVector - destination for the float results
 * inputVector  - source of int8_t samples
 * num_points   - number of samples to convert
 *
 * NOTE(review): this excerpt does not show the declaration of `scalar`, the
 * declarations of inputVal128/interimVal/ret, or the statements that load
 * and update inputVal128. Confirm against the full file.
 */
182static inline void volk_8i_s32f_convert_32f_a_avx2(
float* outputVector,
183 const int8_t* inputVector,
185 unsigned int num_points)
187 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
188 const unsigned int sixteenthPoints = num_points / 16;
190 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loops multiply instead of divide. */
191 const float iScalar = 1.0 / scalar;
/* Same reciprocal replicated into all 8 float lanes. */
192 __m256 invScalar = _mm256_set1_ps(iScalar);
193 const int8_t* inputVectorPtr = inputVector;
198 for (; number < sixteenthPoints; number++) {
/* First half: sign-extend 8 bytes to 32-bit ints, convert to float,
 * scale, store 8 floats, then advance the output pointer. */
201 interimVal = _mm256_cvtepi8_epi32(inputVal128);
202 ret = _mm256_cvtepi32_ps(interimVal);
203 ret = _mm256_mul_ps(ret, invScalar);
204 _mm256_store_ps(outputVectorPtr, ret);
205 outputVectorPtr += 8;
/* Second half: same sequence for the next 8 outputs.
 * NOTE(review): the update of inputVal128 before this point is not
 * visible in this excerpt - confirm. */
208 interimVal = _mm256_cvtepi8_epi32(inputVal128);
209 ret = _mm256_cvtepi32_ps(interimVal);
210 ret = _mm256_mul_ps(ret, invScalar);
211 _mm256_store_ps(outputVectorPtr, ret);
212 outputVectorPtr += 8;
/* 16 input bytes consumed per iteration. */
214 inputVectorPtr += 16;
/* Scalar tail: resume at the first sample not covered by a full group of 16. */
217 number = sixteenthPoints * 16;
218 for (; number < num_points; number++) {
219 outputVector[number] = (float)(inputVector[number]) * iScalar;
225#include <smmintrin.h>
/*
 * SSE4.1 kernel (aligned variant, per the `_a_` name): converts int8_t
 * samples to float scaled by 1/scalar.
 *
 * outputVector - destination for the float results
 * inputVector  - source of int8_t samples
 * num_points   - number of samples to convert
 *
 * NOTE(review): the load/convert/multiply/store statements of the vector
 * loop and the declaration of `scalar` are not visible in this excerpt; only
 * the pointer advances and the scalar tail are. Confirm against the full file.
 */
227static inline void volk_8i_s32f_convert_32f_a_sse4_1(
float* outputVector,
228 const int8_t* inputVector,
230 unsigned int num_points)
232 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
233 const unsigned int sixteenthPoints = num_points / 16;
235 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loops multiply instead of divide. */
236 const float iScalar = 1.0 / scalar;
238 const int8_t* inputVectorPtr = inputVector;
243 for (; number < sixteenthPoints; number++) {
/* The output pointer advances by 4 floats four times per iteration
 * (16 outputs), matching the 16 input bytes consumed below. */
250 outputVectorPtr += 4;
257 outputVectorPtr += 4;
264 outputVectorPtr += 4;
271 outputVectorPtr += 4;
273 inputVectorPtr += 16;
/* Scalar tail: resume at the first sample not covered by a full group of 16. */
276 number = sixteenthPoints * 16;
277 for (; number < num_points; number++) {
278 outputVector[number] = (float)(inputVector[number]) * iScalar;
/*
 * NEON kernel: converts int8_t samples to float scaled by 1/scalar,
 * 16 samples per vector-loop iteration, with a scalar loop for the rest.
 *
 * NOTE(review): the line carrying the function name, the outputVector
 * parameter, the `scalar` parameter, and the declarations of
 * inputVal/lower/higher are not visible in this excerpt.
 */
287 const int8_t* inputVector,
289 unsigned int num_points)
291 float* outputVectorPtr = outputVector;
292 const int8_t* inputVectorPtr = inputVector;
/* Reciprocal computed once so the loops multiply instead of divide. */
294 const float iScalar = 1.0 / scalar;
/* Same reciprocal replicated into all 4 float lanes. */
295 const float32x4_t qiScalar = vdupq_n_f32(iScalar);
302 float32x4_t outputFloat;
304 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
305 const unsigned int sixteenthPoints = num_points / 16;
306 for (; number < sixteenthPoints; number++) {
/* Load 16 int8 samples and advance the input pointer past them. */
307 inputVal = vld1q_s8(inputVectorPtr);
308 inputVectorPtr += 16;
/* Widen each 8-sample half from int8 to int16. */
310 lower = vmovl_s8(vget_low_s8(inputVal));
311 higher = vmovl_s8(vget_high_s8(inputVal));
/* For each group of 4: widen int16 -> int32, convert to float, scale by
 * the reciprocal, store 4 floats, advance the output pointer. */
313 outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
314 vst1q_f32(outputVectorPtr, outputFloat);
315 outputVectorPtr += 4;
317 outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
318 vst1q_f32(outputVectorPtr, outputFloat);
319 outputVectorPtr += 4;
321 outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
322 vst1q_f32(outputVectorPtr, outputFloat);
323 outputVectorPtr += 4;
/* NOTE(review): the assignment target of the next expression is not visible
 * in this excerpt; presumably `outputFloat =` sits on a preceding line, as in
 * the three groups above - confirm, otherwise the store below would rewrite
 * the previous group's values. */
326 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
327 vst1q_f32(outputVectorPtr, outputFloat);
328 outputVectorPtr += 4;
/* Scalar tail: continues from the pointers left by the vector loop. */
330 for (number = sixteenthPoints * 16; number < num_points; number++) {
331 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
/*
 * Prototype of the conversion routine defined outside this file.
 * NOTE(review): the third parameter is not visible in this excerpt; the
 * caller below passes the reciprocal of the scalar in that position.
 */
338extern void volk_8i_s32f_convert_32f_a_orc_impl(
float* outputVector,
339 const int8_t* inputVector,
341 unsigned int num_points);
/*
 * ORC entry point: computes 1/scalar and forwards all work to
 * volk_8i_s32f_convert_32f_a_orc_impl.
 *
 * outputVector - destination for the float results
 * inputVector  - source of int8_t samples
 * num_points   - number of samples to convert
 *
 * NOTE(review): the `scalar` parameter is not visible in this excerpt.
 * NOTE(review): this `_u_` wrapper calls the `_a_` implementation; confirm
 * that the implementation has no alignment requirement on its buffers.
 */
343static inline void volk_8i_s32f_convert_32f_u_orc(
float* outputVector,
344 const int8_t* inputVector,
346 unsigned int num_points)
/* The implementation receives the reciprocal, not the scalar itself. */
348 float invscalar = 1.0 / scalar;
349 volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);