Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
41#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42#define INCLUDED_volk_8i_s32f_convert_32f_u_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
47#ifdef LV_HAVE_AVX2
48#include <immintrin.h>
49
50static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
51 const int8_t* inputVector,
52 const float scalar,
53 unsigned int num_points)
54{
55 unsigned int number = 0;
56 const unsigned int sixteenthPoints = num_points / 16;
57
58 float* outputVectorPtr = outputVector;
59 const float iScalar = 1.0 / scalar;
60 __m256 invScalar = _mm256_set1_ps(iScalar);
61 const int8_t* inputVectorPtr = inputVector;
62 __m256 ret;
63 __m128i inputVal128;
64 __m256i interimVal;
65
66 for (; number < sixteenthPoints; number++) {
67 inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
68
69 interimVal = _mm256_cvtepi8_epi32(inputVal128);
70 ret = _mm256_cvtepi32_ps(interimVal);
71 ret = _mm256_mul_ps(ret, invScalar);
72 _mm256_storeu_ps(outputVectorPtr, ret);
73 outputVectorPtr += 8;
74
75 inputVal128 = _mm_srli_si128(inputVal128, 8);
76 interimVal = _mm256_cvtepi8_epi32(inputVal128);
77 ret = _mm256_cvtepi32_ps(interimVal);
78 ret = _mm256_mul_ps(ret, invScalar);
79 _mm256_storeu_ps(outputVectorPtr, ret);
80 outputVectorPtr += 8;
81
82 inputVectorPtr += 16;
83 }
84
85 number = sixteenthPoints * 16;
86 for (; number < num_points; number++) {
87 outputVector[number] = (float)(inputVector[number]) * iScalar;
88 }
89}
90#endif /* LV_HAVE_AVX2 */
91
92
93#ifdef LV_HAVE_SSE4_1
94#include <smmintrin.h>
95
96static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
97 const int8_t* inputVector,
98 const float scalar,
99 unsigned int num_points)
100{
101 unsigned int number = 0;
102 const unsigned int sixteenthPoints = num_points / 16;
103
104 float* outputVectorPtr = outputVector;
105 const float iScalar = 1.0 / scalar;
106 __m128 invScalar = _mm_set_ps1(iScalar);
107 const int8_t* inputVectorPtr = inputVector;
108 __m128 ret;
109 __m128i inputVal;
110 __m128i interimVal;
111
112 for (; number < sixteenthPoints; number++) {
113 inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
114
115 interimVal = _mm_cvtepi8_epi32(inputVal);
116 ret = _mm_cvtepi32_ps(interimVal);
117 ret = _mm_mul_ps(ret, invScalar);
118 _mm_storeu_ps(outputVectorPtr, ret);
119 outputVectorPtr += 4;
120
121 inputVal = _mm_srli_si128(inputVal, 4);
122 interimVal = _mm_cvtepi8_epi32(inputVal);
123 ret = _mm_cvtepi32_ps(interimVal);
124 ret = _mm_mul_ps(ret, invScalar);
125 _mm_storeu_ps(outputVectorPtr, ret);
126 outputVectorPtr += 4;
127
128 inputVal = _mm_srli_si128(inputVal, 4);
129 interimVal = _mm_cvtepi8_epi32(inputVal);
130 ret = _mm_cvtepi32_ps(interimVal);
131 ret = _mm_mul_ps(ret, invScalar);
132 _mm_storeu_ps(outputVectorPtr, ret);
133 outputVectorPtr += 4;
134
135 inputVal = _mm_srli_si128(inputVal, 4);
136 interimVal = _mm_cvtepi8_epi32(inputVal);
137 ret = _mm_cvtepi32_ps(interimVal);
138 ret = _mm_mul_ps(ret, invScalar);
139 _mm_storeu_ps(outputVectorPtr, ret);
140 outputVectorPtr += 4;
141
142 inputVectorPtr += 16;
143 }
144
145 number = sixteenthPoints * 16;
146 for (; number < num_points; number++) {
147 outputVector[number] = (float)(inputVector[number]) * iScalar;
148 }
149}
150#endif /* LV_HAVE_SSE4_1 */
151
152#ifdef LV_HAVE_GENERIC
153
/*
 * Portable reference implementation: convert signed 8-bit samples to float,
 * multiplying each by 1/scalar.
 *
 * outputVector: destination buffer, receives num_points floats.
 * inputVector:  source buffer of num_points int8_t samples.
 * scalar:       divisor; samples are multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Compute the reciprocal once so the loop multiplies instead of divides. */
    const float iScalar = 1.0 / scalar;

    for (unsigned int i = 0; i < num_points; i++) {
        outputVector[i] = ((float)inputVector[i]) * iScalar;
    }
}
168#endif /* LV_HAVE_GENERIC */
169
170
171#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
172
173#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174#define INCLUDED_volk_8i_s32f_convert_32f_a_H
175
176#include <inttypes.h>
177#include <stdio.h>
178
179#ifdef LV_HAVE_AVX2
180#include <immintrin.h>
181
182static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
183 const int8_t* inputVector,
184 const float scalar,
185 unsigned int num_points)
186{
187 unsigned int number = 0;
188 const unsigned int sixteenthPoints = num_points / 16;
189
190 float* outputVectorPtr = outputVector;
191 const float iScalar = 1.0 / scalar;
192 __m256 invScalar = _mm256_set1_ps(iScalar);
193 const int8_t* inputVectorPtr = inputVector;
194 __m256 ret;
195 __m128i inputVal128;
196 __m256i interimVal;
197
198 for (; number < sixteenthPoints; number++) {
199 inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
200
201 interimVal = _mm256_cvtepi8_epi32(inputVal128);
202 ret = _mm256_cvtepi32_ps(interimVal);
203 ret = _mm256_mul_ps(ret, invScalar);
204 _mm256_store_ps(outputVectorPtr, ret);
205 outputVectorPtr += 8;
206
207 inputVal128 = _mm_srli_si128(inputVal128, 8);
208 interimVal = _mm256_cvtepi8_epi32(inputVal128);
209 ret = _mm256_cvtepi32_ps(interimVal);
210 ret = _mm256_mul_ps(ret, invScalar);
211 _mm256_store_ps(outputVectorPtr, ret);
212 outputVectorPtr += 8;
213
214 inputVectorPtr += 16;
215 }
216
217 number = sixteenthPoints * 16;
218 for (; number < num_points; number++) {
219 outputVector[number] = (float)(inputVector[number]) * iScalar;
220 }
221}
222#endif /* LV_HAVE_AVX2 */
223
224#ifdef LV_HAVE_SSE4_1
225#include <smmintrin.h>
226
227static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
228 const int8_t* inputVector,
229 const float scalar,
230 unsigned int num_points)
231{
232 unsigned int number = 0;
233 const unsigned int sixteenthPoints = num_points / 16;
234
235 float* outputVectorPtr = outputVector;
236 const float iScalar = 1.0 / scalar;
237 __m128 invScalar = _mm_set_ps1(iScalar);
238 const int8_t* inputVectorPtr = inputVector;
239 __m128 ret;
240 __m128i inputVal;
241 __m128i interimVal;
242
243 for (; number < sixteenthPoints; number++) {
244 inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
245
246 interimVal = _mm_cvtepi8_epi32(inputVal);
247 ret = _mm_cvtepi32_ps(interimVal);
248 ret = _mm_mul_ps(ret, invScalar);
249 _mm_store_ps(outputVectorPtr, ret);
250 outputVectorPtr += 4;
251
252 inputVal = _mm_srli_si128(inputVal, 4);
253 interimVal = _mm_cvtepi8_epi32(inputVal);
254 ret = _mm_cvtepi32_ps(interimVal);
255 ret = _mm_mul_ps(ret, invScalar);
256 _mm_store_ps(outputVectorPtr, ret);
257 outputVectorPtr += 4;
258
259 inputVal = _mm_srli_si128(inputVal, 4);
260 interimVal = _mm_cvtepi8_epi32(inputVal);
261 ret = _mm_cvtepi32_ps(interimVal);
262 ret = _mm_mul_ps(ret, invScalar);
263 _mm_store_ps(outputVectorPtr, ret);
264 outputVectorPtr += 4;
265
266 inputVal = _mm_srli_si128(inputVal, 4);
267 interimVal = _mm_cvtepi8_epi32(inputVal);
268 ret = _mm_cvtepi32_ps(interimVal);
269 ret = _mm_mul_ps(ret, invScalar);
270 _mm_store_ps(outputVectorPtr, ret);
271 outputVectorPtr += 4;
272
273 inputVectorPtr += 16;
274 }
275
276 number = sixteenthPoints * 16;
277 for (; number < num_points; number++) {
278 outputVector[number] = (float)(inputVector[number]) * iScalar;
279 }
280}
281#endif /* LV_HAVE_SSE4_1 */
282
283#ifdef LV_HAVE_NEON
284#include <arm_neon.h>
285
/*
 * Convert signed 8-bit samples to float, multiplying each by 1/scalar.
 * ARM NEON variant.
 *
 * outputVector: destination buffer, receives num_points floats.
 * inputVector:  source buffer of num_points int8_t samples.
 * scalar:       divisor; samples are multiplied by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const float32x4_t scale = vdupq_n_f32(iScalar);
    const unsigned int blocks = num_points / 16;

    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blocks; block++) {
        const int8x16_t bytes = vld1q_s8(src);

        /* Widen 8 -> 16 bit, one vector per half of the 16 loaded samples. */
        const int16x8_t lo16 = vmovl_s8(vget_low_s8(bytes));
        const int16x8_t hi16 = vmovl_s8(vget_high_s8(bytes));

        /* Widen 16 -> 32 bit, giving four vectors of four samples each. */
        const int32x4_t q0 = vmovl_s16(vget_low_s16(lo16));
        const int32x4_t q1 = vmovl_s16(vget_high_s16(lo16));
        const int32x4_t q2 = vmovl_s16(vget_low_s16(hi16));
        const int32x4_t q3 = vmovl_s16(vget_high_s16(hi16));

        /* Convert to float, scale and store in sample order. */
        vst1q_f32(dst + 0, vmulq_f32(vcvtq_f32_s32(q0), scale));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(q1), scale));
        vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(q2), scale));
        vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(q3), scale));

        src += 16;
        dst += 16;
    }

    /* Scalar path for the samples that do not fill a 16-sample block. */
    for (unsigned int i = blocks * 16; i < num_points; i++) {
        outputVector[i] = ((float)inputVector[i]) * iScalar;
    }
}
334
335#endif /* LV_HAVE_NEON */
336
337#ifdef LV_HAVE_ORC
/*
 * ORC kernel, defined outside this header. Its third argument receives the
 * value the wrapper below passes in, i.e. the reciprocal of the user scalar.
 */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points);

/*
 * ORC entry point: computes 1/scalar and forwards all arguments to the
 * out-of-line ORC implementation.
 *
 * outputVector: destination buffer, forwarded unchanged.
 * inputVector:  source buffer, forwarded unchanged.
 * scalar:       divisor; its reciprocal is what the implementation receives.
 * num_points:   number of samples, forwarded unchanged.
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const float reciprocal = 1.0 / scalar;

    volk_8i_s32f_convert_32f_a_orc_impl(
        outputVector, inputVector, reciprocal, num_points);
}
351#endif /* LV_HAVE_ORC */
352
353
354#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */