41#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
42#define INCLUDED_volk_16ic_magnitude_16i_a_H
/*
 * Aligned AVX2 kernel: for each interleaved 16-bit (real, imag) input pair,
 * computes sqrt(re^2 + im^2) and produces one 16-bit magnitude.
 * Components are normalised by 1/SHRT_MAX before squaring and the result is
 * scaled back by SHRT_MAX after the square root.
 *
 * magnitudeVector  output buffer, one int16_t per complex sample
 * complexVector    input samples, read as interleaved int16_t values
 * num_points       number of complex samples to process
 *
 * The vector loop uses _mm256_load_si256, which requires the input pointer
 * to be 32-byte aligned.
 *
 * NOTE(review): the complexVector parameter line, the braces, the
 * declarations of int1/int2/short1/short2, the argument line of the permute
 * call and the store of short1 to magnitudeVectorPtr are not present in this
 * listing -- confirm against the full file.
 */
53static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
55 unsigned int num_points)
57 unsigned int number = 0;
/* Number of full groups of 8 complex samples handled by the vector loop. */
58 const unsigned int eighthPoints = num_points / 8;
/* View the complex input as a flat array of int16_t values. */
60 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
61 int16_t* magnitudeVectorPtr = magnitudeVector;
/* Scale factors broadcast to all eight float lanes. */
63 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
64 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
67 __m256 cplxValue1, cplxValue2, result;
/*
 * Index vector for the final permute (arguments run from element 7 down to
 * element 0): output dwords 0..3 select source dwords 0, 4, 1, 5. This
 * restores sample order after the hadd and packs steps, which both operate
 * per 128-bit lane. Only the low 128 bits are extracted afterwards, so the
 * upper four entries do not influence the extracted result.
 */
68 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
70 for (; number < eighthPoints; number++) {
/* Aligned load of 16 int16_t values, i.e. 8 complex samples. */
72 int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
73 complexVectorPtr += 16;
/* Split into two 128-bit halves holding 4 complex samples each. */
74 short1 = _mm256_extracti128_si256(int1, 0);
75 short2 = _mm256_extracti128_si256(int1, 1);
/* Sign-extend to 32-bit integers, then convert to float. */
77 int1 = _mm256_cvtepi16_epi32(short1);
78 int2 = _mm256_cvtepi16_epi32(short2);
79 cplxValue1 = _mm256_cvtepi32_ps(int1);
80 cplxValue2 = _mm256_cvtepi32_ps(int2);
/* Normalise every component by 1/SHRT_MAX. */
82 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
83 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
/* Square every component. */
85 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
86 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
/*
 * Adjacent-pair sums yield re^2 + im^2 per sample; because hadd works per
 * 128-bit lane the eight sums come out in the order 0,1,4,5,2,3,6,7.
 */
88 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
90 result = _mm256_sqrt_ps(result);
/* Scale back to the int16 range. */
92 result = _mm256_mul_ps(result, vScalar);
/*
 * Float to int32 (rounding follows the current MXCSR rounding mode), then a
 * signed-saturating pack to int16, then the reordering permute.
 */
94 int1 = _mm256_cvtps_epi32(result);
95 int1 = _mm256_packs_epi32(int1, int1);
96 int1 = _mm256_permutevar8x32_epi32(
98 short1 = _mm256_extracti128_si256(int1, 0);
/* Advance the output past the 8 magnitudes of this iteration. */
100 magnitudeVectorPtr += 8;
/* Scalar tail: process the remaining num_points % 8 samples. */
103 number = eighthPoints * 8;
104 magnitudeVectorPtr = &magnitudeVector[number];
105 complexVectorPtr = (
const int16_t*)&complexVector[number];
106 for (; number < num_points; number++) {
107 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
108 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
109 const float val1Result =
110 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
/*
 * NOTE(review): when both components are near full scale the magnitude
 * exceeds SHRT_MAX (up to roughly 1.41 * SHRT_MAX). The cast below is then
 * out of range for int16_t, whereas the vector path saturates in
 * _mm256_packs_epi32 -- confirm the intended behaviour.
 */
111 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
117#include <pmmintrin.h>
/*
 * Visible part of a kernel that processes 4 complex samples per vector
 * iteration in __m128 registers, followed by a scalar loop for the
 * remaining samples. Each output is sqrt(re^2 + im^2) of one interleaved
 * 16-bit (real, imag) pair, written as int16_t.
 *
 * NOTE(review): the function name and leading parameters, the declarations
 * of invScalar, inputFloatBuffer and outputFloatBuffer, the loads into
 * cplxValue1/cplxValue2 and every step between the squaring and the reads
 * of outputFloatBuffer are not present in this listing -- confirm against
 * the full file.
 */
121 unsigned int num_points)
123 unsigned int number = 0;
/* Number of full groups of 4 complex samples handled by the vector loop. */
124 const unsigned int quarterPoints = num_points / 4;
/* View the complex input as a flat array of int16_t values. */
126 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
127 int16_t* magnitudeVectorPtr = magnitudeVector;
132 __m128 cplxValue1, cplxValue2, result;
137 for (; number < quarterPoints; number++) {
/* Convert 8 int16_t values (4 complex samples) to float in a staging buffer. */
139 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
140 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
141 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
142 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
144 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
145 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
146 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
147 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
/* Step past the 8 int16_t values just consumed. */
152 complexVectorPtr += 8;
/* Scale every component by invScalar. */
154 cplxValue1 =
_mm_mul_ps(cplxValue1, invScalar);
155 cplxValue2 =
_mm_mul_ps(cplxValue2, invScalar);
/* Square every component. */
157 cplxValue1 =
_mm_mul_ps(cplxValue1, cplxValue1);
158 cplxValue2 =
_mm_mul_ps(cplxValue2, cplxValue2);
/*
 * Round each of the four staged float results with rintf and write it out
 * as int16_t.
 * NOTE(review): a value above SHRT_MAX makes this cast out of range for
 * int16_t -- confirm whether clamping is expected.
 */
167 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
168 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
169 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
170 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
/* Scalar tail: process the remaining num_points % 4 samples. */
173 number = quarterPoints * 4;
174 magnitudeVectorPtr = &magnitudeVector[number];
175 complexVectorPtr = (
const int16_t*)&complexVector[number];
176 for (; number < num_points; number++) {
/* Normalise by SHRT_MAX, take the magnitude, scale back, round. */
177 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
178 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
179 const float val1Result =
180 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
181 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
187#include <xmmintrin.h>
/*
 * Visible part of a kernel that processes 4 complex samples per vector
 * iteration in __m128 registers, followed by a scalar loop for the
 * remaining samples. Each output is the magnitude of one interleaved 16-bit
 * (real, imag) pair, written as int16_t.
 *
 * NOTE(review): the function name and leading parameters, the declarations
 * of invScalar, inputFloatBuffer and outputFloatBuffer, the loads into
 * cplxValue1/cplxValue2, all uses of iValue/qValue and the steps that fill
 * outputFloatBuffer are not present in this listing -- confirm against the
 * full file.
 */
191 unsigned int num_points)
193 unsigned int number = 0;
/* Number of full groups of 4 complex samples handled by the vector loop. */
194 const unsigned int quarterPoints = num_points / 4;
/* View the complex input as a flat array of int16_t values. */
196 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
197 int16_t* magnitudeVectorPtr = magnitudeVector;
202 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
207 for (; number < quarterPoints; number++) {
/* Stage the first 4 int16_t values (2 complex samples) as floats. */
209 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
210 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
211 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
212 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
215 complexVectorPtr += 4;
/*
 * The same four staging slots are refilled with the next 4 int16_t values.
 * Presumably the buffer is loaded into a vector register between the two
 * fills; that line is not present in this listing -- TODO confirm.
 */
217 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
218 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
219 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
220 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
223 complexVectorPtr += 4;
/* Scale every component by invScalar. */
225 cplxValue1 =
_mm_mul_ps(cplxValue1, invScalar);
226 cplxValue2 =
_mm_mul_ps(cplxValue2, invScalar);
/*
 * Round each of the four staged float results with rintf and write it out
 * as int16_t.
 * NOTE(review): a value above SHRT_MAX makes this cast out of range for
 * int16_t -- confirm whether clamping is expected.
 */
243 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
244 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
245 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
246 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
/* Scalar tail: process the remaining num_points % 4 samples. */
249 number = quarterPoints * 4;
250 magnitudeVectorPtr = &magnitudeVector[number];
251 complexVectorPtr = (
const int16_t*)&complexVector[number];
252 for (; number < num_points; number++) {
/* Normalise by SHRT_MAX, take the magnitude, scale back, round. */
253 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
254 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
255 const float val1Result =
256 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
257 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
262#ifdef LV_HAVE_GENERIC
/*
 * Visible part of a plain scalar kernel: for each of num_points interleaved
 * 16-bit (real, imag) pairs it computes sqrt(re^2 + im^2) on values
 * normalised by SHRT_MAX, scales the result back by SHRT_MAX, rounds it
 * with rintf and stores it as int16_t.
 *
 * NOTE(review): the function name, the leading parameters and the braces
 * are not present in this listing -- confirm against the full file.
 */
266 unsigned int num_points)
/* View the complex input as a flat array of int16_t values. */
268 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
269 int16_t* magnitudeVectorPtr = magnitudeVector;
270 unsigned int number = 0;
/* Full-scale value used both to normalise the input and to rescale the output. */
271 const float scalar = SHRT_MAX;
272 for (number = 0; number < num_points; number++) {
/* Two consecutive int16_t values form one sample: real first, then imag. */
273 float real = ((float)(*complexVectorPtr++)) / scalar;
274 float imag = ((float)(*complexVectorPtr++)) / scalar;
/*
 * NOTE(review): when both components are near full scale the magnitude
 * exceeds SHRT_MAX and the cast to int16_t is out of range -- confirm
 * whether clamping is expected.
 */
275 *magnitudeVectorPtr++ =
276 (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
285#ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
286#define INCLUDED_volk_16ic_magnitude_16i_u_H
294#include <immintrin.h>
/*
 * Unaligned AVX2 kernel: for each interleaved 16-bit (real, imag) input
 * pair, computes sqrt(re^2 + im^2) and produces one 16-bit magnitude.
 * Components are normalised by 1/SHRT_MAX before squaring and the result is
 * scaled back by SHRT_MAX after the square root.
 *
 * magnitudeVector  output buffer, one int16_t per complex sample
 * complexVector    input samples, read as interleaved int16_t values
 * num_points       number of complex samples to process
 *
 * The vector loop reads its input with _mm256_loadu_si256, so the input
 * pointer needs no particular alignment for that load.
 *
 * NOTE(review): the complexVector parameter line, the braces, the
 * declarations of int1/int2/short1/short2, the argument line of the permute
 * call and the store of short1 to magnitudeVectorPtr are not present in this
 * listing -- confirm against the full file.
 */
296static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
298 unsigned int num_points)
300 unsigned int number = 0;
/* Number of full groups of 8 complex samples handled by the vector loop. */
301 const unsigned int eighthPoints = num_points / 8;
/* View the complex input as a flat array of int16_t values. */
303 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
304 int16_t* magnitudeVectorPtr = magnitudeVector;
/* Scale factors broadcast to all eight float lanes. */
306 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
307 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
310 __m256 cplxValue1, cplxValue2, result;
/*
 * Index vector for the final permute (arguments run from element 7 down to
 * element 0): output dwords 0..3 select source dwords 0, 4, 1, 5. This
 * restores sample order after the hadd and packs steps, which both operate
 * per 128-bit lane. Only the low 128 bits are extracted afterwards, so the
 * upper four entries do not influence the extracted result.
 */
311 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
313 for (; number < eighthPoints; number++) {
/* Unaligned load of 16 int16_t values, i.e. 8 complex samples. */
315 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
316 complexVectorPtr += 16;
/* Split into two 128-bit halves holding 4 complex samples each. */
317 short1 = _mm256_extracti128_si256(int1, 0);
318 short2 = _mm256_extracti128_si256(int1, 1);
/* Sign-extend to 32-bit integers, then convert to float. */
320 int1 = _mm256_cvtepi16_epi32(short1);
321 int2 = _mm256_cvtepi16_epi32(short2);
322 cplxValue1 = _mm256_cvtepi32_ps(int1);
323 cplxValue2 = _mm256_cvtepi32_ps(int2);
/* Normalise every component by 1/SHRT_MAX. */
325 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
326 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
/* Square every component. */
328 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
329 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
/*
 * Adjacent-pair sums yield re^2 + im^2 per sample; because hadd works per
 * 128-bit lane the eight sums come out in the order 0,1,4,5,2,3,6,7.
 */
331 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
333 result = _mm256_sqrt_ps(result);
/* Scale back to the int16 range. */
335 result = _mm256_mul_ps(result, vScalar);
/*
 * Float to int32 (rounding follows the current MXCSR rounding mode), then a
 * signed-saturating pack to int16, then the reordering permute.
 */
337 int1 = _mm256_cvtps_epi32(result);
338 int1 = _mm256_packs_epi32(int1, int1);
339 int1 = _mm256_permutevar8x32_epi32(
341 short1 = _mm256_extracti128_si256(int1, 0);
/* Advance the output past the 8 magnitudes of this iteration. */
343 magnitudeVectorPtr += 8;
/* Scalar tail: process the remaining num_points % 8 samples. */
346 number = eighthPoints * 8;
347 magnitudeVectorPtr = &magnitudeVector[number];
348 complexVectorPtr = (
const int16_t*)&complexVector[number];
349 for (; number < num_points; number++) {
350 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
351 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
352 const float val1Result =
353 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
/*
 * NOTE(review): when both components are near full scale the magnitude
 * exceeds SHRT_MAX (up to roughly 1.41 * SHRT_MAX). The cast below is then
 * out of range for int16_t, whereas the vector path saturates in
 * _mm256_packs_epi32 -- confirm the intended behaviour.
 */
354 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
/*
 * NEON kernel: computes one 16-bit magnitude per 16-bit complex input
 * sample, 4 samples per vector iteration, with a scalar loop for the
 * remaining samples. Components are scaled by 1/SHRT_MAX before the
 * magnitude is taken and the result is scaled back by SHRT_MAX.
 *
 * magnitudeVector  output buffer, one int16_t per complex sample
 * complexVector    input samples (lv_16sc_t)
 * num_points       number of complex samples to process
 *
 * NOTE(review): the complexVector parameter line, the braces, the
 * declarations of c_vec, mag_vec and mag_vec_squared, the computation of
 * mag_vec_squared and the input-pointer increment of the scalar loop are
 * not present in this listing, and the definition may continue past the
 * last visible line -- confirm against the full file.
 */
363static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
365 unsigned int num_points)
367 unsigned int number = 0;
/* Number of full groups of 4 complex samples handled by the vector loop. */
368 unsigned int quarter_points = num_points / 4;
370 const float scalar = SHRT_MAX;
371 const float inv_scalar = 1.0f / scalar;
373 int16_t* magnitudeVectorPtr = magnitudeVector;
374 const lv_16sc_t* complexVectorPtr = complexVector;
379 for (number = 0; number < quarter_points; number++) {
/*
 * De-interleaving load of 8 int16_t values: val[0] receives the first
 * component of each of the 4 pairs, val[1] the second.
 */
380 const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
/* Widen to 32-bit integers and convert to float. */
382 c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
383 c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
/* Scale both component vectors by 1/SHRT_MAX. */
385 c_vec.val[0] = vmulq_n_f32(c_vec.val[0], inv_scalar);
386 c_vec.val[1] = vmulq_n_f32(c_vec.val[1], inv_scalar);
/*
 * Multiplies x by _vinvsqrtq_f32(x). The helper is defined elsewhere;
 * presumably it approximates 1/sqrt(x), making this product sqrt(x).
 * NOTE(review): accuracy and the behaviour for x == 0 depend on that
 * helper -- confirm against its definition.
 */
389 mag_vec = vmulq_f32(mag_vec_squared,
_vinvsqrtq_f32(mag_vec_squared));
/* Scale back to the int16 range. */
391 mag_vec = vmulq_n_f32(mag_vec, scalar);
/*
 * Adding 0.5 before vcvtq_s32_f32 (which rounds toward zero) rounds these
 * values half-up; the scalar loop below uses rintf instead, so exact .5
 * cases may differ between the two paths. vmovn_s32 keeps only the low
 * 16 bits without saturating.
 * NOTE(review): confirm results above SHRT_MAX cannot occur or are acceptable.
 */
394 mag_vec = vaddq_f32(mag_vec, vdupq_n_f32(0.5));
395 const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
396 vst1_s16(magnitudeVectorPtr, mag16_vec);
/* Advance by 4 outputs and 4 complex input samples. */
398 magnitudeVectorPtr += 4;
399 complexVectorPtr += 4;
/* Scalar loop: process the remaining num_points % 4 samples. */
403 for (number = quarter_points * 4; number < num_points; number++) {
404 const float real =
lv_creal(*complexVectorPtr) * inv_scalar;
405 const float imag =
lv_cimag(*complexVectorPtr) * inv_scalar;
406 *magnitudeVectorPtr =
407 (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
409 magnitudeVectorPtr++;