Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_magnitude_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
60#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
61#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
62
63#include <inttypes.h>
64#include <math.h>
65#include <stdio.h>
66#include <volk/volk_common.h>
67
68#ifdef LV_HAVE_GENERIC
69
70static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
71 const lv_32fc_t* complexVector,
72 const float scalar,
73 unsigned int num_points)
74{
75 const float* complexVectorPtr = (float*)complexVector;
76 int16_t* magnitudeVectorPtr = magnitudeVector;
77 unsigned int number = 0;
78 for (number = 0; number < num_points; number++) {
79 __VOLK_VOLATILE float real = *complexVectorPtr++;
80 __VOLK_VOLATILE float imag = *complexVectorPtr++;
81 real *= real;
82 imag *= imag;
83 *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
84 }
85}
86#endif /* LV_HAVE_GENERIC */
87
88#ifdef LV_HAVE_AVX2
89#include <immintrin.h>
90
91static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
92 const lv_32fc_t* complexVector,
93 const float scalar,
94 unsigned int num_points)
95{
96 unsigned int number = 0;
97 const unsigned int eighthPoints = num_points / 8;
98
99 const float* complexVectorPtr = (const float*)complexVector;
100 int16_t* magnitudeVectorPtr = magnitudeVector;
101
102 __m256 vScalar = _mm256_set1_ps(scalar);
103 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
104 __m256 cplxValue1, cplxValue2, result;
105 __m256i resultInt;
106 __m128i resultShort;
107
108 for (; number < eighthPoints; number++) {
109 cplxValue1 = _mm256_load_ps(complexVectorPtr);
110 complexVectorPtr += 8;
111
112 cplxValue2 = _mm256_load_ps(complexVectorPtr);
113 complexVectorPtr += 8;
114
115 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
116 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
117
118 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
119
120 result = _mm256_sqrt_ps(result);
121
122 result = _mm256_mul_ps(result, vScalar);
123
124 resultInt = _mm256_cvtps_epi32(result);
125 resultInt = _mm256_packs_epi32(resultInt, resultInt);
126 resultInt = _mm256_permutevar8x32_epi32(
127 resultInt, idx); // permute to compensate for shuffling in hadd and packs
128 resultShort = _mm256_extracti128_si256(resultInt, 0);
129 _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
130 magnitudeVectorPtr += 8;
131 }
132
133 number = eighthPoints * 8;
135 magnitudeVector + number, complexVector + number, scalar, num_points - number);
136}
137#endif /* LV_HAVE_AVX2 */
138
139#ifdef LV_HAVE_SSE3
140#include <pmmintrin.h>
141
142static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
143 const lv_32fc_t* complexVector,
144 const float scalar,
145 unsigned int num_points)
146{
147 unsigned int number = 0;
148 const unsigned int quarterPoints = num_points / 4;
149
150 const float* complexVectorPtr = (const float*)complexVector;
151 int16_t* magnitudeVectorPtr = magnitudeVector;
152
153 __m128 vScalar = _mm_set_ps1(scalar);
154
155 __m128 cplxValue1, cplxValue2, result;
156
157 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
158
159 for (; number < quarterPoints; number++) {
160 cplxValue1 = _mm_load_ps(complexVectorPtr);
161 complexVectorPtr += 4;
162
163 cplxValue2 = _mm_load_ps(complexVectorPtr);
164 complexVectorPtr += 4;
165
166 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
167 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
168
169 result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
170
171 result = _mm_sqrt_ps(result);
172
173 result = _mm_mul_ps(result, vScalar);
174
175 _mm_store_ps(floatBuffer, result);
176 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
177 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
178 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
179 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
180 }
181
182 number = quarterPoints * 4;
184 magnitudeVector + number, complexVector + number, scalar, num_points - number);
185}
186#endif /* LV_HAVE_SSE3 */
187
188
189#ifdef LV_HAVE_SSE
190#include <xmmintrin.h>
191
192static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
193 const lv_32fc_t* complexVector,
194 const float scalar,
195 unsigned int num_points)
196{
197 unsigned int number = 0;
198 const unsigned int quarterPoints = num_points / 4;
199
200 const float* complexVectorPtr = (const float*)complexVector;
201 int16_t* magnitudeVectorPtr = magnitudeVector;
202
203 __m128 vScalar = _mm_set_ps1(scalar);
204
205 __m128 cplxValue1, cplxValue2, result;
206 __m128 iValue, qValue;
207
208 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
209
210 for (; number < quarterPoints; number++) {
211 cplxValue1 = _mm_load_ps(complexVectorPtr);
212 complexVectorPtr += 4;
213
214 cplxValue2 = _mm_load_ps(complexVectorPtr);
215 complexVectorPtr += 4;
216
217 // Arrange in i1i2i3i4 format
218 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
219 // Arrange in q1q2q3q4 format
220 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
221
222 __VOLK_VOLATILE __m128 iValue2 =
223 _mm_mul_ps(iValue, iValue); // Square the I values
224 __VOLK_VOLATILE __m128 qValue2 =
225 _mm_mul_ps(qValue, qValue); // Square the Q Values
226
227 result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
228
229 result = _mm_sqrt_ps(result);
230
231 result = _mm_mul_ps(result, vScalar);
232
233 _mm_store_ps(floatBuffer, result);
234 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
235 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
236 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
237 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
238 }
239
240 number = quarterPoints * 4;
242 magnitudeVector + number, complexVector + number, scalar, num_points - number);
243}
244#endif /* LV_HAVE_SSE */
245
246
247#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
248
249#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
250#define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
251
252#include <inttypes.h>
253#include <math.h>
254#include <stdio.h>
255#include <volk/volk_common.h>
256
257#ifdef LV_HAVE_AVX2
258#include <immintrin.h>
259
260static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
261 const lv_32fc_t* complexVector,
262 const float scalar,
263 unsigned int num_points)
264{
265 unsigned int number = 0;
266 const unsigned int eighthPoints = num_points / 8;
267
268 const float* complexVectorPtr = (const float*)complexVector;
269 int16_t* magnitudeVectorPtr = magnitudeVector;
270
271 __m256 vScalar = _mm256_set1_ps(scalar);
272 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
273 __m256 cplxValue1, cplxValue2, result;
274 __m256i resultInt;
275 __m128i resultShort;
276
277 for (; number < eighthPoints; number++) {
278 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
279 complexVectorPtr += 8;
280
281 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
282 complexVectorPtr += 8;
283
284 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
285 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
286
287 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
288
289 result = _mm256_sqrt_ps(result);
290
291 result = _mm256_mul_ps(result, vScalar);
292
293 resultInt = _mm256_cvtps_epi32(result);
294 resultInt = _mm256_packs_epi32(resultInt, resultInt);
295 resultInt = _mm256_permutevar8x32_epi32(
296 resultInt, idx); // permute to compensate for shuffling in hadd and packs
297 resultShort = _mm256_extracti128_si256(resultInt, 0);
298 _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
299 magnitudeVectorPtr += 8;
300 }
301
302 number = eighthPoints * 8;
304 magnitudeVector + number, complexVector + number, scalar, num_points - number);
305}
306#endif /* LV_HAVE_AVX2 */
307
308#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */