Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_magnitude_squared_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
59#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
60
61#include <inttypes.h>
62#include <math.h>
63#include <stdio.h>
64
65#ifdef LV_HAVE_AVX
66#include <immintrin.h>
68
69static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector,
70 const lv_32fc_t* complexVector,
71 unsigned int num_points)
72{
73 unsigned int number = 0;
74 const unsigned int eighthPoints = num_points / 8;
75
76 const float* complexVectorPtr = (float*)complexVector;
77 float* magnitudeVectorPtr = magnitudeVector;
78
79 __m256 cplxValue1, cplxValue2, result;
80
81 for (; number < eighthPoints; number++) {
82 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
83 cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
84 result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
85 _mm256_storeu_ps(magnitudeVectorPtr, result);
86
87 complexVectorPtr += 16;
88 magnitudeVectorPtr += 8;
89 }
90
91 number = eighthPoints * 8;
92 for (; number < num_points; number++) {
93 float val1Real = *complexVectorPtr++;
94 float val1Imag = *complexVectorPtr++;
95 *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
96 }
97}
98#endif /* LV_HAVE_AVX */
99
100
101#ifdef LV_HAVE_SSE3
102#include <pmmintrin.h>
104
105static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector,
106 const lv_32fc_t* complexVector,
107 unsigned int num_points)
108{
109 unsigned int number = 0;
110 const unsigned int quarterPoints = num_points / 4;
111
112 const float* complexVectorPtr = (float*)complexVector;
113 float* magnitudeVectorPtr = magnitudeVector;
114
115 __m128 cplxValue1, cplxValue2, result;
116 for (; number < quarterPoints; number++) {
117 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
118 complexVectorPtr += 4;
119
120 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
121 complexVectorPtr += 4;
122
123 result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
124 _mm_storeu_ps(magnitudeVectorPtr, result);
125 magnitudeVectorPtr += 4;
126 }
127
128 number = quarterPoints * 4;
129 for (; number < num_points; number++) {
130 float val1Real = *complexVectorPtr++;
131 float val1Imag = *complexVectorPtr++;
132 *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
133 }
134}
135#endif /* LV_HAVE_SSE3 */
136
137
138#ifdef LV_HAVE_SSE
140#include <xmmintrin.h>
141
142static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector,
143 const lv_32fc_t* complexVector,
144 unsigned int num_points)
145{
146 unsigned int number = 0;
147 const unsigned int quarterPoints = num_points / 4;
148
149 const float* complexVectorPtr = (float*)complexVector;
150 float* magnitudeVectorPtr = magnitudeVector;
151
152 __m128 cplxValue1, cplxValue2, result;
153
154 for (; number < quarterPoints; number++) {
155 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
156 complexVectorPtr += 4;
157
158 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
159 complexVectorPtr += 4;
160
161 result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
162 _mm_storeu_ps(magnitudeVectorPtr, result);
163 magnitudeVectorPtr += 4;
164 }
165
166 number = quarterPoints * 4;
167 for (; number < num_points; number++) {
168 float val1Real = *complexVectorPtr++;
169 float val1Imag = *complexVectorPtr++;
170 *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
171 }
172}
173#endif /* LV_HAVE_SSE */
174
175
176#ifdef LV_HAVE_GENERIC
177
178static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector,
179 const lv_32fc_t* complexVector,
180 unsigned int num_points)
181{
182 const float* complexVectorPtr = (float*)complexVector;
183 float* magnitudeVectorPtr = magnitudeVector;
184 unsigned int number = 0;
185 for (number = 0; number < num_points; number++) {
186 const float real = *complexVectorPtr++;
187 const float imag = *complexVectorPtr++;
188 *magnitudeVectorPtr++ = (real * real) + (imag * imag);
189 }
190}
191#endif /* LV_HAVE_GENERIC */
192
193
194#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */
195#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
196#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
197
198#include <inttypes.h>
199#include <math.h>
200#include <stdio.h>
201
202#ifdef LV_HAVE_AVX
203#include <immintrin.h>
205
206static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector,
207 const lv_32fc_t* complexVector,
208 unsigned int num_points)
209{
210 unsigned int number = 0;
211 const unsigned int eighthPoints = num_points / 8;
212
213 const float* complexVectorPtr = (float*)complexVector;
214 float* magnitudeVectorPtr = magnitudeVector;
215
216 __m256 cplxValue1, cplxValue2, result;
217 for (; number < eighthPoints; number++) {
218 cplxValue1 = _mm256_load_ps(complexVectorPtr);
219 complexVectorPtr += 8;
220
221 cplxValue2 = _mm256_load_ps(complexVectorPtr);
222 complexVectorPtr += 8;
223
224 result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
225 _mm256_store_ps(magnitudeVectorPtr, result);
226 magnitudeVectorPtr += 8;
227 }
228
229 number = eighthPoints * 8;
230 for (; number < num_points; number++) {
231 float val1Real = *complexVectorPtr++;
232 float val1Imag = *complexVectorPtr++;
233 *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
234 }
235}
236#endif /* LV_HAVE_AVX */
237
238
239#ifdef LV_HAVE_SSE3
240#include <pmmintrin.h>
242
243static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector,
244 const lv_32fc_t* complexVector,
245 unsigned int num_points)
246{
247 unsigned int number = 0;
248 const unsigned int quarterPoints = num_points / 4;
249
250 const float* complexVectorPtr = (float*)complexVector;
251 float* magnitudeVectorPtr = magnitudeVector;
252
253 __m128 cplxValue1, cplxValue2, result;
254 for (; number < quarterPoints; number++) {
255 cplxValue1 = _mm_load_ps(complexVectorPtr);
256 complexVectorPtr += 4;
257
258 cplxValue2 = _mm_load_ps(complexVectorPtr);
259 complexVectorPtr += 4;
260
261 result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
262 _mm_store_ps(magnitudeVectorPtr, result);
263 magnitudeVectorPtr += 4;
264 }
265
266 number = quarterPoints * 4;
267 for (; number < num_points; number++) {
268 float val1Real = *complexVectorPtr++;
269 float val1Imag = *complexVectorPtr++;
270 *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
271 }
272}
273#endif /* LV_HAVE_SSE3 */
274
275
276#ifdef LV_HAVE_SSE
278#include <xmmintrin.h>
279
280static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector,
281 const lv_32fc_t* complexVector,
282 unsigned int num_points)
283{
284 unsigned int number = 0;
285 const unsigned int quarterPoints = num_points / 4;
286
287 const float* complexVectorPtr = (float*)complexVector;
288 float* magnitudeVectorPtr = magnitudeVector;
289
290 __m128 cplxValue1, cplxValue2, result;
291 for (; number < quarterPoints; number++) {
292 cplxValue1 = _mm_load_ps(complexVectorPtr);
293 complexVectorPtr += 4;
294
295 cplxValue2 = _mm_load_ps(complexVectorPtr);
296 complexVectorPtr += 4;
297
298 result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
299 _mm_store_ps(magnitudeVectorPtr, result);
300 magnitudeVectorPtr += 4;
301 }
302
303 number = quarterPoints * 4;
304 for (; number < num_points; number++) {
305 float val1Real = *complexVectorPtr++;
306 float val1Imag = *complexVectorPtr++;
307 *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
308 }
309}
310#endif /* LV_HAVE_SSE */
311
312
313#ifdef LV_HAVE_NEON
314#include <arm_neon.h>
315
316static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector,
317 const lv_32fc_t* complexVector,
318 unsigned int num_points)
319{
320 unsigned int number = 0;
321 const unsigned int quarterPoints = num_points / 4;
322
323 const float* complexVectorPtr = (float*)complexVector;
324 float* magnitudeVectorPtr = magnitudeVector;
325
326 float32x4x2_t cmplx_val;
327 float32x4_t result;
328 for (; number < quarterPoints; number++) {
329 cmplx_val = vld2q_f32(complexVectorPtr);
330 complexVectorPtr += 8;
331
332 cmplx_val.val[0] =
333 vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
334 cmplx_val.val[1] =
335 vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
336
337 result =
338 vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
339
340 vst1q_f32(magnitudeVectorPtr, result);
341 magnitudeVectorPtr += 4;
342 }
343
344 number = quarterPoints * 4;
345 for (; number < num_points; number++) {
346 float val1Real = *complexVectorPtr++;
347 float val1Imag = *complexVectorPtr++;
348 *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
349 }
350}
351#endif /* LV_HAVE_NEON */
352
353
354#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */