Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_stddev_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
55#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
56#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
57
58#include <inttypes.h>
59#include <math.h>
60#include <stdio.h>
61#include <volk/volk_common.h>
62
63#ifdef LV_HAVE_SSE4_1
64#include <smmintrin.h>
65
66static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
67 const float* inputBuffer,
68 const float mean,
69 unsigned int num_points)
70{
71 float returnValue = 0;
72 if (num_points > 0) {
73 unsigned int number = 0;
74 const unsigned int sixteenthPoints = num_points / 16;
75
76 const float* aPtr = inputBuffer;
77
78 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
79
80 __m128 squareAccumulator = _mm_setzero_ps();
81 __m128 aVal1, aVal2, aVal3, aVal4;
82 __m128 cVal1, cVal2, cVal3, cVal4;
83 for (; number < sixteenthPoints; number++) {
84 aVal1 = _mm_load_ps(aPtr);
85 aPtr += 4;
86 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
87
88 aVal2 = _mm_load_ps(aPtr);
89 aPtr += 4;
90 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
91
92 aVal3 = _mm_load_ps(aPtr);
93 aPtr += 4;
94 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
95
96 aVal4 = _mm_load_ps(aPtr);
97 aPtr += 4;
98 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
99
100 cVal1 = _mm_or_ps(cVal1, cVal2);
101 cVal3 = _mm_or_ps(cVal3, cVal4);
102 cVal1 = _mm_or_ps(cVal1, cVal3);
103
104 squareAccumulator =
105 _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
106 }
107 _mm_store_ps(squareBuffer,
108 squareAccumulator); // Store the results back into the C container
109 returnValue = squareBuffer[0];
110 returnValue += squareBuffer[1];
111 returnValue += squareBuffer[2];
112 returnValue += squareBuffer[3];
113
114 number = sixteenthPoints * 16;
115 for (; number < num_points; number++) {
116 returnValue += (*aPtr) * (*aPtr);
117 aPtr++;
118 }
119 returnValue /= num_points;
120 returnValue -= (mean * mean);
121 returnValue = sqrtf(returnValue);
122 }
123 *stddev = returnValue;
124}
125
126#endif /* LV_HAVE_SSE4_1 */
127
128#ifdef LV_HAVE_SSE
129#include <xmmintrin.h>
130
131static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
132 const float* inputBuffer,
133 const float mean,
134 unsigned int num_points)
135{
136 float returnValue = 0;
137 if (num_points > 0) {
138 unsigned int number = 0;
139 const unsigned int quarterPoints = num_points / 4;
140
141 const float* aPtr = inputBuffer;
142
143 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
144
145 __m128 squareAccumulator = _mm_setzero_ps();
146 __m128 aVal = _mm_setzero_ps();
147 for (; number < quarterPoints; number++) {
148 aVal = _mm_load_ps(aPtr); // aVal = x
149 aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
150 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
151 aPtr += 4;
152 }
153 _mm_store_ps(squareBuffer,
154 squareAccumulator); // Store the results back into the C container
155 returnValue = squareBuffer[0];
156 returnValue += squareBuffer[1];
157 returnValue += squareBuffer[2];
158 returnValue += squareBuffer[3];
159
160 number = quarterPoints * 4;
161 for (; number < num_points; number++) {
162 returnValue += (*aPtr) * (*aPtr);
163 aPtr++;
164 }
165 returnValue /= num_points;
166 returnValue -= (mean * mean);
167 returnValue = sqrtf(returnValue);
168 }
169 *stddev = returnValue;
170}
171#endif /* LV_HAVE_SSE */
172
173
174#ifdef LV_HAVE_AVX
175#include <immintrin.h>
176
177static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
178 const float* inputBuffer,
179 const float mean,
180 unsigned int num_points)
181{
182 float stdDev = 0;
183 if (num_points > 0) {
184 unsigned int number = 0;
185 const unsigned int thirtySecondthPoints = num_points / 32;
186
187 const float* aPtr = inputBuffer;
188 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
189
190 __m256 squareAccumulator = _mm256_setzero_ps();
191 __m256 aVal1, aVal2, aVal3, aVal4;
192 __m256 cVal1, cVal2, cVal3, cVal4;
193 for (; number < thirtySecondthPoints; number++) {
194 aVal1 = _mm256_load_ps(aPtr);
195 aPtr += 8;
196 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
197
198 aVal2 = _mm256_load_ps(aPtr);
199 aPtr += 8;
200 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
201
202 aVal3 = _mm256_load_ps(aPtr);
203 aPtr += 8;
204 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
205
206 aVal4 = _mm256_load_ps(aPtr);
207 aPtr += 8;
208 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
209
210 cVal1 = _mm256_or_ps(cVal1, cVal2);
211 cVal3 = _mm256_or_ps(cVal3, cVal4);
212 cVal1 = _mm256_or_ps(cVal1, cVal3);
213
214 squareAccumulator =
215 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
216 }
217 _mm256_store_ps(squareBuffer,
218 squareAccumulator); // Store the results back into the C container
219 stdDev = squareBuffer[0];
220 stdDev += squareBuffer[1];
221 stdDev += squareBuffer[2];
222 stdDev += squareBuffer[3];
223 stdDev += squareBuffer[4];
224 stdDev += squareBuffer[5];
225 stdDev += squareBuffer[6];
226 stdDev += squareBuffer[7];
227
228 number = thirtySecondthPoints * 32;
229 for (; number < num_points; number++) {
230 stdDev += (*aPtr) * (*aPtr);
231 aPtr++;
232 }
233 stdDev /= num_points;
234 stdDev -= (mean * mean);
235 stdDev = sqrtf(stdDev);
236 }
237 *stddev = stdDev;
238}
239#endif /* LV_HAVE_AVX */
240
241
242#ifdef LV_HAVE_GENERIC
243
244static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
245 const float* inputBuffer,
246 const float mean,
247 unsigned int num_points)
248{
249 float returnValue = 0;
250 if (num_points > 0) {
251 const float* aPtr = inputBuffer;
252 unsigned int number = 0;
253
254 for (number = 0; number < num_points; number++) {
255 returnValue += (*aPtr) * (*aPtr);
256 aPtr++;
257 }
258
259 returnValue /= num_points;
260 returnValue -= (mean * mean);
261 returnValue = sqrtf(returnValue);
262 }
263 *stddev = returnValue;
264}
265
266#endif /* LV_HAVE_GENERIC */
267
268
269#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
270
271#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
272#define INCLUDED_volk_32f_s32f_stddev_32f_u_H
273
274#include <inttypes.h>
275#include <math.h>
276#include <stdio.h>
277#include <volk/volk_common.h>
278
279#ifdef LV_HAVE_AVX
280#include <immintrin.h>
281
282static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
283 const float* inputBuffer,
284 const float mean,
285 unsigned int num_points)
286{
287 float stdDev = 0;
288 if (num_points > 0) {
289 unsigned int number = 0;
290 const unsigned int thirtySecondthPoints = num_points / 32;
291
292 const float* aPtr = inputBuffer;
293 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
294
295 __m256 squareAccumulator = _mm256_setzero_ps();
296 __m256 aVal1, aVal2, aVal3, aVal4;
297 __m256 cVal1, cVal2, cVal3, cVal4;
298 for (; number < thirtySecondthPoints; number++) {
299 aVal1 = _mm256_loadu_ps(aPtr);
300 aPtr += 8;
301 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
302
303 aVal2 = _mm256_loadu_ps(aPtr);
304 aPtr += 8;
305 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
306
307 aVal3 = _mm256_loadu_ps(aPtr);
308 aPtr += 8;
309 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
310
311 aVal4 = _mm256_loadu_ps(aPtr);
312 aPtr += 8;
313 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
314
315 cVal1 = _mm256_or_ps(cVal1, cVal2);
316 cVal3 = _mm256_or_ps(cVal3, cVal4);
317 cVal1 = _mm256_or_ps(cVal1, cVal3);
318
319 squareAccumulator =
320 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
321 }
322 _mm256_storeu_ps(
323 squareBuffer,
324 squareAccumulator); // Store the results back into the C container
325 stdDev = squareBuffer[0];
326 stdDev += squareBuffer[1];
327 stdDev += squareBuffer[2];
328 stdDev += squareBuffer[3];
329 stdDev += squareBuffer[4];
330 stdDev += squareBuffer[5];
331 stdDev += squareBuffer[6];
332 stdDev += squareBuffer[7];
333
334 number = thirtySecondthPoints * 32;
335 for (; number < num_points; number++) {
336 stdDev += (*aPtr) * (*aPtr);
337 aPtr++;
338 }
339 stdDev /= num_points;
340 stdDev -= (mean * mean);
341 stdDev = sqrtf(stdDev);
342 }
343 *stddev = stdDev;
344}
345#endif /* LV_HAVE_AVX */
346
347#endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */