Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_stddev_and_mean_32f_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
60#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
61#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
62
63#include <inttypes.h>
64#include <math.h>
65#include <volk/volk_common.h>
66
67// Youngs and Cramer's Algorithm for calculating std and mean
68// Using the methods discussed here:
69// https://doi.org/10.1145/3221269.3223036
70#ifdef LV_HAVE_GENERIC
71
72static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
73 float* mean,
74 const float* inputBuffer,
75 unsigned int num_points)
76{
77 const float* in_ptr = inputBuffer;
78 if (num_points == 0) {
79 return;
80 } else if (num_points == 1) {
81 *stddev = 0.f;
82 *mean = (*in_ptr);
83 return;
84 }
85
86 float Sum[2];
87 float SquareSum[2] = { 0.f, 0.f };
88 Sum[0] = (*in_ptr++);
89 Sum[1] = (*in_ptr++);
90
91 uint32_t half_points = num_points / 2;
92
93 for (uint32_t number = 1; number < half_points; number++) {
94 float Val0 = (*in_ptr++);
95 float Val1 = (*in_ptr++);
96 float n = (float)number;
97 float n_plus_one = n + 1.f;
98 float r = 1.f / (n * n_plus_one);
99
100 Sum[0] += Val0;
101 Sum[1] += Val1;
102
103 SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
104 SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
105 }
106
107 SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
108 Sum[0] += Sum[1];
109
110 uint32_t points_done = half_points * 2;
111
112 for (; points_done < num_points; points_done++) {
113 float Val = (*in_ptr++);
114 float n = (float)points_done;
115 float n_plus_one = n + 1.f;
116 float r = 1.f / (n * n_plus_one);
117 Sum[0] += Val;
118 SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
119 }
120 *stddev = sqrtf(SquareSum[0] / num_points);
121 *mean = Sum[0] / num_points;
122}
123#endif /* LV_HAVE_GENERIC */
124
125static inline float update_square_sum_1_val(const float SquareSum,
126 const float Sum,
127 const uint32_t len,
128 const float val)
129{
130 // Updates a sum of squares calculated over len values with the value val
131 float n = (float)len;
132 float n_plus_one = n + 1.f;
133 return SquareSum +
134 1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
135}
136
137static inline float add_square_sums(const float SquareSum0,
138 const float Sum0,
139 const float SquareSum1,
140 const float Sum1,
141 const uint32_t len)
142{
143 // Add two sums of squares calculated over the same number of values, len
144 float n = (float)len;
145 return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
146}
147
148static inline void accrue_result(float* PartialSquareSums,
149 float* PartialSums,
150 const uint32_t NumberOfPartitions,
151 const uint32_t PartitionLen)
152{
153 // Add all partial sums and square sums into the first element of the arrays
154 uint32_t accumulators = NumberOfPartitions;
155 uint32_t stages = 0;
156 uint32_t offset = 1;
157 uint32_t partition_len = PartitionLen;
158
159 while (accumulators >>= 1) {
160 stages++;
161 } // Integer log2
162 accumulators = NumberOfPartitions;
163
164 for (uint32_t s = 0; s < stages; s++) {
165 accumulators /= 2;
166 uint32_t idx = 0;
167 for (uint32_t a = 0; a < accumulators; a++) {
168 PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
169 PartialSums[idx],
170 PartialSquareSums[idx + offset],
171 PartialSums[idx + offset],
172 partition_len);
173 PartialSums[idx] += PartialSums[idx + offset];
174 idx += 2 * offset;
175 }
176 offset *= 2;
177 partition_len *= 2;
178 }
179}
180
181#ifdef LV_HAVE_NEON
182#include <arm_neon.h>
184
185static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
186 float* mean,
187 const float* inputBuffer,
188 unsigned int num_points)
189{
190 if (num_points < 8) {
191 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
192 return;
193 }
194
195 const float* in_ptr = inputBuffer;
196
197 __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
198 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
199
200 const uint32_t eigth_points = num_points / 8;
201
202 float32x4_t Sum0, Sum1;
203
204 Sum0 = vld1q_f32((const float32_t*)in_ptr);
205 in_ptr += 4;
206 __VOLK_PREFETCH(in_ptr + 4);
207
208 Sum1 = vld1q_f32((const float32_t*)in_ptr);
209 in_ptr += 4;
210 __VOLK_PREFETCH(in_ptr + 4);
211
212 float32x4_t SquareSum0 = { 0.f };
213 float32x4_t SquareSum1 = { 0.f };
214
215 float32x4_t Values0, Values1;
216 float32x4_t Aux0, Aux1;
217 float32x4_t Reciprocal;
218
219 for (uint32_t number = 1; number < eigth_points; number++) {
220 Values0 = vld1q_f32(in_ptr);
221 in_ptr += 4;
222 __VOLK_PREFETCH(in_ptr + 4);
223
224 Values1 = vld1q_f32(in_ptr);
225 in_ptr += 4;
226 __VOLK_PREFETCH(in_ptr + 4);
227
228 float n = (float)number;
229 float n_plus_one = n + 1.f;
230 Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
231
232 Sum0 = vaddq_f32(Sum0, Values0);
233 Aux0 = vdupq_n_f32(n_plus_one);
234 SquareSum0 =
235 _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
236
237 Sum1 = vaddq_f32(Sum1, Values1);
238 Aux1 = vdupq_n_f32(n_plus_one);
239 SquareSum1 =
240 _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
241 }
242
243 vst1q_f32(&SumLocal[0], Sum0);
244 vst1q_f32(&SumLocal[4], Sum1);
245 vst1q_f32(&SquareSumLocal[0], SquareSum0);
246 vst1q_f32(&SquareSumLocal[4], SquareSum1);
247
248 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
249
250 uint32_t points_done = eigth_points * 8;
251
252 for (; points_done < num_points; points_done++) {
253 float val = (*in_ptr++);
254 SumLocal[0] += val;
255 SquareSumLocal[0] =
256 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
257 }
258
259 *stddev = sqrtf(SquareSumLocal[0] / num_points);
260 *mean = SumLocal[0] / num_points;
261}
262#endif /* LV_HAVE_NEON */
263
264#ifdef LV_HAVE_SSE
266#include <xmmintrin.h>
267
268static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
269 float* mean,
270 const float* inputBuffer,
271 unsigned int num_points)
272{
273 if (num_points < 8) {
274 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
275 return;
276 }
277
278 const float* in_ptr = inputBuffer;
279
280 __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
281 __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
282
283
284 const uint32_t eigth_points = num_points / 8;
285
286 __m128 Sum0 = _mm_loadu_ps(in_ptr);
287 in_ptr += 4;
288 __m128 Sum1 = _mm_loadu_ps(in_ptr);
289 in_ptr += 4;
290 __m128 SquareSum0 = _mm_setzero_ps();
291 __m128 SquareSum1 = _mm_setzero_ps();
292 __m128 Values0, Values1;
293 __m128 Aux0, Aux1;
294 __m128 Reciprocal;
295
296 for (uint32_t number = 1; number < eigth_points; number++) {
297 Values0 = _mm_loadu_ps(in_ptr);
298 in_ptr += 4;
299 __VOLK_PREFETCH(in_ptr + 4);
300
301 Values1 = _mm_loadu_ps(in_ptr);
302 in_ptr += 4;
303 __VOLK_PREFETCH(in_ptr + 4);
304
305 float n = (float)number;
306 float n_plus_one = n + 1.f;
307 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
308
309 Sum0 = _mm_add_ps(Sum0, Values0);
310 Aux0 = _mm_set_ps1(n_plus_one);
311 SquareSum0 =
312 _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
313
314 Sum1 = _mm_add_ps(Sum1, Values1);
315 Aux1 = _mm_set_ps1(n_plus_one);
316 SquareSum1 =
317 _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
318 }
319
320 _mm_store_ps(&SumLocal[0], Sum0);
321 _mm_store_ps(&SumLocal[4], Sum1);
322 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
323 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
324
325 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
326
327 uint32_t points_done = eigth_points * 8;
328
329 for (; points_done < num_points; points_done++) {
330 float val = (*in_ptr++);
331 SumLocal[0] += val;
332 SquareSumLocal[0] =
333 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
334 }
335
336 *stddev = sqrtf(SquareSumLocal[0] / num_points);
337 *mean = SumLocal[0] / num_points;
338}
339#endif /* LV_HAVE_SSE */
340
341#ifdef LV_HAVE_AVX
342#include <immintrin.h>
344
345static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
346 float* mean,
347 const float* inputBuffer,
348 unsigned int num_points)
349{
350 if (num_points < 16) {
351 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
352 return;
353 }
354
355 const float* in_ptr = inputBuffer;
356
357 __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
358 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
359
360 const unsigned int sixteenth_points = num_points / 16;
361
362 __m256 Sum0 = _mm256_loadu_ps(in_ptr);
363 in_ptr += 8;
364 __m256 Sum1 = _mm256_loadu_ps(in_ptr);
365 in_ptr += 8;
366
367 __m256 SquareSum0 = _mm256_setzero_ps();
368 __m256 SquareSum1 = _mm256_setzero_ps();
369 __m256 Values0, Values1;
370 __m256 Aux0, Aux1;
371 __m256 Reciprocal;
372
373 for (uint32_t number = 1; number < sixteenth_points; number++) {
374 Values0 = _mm256_loadu_ps(in_ptr);
375 in_ptr += 8;
376 __VOLK_PREFETCH(in_ptr + 8);
377
378 Values1 = _mm256_loadu_ps(in_ptr);
379 in_ptr += 8;
380 __VOLK_PREFETCH(in_ptr + 8);
381
382 float n = (float)number;
383 float n_plus_one = n + 1.f;
384
385 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
386
387 Sum0 = _mm256_add_ps(Sum0, Values0);
388 Aux0 = _mm256_set1_ps(n_plus_one);
389 SquareSum0 =
390 _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
391
392 Sum1 = _mm256_add_ps(Sum1, Values1);
393 Aux1 = _mm256_set1_ps(n_plus_one);
394 SquareSum1 =
395 _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
396 }
397
398 _mm256_store_ps(&SumLocal[0], Sum0);
399 _mm256_store_ps(&SumLocal[8], Sum1);
400 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
401 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
402
403 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
404
405 uint32_t points_done = sixteenth_points * 16;
406
407 for (; points_done < num_points; points_done++) {
408 float val = (*in_ptr++);
409 SumLocal[0] += val;
410 SquareSumLocal[0] =
411 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
412 }
413
414 *stddev = sqrtf(SquareSumLocal[0] / num_points);
415 *mean = SumLocal[0] / num_points;
416}
417#endif /* LV_HAVE_AVX */
418
419#ifdef LV_HAVE_SSE
420#include <xmmintrin.h>
421
422static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
423 float* mean,
424 const float* inputBuffer,
425 unsigned int num_points)
426{
427 if (num_points < 8) {
428 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
429 return;
430 }
431
432 const float* in_ptr = inputBuffer;
433
434 __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
435 __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
436
437
438 const uint32_t eigth_points = num_points / 8;
439
440 __m128 Sum0 = _mm_load_ps(in_ptr);
441 in_ptr += 4;
442 __m128 Sum1 = _mm_load_ps(in_ptr);
443 in_ptr += 4;
444 __m128 SquareSum0 = _mm_setzero_ps();
445 __m128 SquareSum1 = _mm_setzero_ps();
446 __m128 Values0, Values1;
447 __m128 Aux0, Aux1;
448 __m128 Reciprocal;
449
450 for (uint32_t number = 1; number < eigth_points; number++) {
451 Values0 = _mm_load_ps(in_ptr);
452 in_ptr += 4;
453 __VOLK_PREFETCH(in_ptr + 4);
454
455 Values1 = _mm_load_ps(in_ptr);
456 in_ptr += 4;
457 __VOLK_PREFETCH(in_ptr + 4);
458
459 float n = (float)number;
460 float n_plus_one = n + 1.f;
461 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
462
463 Sum0 = _mm_add_ps(Sum0, Values0);
464 Aux0 = _mm_set_ps1(n_plus_one);
465 SquareSum0 =
466 _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
467
468 Sum1 = _mm_add_ps(Sum1, Values1);
469 Aux1 = _mm_set_ps1(n_plus_one);
470 SquareSum1 =
471 _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
472 }
473
474 _mm_store_ps(&SumLocal[0], Sum0);
475 _mm_store_ps(&SumLocal[4], Sum1);
476 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
477 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
478
479 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
480
481 uint32_t points_done = eigth_points * 8;
482
483 for (; points_done < num_points; points_done++) {
484 float val = (*in_ptr++);
485 SumLocal[0] += val;
486 SquareSumLocal[0] =
487 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
488 }
489
490 *stddev = sqrtf(SquareSumLocal[0] / num_points);
491 *mean = SumLocal[0] / num_points;
492}
493#endif /* LV_HAVE_SSE */
494
495#ifdef LV_HAVE_AVX
496#include <immintrin.h>
497
498static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
499 float* mean,
500 const float* inputBuffer,
501 unsigned int num_points)
502{
503 if (num_points < 16) {
504 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
505 return;
506 }
507
508 const float* in_ptr = inputBuffer;
509
510 __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
511 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
512
513 const unsigned int sixteenth_points = num_points / 16;
514
515 __m256 Sum0 = _mm256_load_ps(in_ptr);
516 in_ptr += 8;
517 __m256 Sum1 = _mm256_load_ps(in_ptr);
518 in_ptr += 8;
519
520 __m256 SquareSum0 = _mm256_setzero_ps();
521 __m256 SquareSum1 = _mm256_setzero_ps();
522 __m256 Values0, Values1;
523 __m256 Aux0, Aux1;
524 __m256 Reciprocal;
525
526 for (uint32_t number = 1; number < sixteenth_points; number++) {
527 Values0 = _mm256_load_ps(in_ptr);
528 in_ptr += 8;
529 __VOLK_PREFETCH(in_ptr + 8);
530
531 Values1 = _mm256_load_ps(in_ptr);
532 in_ptr += 8;
533 __VOLK_PREFETCH(in_ptr + 8);
534
535 float n = (float)number;
536 float n_plus_one = n + 1.f;
537
538 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
539
540 Sum0 = _mm256_add_ps(Sum0, Values0);
541 Aux0 = _mm256_set1_ps(n_plus_one);
542 SquareSum0 =
543 _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
544
545 Sum1 = _mm256_add_ps(Sum1, Values1);
546 Aux1 = _mm256_set1_ps(n_plus_one);
547 SquareSum1 =
548 _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
549 }
550
551 _mm256_store_ps(&SumLocal[0], Sum0);
552 _mm256_store_ps(&SumLocal[8], Sum1);
553 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
554 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
555
556 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
557
558 uint32_t points_done = sixteenth_points * 16;
559
560 for (; points_done < num_points; points_done++) {
561 float val = (*in_ptr++);
562 SumLocal[0] += val;
563 SquareSumLocal[0] =
564 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
565 }
566
567 *stddev = sqrtf(SquareSumLocal[0] / num_points);
568 *mean = SumLocal[0] / num_points;
569}
570#endif /* LV_HAVE_AVX */
571
572#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */