Vector Optimized Library of Kernels (VOLK) 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

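/*!
 * \page volk_32fc_x2_s32f_square_dist_scalar_mult_32f
 *
 * \b Overview
 *
 * Calculates the square distance between a single complex symbol and each
 * point of a complex vector, multiplying every distance by a scalar:
 *
 *   target[i] = scalar * |src0[0] - points[i]|^2
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target,
 *                                                    lv_32fc_t* src0,
 *                                                    lv_32fc_t* points,
 *                                                    float scalar,
 *                                                    unsigned int num_points);
 * \endcode
 *
 * \b Inputs
 * \li src0: The complex symbol; only its first element is read.
 * \li points: A vector of complex reference points.
 * \li scalar: The factor applied to every squared distance (e.g. the linear SNR).
 * \li num_points: The number of reference points.
 *
 * \b Outputs
 * \li target: num_points scaled squared distances.
 *
 * \b Example
 * A minimal sketch that slices a received sample against a QPSK
 * constellation (buffer names are illustrative):
 * \code
 *   unsigned int N = 4;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* points = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   float* target = (float*)volk_malloc(N * sizeof(float), alignment);
 *   lv_32fc_t symbol = lv_cmake(0.9f, 1.1f);
 *
 *   points[0] = lv_cmake(1.f, 1.f);
 *   points[1] = lv_cmake(-1.f, 1.f);
 *   points[2] = lv_cmake(-1.f, -1.f);
 *   points[3] = lv_cmake(1.f, -1.f);
 *
 *   volk_32fc_x2_s32f_square_dist_scalar_mult_32f(target, &symbol, points, 1.f, N);
 *   // target[0] == 0.02f is the smallest distance: the symbol slices to points[0].
 *
 *   volk_free(points);
 *   volk_free(target);
 * \endcode
 */
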
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_complex.h>

static inline void calculate_scaled_distances(float* target,
                                              const lv_32fc_t symbol,
                                              const lv_32fc_t* points,
                                              const float scalar,
                                              const unsigned int num_points)
{
    lv_32fc_t diff;
    for (unsigned int i = 0; i < num_points; ++i) {
        /*
         * Calculate: |y - x|^2 * SNR_lin
         * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
         */
        diff = symbol - *points++;
        *target++ =
            scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    }
}
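
/*
 * This scalar routine doubles as the generic protokernel and as the tail
 * handler for every SIMD variant below, which hand it their final sub-width
 * points. Worked instance: symbol = 0.9+1.1i, point = 1+1i, scalar = 2
 * gives 2 * ((-0.1)^2 + (0.1)^2) = 0.04.
 */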


#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h> /* provides _mm256_scaled_norm_dist_ps_avx2() */

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

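    // num_bytes counts the input in bytes (8 per lv_32fc_t): num_bytes >> 6
    // is the number of full 8-point chunks, and bits 5/4/3 of num_bytes flag
    // remainders of 4, 2, and 1 points, which the tail branches below test.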
    const unsigned int bound = num_bytes >> 6;

    // Broadcast the complex symbol: its two packed floats are reinterpreted
    // as one 64-bit value, so every 64-bit slot holds (re, im).
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load the scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set the permutation constant (used by the 4-point tail below)
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_load_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

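        // _mm256_hadd_ps sums adjacent pairs within each 128-bit lane, leaving
        // the four |diff|^2 values in elements {0,1,4,5} (duplicated in
        // {2,3,6,7}); the permute gathers all four into the upper lane, which
        // is extracted and stored below.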
        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_store_ps(target, xmm9);
        target += 4;
    }

    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_load_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h> /* provides _mm256_scaled_norm_dist_ps() */

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h> /* provides _mm_scaled_norm_dist_ps_sse3() */

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * Process 4 complex points per loop iteration.
     * Up to 3 points may remain afterwards:
     * leftovers0 indicates whether at least 2 are left for SSE execution,
     * leftovers1 indicates whether a single point is left.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_store_ps(target, xmm_result);
        target += 4;
    }

    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

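        // After the hadd the register holds (d0, d1, d0, d1); storing its
        // high half writes the two remaining distances.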
        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h> /* provides _mm_scaled_norm_dist_ps() */
#include <xmmintrin.h>
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_load_ps((float*)points);
        __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_store_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, num_points);
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H

#include <volk/volk_complex.h>

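/* The _u_ protokernels below mirror the aligned (_a_) versions above; they
 * differ only in using unaligned loads and stores. */
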
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    const unsigned int bound = num_bytes >> 6;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set permutation constant
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_loadu_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_storeu_ps(target, xmm9);
        target += 4;
    }

    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_loadu_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * Process 4 complex points per loop iteration.
     * Up to 3 points may remain afterwards:
     * leftovers0 indicates whether at least 2 are left for SSE execution,
     * leftovers1 indicates whether a single point is left.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_storeu_ps(target, xmm_result);
        target += 4;
    }

    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_loadu_ps((float*)points);
        __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_storeu_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/