Vector Optimized Library of Kernels (VOLK) 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_16u.h — kernels that return the 0-based index of the
complex float sample with the largest magnitude as a 16-bit unsigned integer.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
63#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
64#define INCLUDED_volk_32fc_index_max_16u_a_H
65
66#include <inttypes.h>
67#include <limits.h>
68#include <stdio.h>
69#include <volk/volk_common.h>
70#include <volk/volk_complex.h>
71
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the largest magnitude
 * (aligned AVX2 implementation, variant 0).
 *
 * Magnitudes are compared via |z|^2 (monotonic in |z|, no sqrt needed).
 *
 * \param target     Output: 0-based index of the maximum-magnitude sample.
 * \param src0       Input vector of complex floats; must be 32-byte aligned.
 * \param num_points Number of samples to search; clamped to USHRT_MAX so the
 *                   winning index fits into the 16-bit output.
 */
static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    // Output is 16 bit, so only the first USHRT_MAX points can be reported.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (2x 256-bit loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
131
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the largest magnitude
 * (aligned AVX2 implementation, variant 1).
 *
 * Same contract as variant 0; differs only in the helper used for the
 * vectorized max/index update (vector_32fc_index_max_variant1).
 *
 * \param target     Output: 0-based index of the maximum-magnitude sample.
 * \param src0       Input vector of complex floats; must be 32-byte aligned.
 * \param num_points Number of samples to search; clamped to USHRT_MAX so the
 *                   winning index fits into the 16-bit output.
 */
static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    // Output is 16 bit, so only the first USHRT_MAX points can be reported.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (2x 256-bit loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
191
192#ifdef LV_HAVE_SSE3
193#include <pmmintrin.h>
194#include <xmmintrin.h>
195
/*!
 * \brief Finds the index of the complex sample with the largest magnitude
 * (aligned SSE3 implementation).
 *
 * Magnitudes are compared via |z|^2 = re^2 + im^2 (monotonic in |z|, so no
 * sqrt is needed). The SIMD loop tracks four running maxima with their
 * indices in parallel; the four lanes are reduced to one result at the end.
 *
 * \param target     Output: 0-based index of the maximum-magnitude sample.
 * \param src0       Input vector of complex floats; must be 16-byte aligned.
 * \param num_points Number of samples to search; clamped to USHRT_MAX so the
 *                   winning index fits into the 16-bit output.
 */
static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    // Output is 16 bit, so only the first USHRT_MAX points can be reported.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8; // 8 bytes per complex float

    union bit128 holderf; // scratch: final per-lane max values (float view)
    union bit128 holderi; // scratch: final per-lane max indices (int view)
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4; // compare masks: float view for cmp, int view for and
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; // complete groups of 4 complex samples
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  // indices of the samples loaded this iteration
    xmm9 = _mm_setzero_si128();         // per-lane indices of the maxima so far
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // index increment per loop iteration
    xmm3 = _mm_setzero_ps();            // per-lane maxima (squared magnitudes)

    // Main loop: 4 complex samples per iteration, 4 independent running maxima.
    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // Horizontal add packs the four squared magnitudes re^2+im^2 into xmm1.
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        // After the max: cmplt marks lanes where the new value lost (keep old
        // index xmm9); cmpeq marks lanes where the new value IS the current
        // max (take new index xmm8). On an exact tie the new index wins.
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        // Masks are disjoint, so the add acts as a per-lane select.
        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // Tail 1: one remaining pair of complex samples (16 bytes).
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        // Duplicate the low two indices into the high lanes so all four lanes
        // compare against valid indices for this 2-sample step.
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2); // only 2 samples consumed here

        // Same select scheme as in the main loop.
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // Tail 2: one final complex sample (8 bytes), squared magnitude in scalar.
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3; // keep the previous maxima for the comparisons below

        // max_ss only updates lane 0, so only that lane's mask can change.
        xmm3 = _mm_max_ss(xmm3, xmm2);

        // Mask roles are swapped w.r.t. the main loop because xmm1 holds the
        // OLD maxima here: cmplt marks lanes where the new sample won (take
        // its index), cmpeq marks lanes whose old maximum survived.
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00); // broadcast this sample's index

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    // Reduce the four lanes: store values and indices, then scan in scalar.
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}
307
308#endif /*LV_HAVE_SSE3*/
309
310#ifdef LV_HAVE_GENERIC
311static inline void
312volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
313{
314 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
315
316 const uint32_t num_bytes = num_points * 8;
317
318 float sq_dist = 0.0;
319 float max = 0.0;
320 uint16_t index = 0;
321
322 uint32_t i = 0;
323
324 for (; i<num_bytes>> 3; ++i) {
325 sq_dist =
326 lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
327
328 if (sq_dist > max) {
329 index = i;
330 max = sq_dist;
331 }
332 }
333 target[0] = index;
334}
335
336#endif /*LV_HAVE_GENERIC*/
337
338#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
339
340#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
341#define INCLUDED_volk_32fc_index_max_16u_u_H
342
343#include <inttypes.h>
344#include <limits.h>
345#include <stdio.h>
346#include <volk/volk_common.h>
347#include <volk/volk_complex.h>
348
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the largest magnitude
 * (unaligned AVX2 implementation, variant 0).
 *
 * Identical to the aligned variant except that unaligned loads are used, so
 * src0 may have any alignment.
 *
 * \param target     Output: 0-based index of the maximum-magnitude sample.
 * \param src0       Input vector of complex floats; no alignment required.
 * \param num_points Number of samples to search; clamped to USHRT_MAX so the
 *                   winning index fits into the 16-bit output.
 */
static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    // Output is 16 bit, so only the first USHRT_MAX points can be reported.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (2x 256-bit unaligned loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
408
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the largest magnitude
 * (unaligned AVX2 implementation, variant 1).
 *
 * Same contract as unaligned variant 0; differs only in the helper used for
 * the vectorized max/index update (vector_32fc_index_max_variant1).
 *
 * \param target     Output: 0-based index of the maximum-magnitude sample.
 * \param src0       Input vector of complex floats; no alignment required.
 * \param num_points Number of samples to search; clamped to USHRT_MAX so the
 *                   winning index fits into the 16-bit output.
 */
static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    // Output is 16 bit, so only the first USHRT_MAX points can be reported.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (2x 256-bit unaligned loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
468
469#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/