Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_32u.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
52#ifndef INCLUDED_volk_32f_index_max_32u_a_H
53#define INCLUDED_volk_32f_index_max_32u_a_H
54
55#include <inttypes.h>
56#include <stdio.h>
57#include <volk/volk_common.h>
58
59#ifdef LV_HAVE_SSE4_1
60#include <smmintrin.h>
61
62static inline void
63volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
64{
65 if (num_points > 0) {
66 uint32_t number = 0;
67 const uint32_t quarterPoints = num_points / 4;
68
69 float* inputPtr = (float*)src0;
70
71 __m128 indexIncrementValues = _mm_set1_ps(4);
72 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
73
74 float max = src0[0];
75 float index = 0;
76 __m128 maxValues = _mm_set1_ps(max);
77 __m128 maxValuesIndex = _mm_setzero_ps();
78 __m128 compareResults;
79 __m128 currentValues;
80
81 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
82 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
83
84 for (; number < quarterPoints; number++) {
85
86 currentValues = _mm_load_ps(inputPtr);
87 inputPtr += 4;
88 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
89
90 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
91
92 maxValuesIndex =
93 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
94 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
95 }
96
97 // Calculate the largest value from the remaining 4 points
98 _mm_store_ps(maxValuesBuffer, maxValues);
99 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
100
101 for (number = 0; number < 4; number++) {
102 if (maxValuesBuffer[number] > max) {
103 index = maxIndexesBuffer[number];
104 max = maxValuesBuffer[number];
105 } else if (maxValuesBuffer[number] == max) {
106 if (index > maxIndexesBuffer[number])
107 index = maxIndexesBuffer[number];
108 }
109 }
110
111 number = quarterPoints * 4;
112 for (; number < num_points; number++) {
113 if (src0[number] > max) {
114 index = number;
115 max = src0[number];
116 }
117 }
118 target[0] = (uint32_t)index;
119 }
120}
121
122#endif /*LV_HAVE_SSE4_1*/
123
124
125#ifdef LV_HAVE_SSE
126
127#include <xmmintrin.h>
128
129static inline void
130volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
131{
132 if (num_points > 0) {
133 uint32_t number = 0;
134 const uint32_t quarterPoints = num_points / 4;
135
136 float* inputPtr = (float*)src0;
137
138 __m128 indexIncrementValues = _mm_set1_ps(4);
139 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
140
141 float max = src0[0];
142 float index = 0;
143 __m128 maxValues = _mm_set1_ps(max);
144 __m128 maxValuesIndex = _mm_setzero_ps();
145 __m128 compareResults;
146 __m128 currentValues;
147
148 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
149 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
150
151 for (; number < quarterPoints; number++) {
152
153 currentValues = _mm_load_ps(inputPtr);
154 inputPtr += 4;
155 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
156
157 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
158
159 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
160 _mm_andnot_ps(compareResults, maxValuesIndex));
161
162 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
163 _mm_andnot_ps(compareResults, maxValues));
164 }
165
166 // Calculate the largest value from the remaining 4 points
167 _mm_store_ps(maxValuesBuffer, maxValues);
168 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
169
170 for (number = 0; number < 4; number++) {
171 if (maxValuesBuffer[number] > max) {
172 index = maxIndexesBuffer[number];
173 max = maxValuesBuffer[number];
174 } else if (maxValuesBuffer[number] == max) {
175 if (index > maxIndexesBuffer[number])
176 index = maxIndexesBuffer[number];
177 }
178 }
179
180 number = quarterPoints * 4;
181 for (; number < num_points; number++) {
182 if (src0[number] > max) {
183 index = number;
184 max = src0[number];
185 }
186 }
187 target[0] = (uint32_t)index;
188 }
189}
190
191#endif /*LV_HAVE_SSE*/
192
193
194#ifdef LV_HAVE_AVX
195#include <immintrin.h>
196
197static inline void
198volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
199{
200 if (num_points > 0) {
201 uint32_t number = 0;
202 const uint32_t quarterPoints = num_points / 8;
203
204 float* inputPtr = (float*)src0;
205
206 __m256 indexIncrementValues = _mm256_set1_ps(8);
207 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
208
209 float max = src0[0];
210 float index = 0;
211 __m256 maxValues = _mm256_set1_ps(max);
212 __m256 maxValuesIndex = _mm256_setzero_ps();
213 __m256 compareResults;
214 __m256 currentValues;
215
216 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
217 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
218
219 for (; number < quarterPoints; number++) {
220 currentValues = _mm256_load_ps(inputPtr);
221 inputPtr += 8;
222 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
223 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
224 maxValuesIndex =
225 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
226 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
227 }
228
229 // Calculate the largest value from the remaining 8 points
230 _mm256_store_ps(maxValuesBuffer, maxValues);
231 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
232
233 for (number = 0; number < 8; number++) {
234 if (maxValuesBuffer[number] > max) {
235 index = maxIndexesBuffer[number];
236 max = maxValuesBuffer[number];
237 } else if (maxValuesBuffer[number] == max) {
238 if (index > maxIndexesBuffer[number])
239 index = maxIndexesBuffer[number];
240 }
241 }
242
243 number = quarterPoints * 8;
244 for (; number < num_points; number++) {
245 if (src0[number] > max) {
246 index = number;
247 max = src0[number];
248 }
249 }
250 target[0] = (uint32_t)index;
251 }
252}
253
254#endif /*LV_HAVE_AVX*/
255
256
257#ifdef LV_HAVE_NEON
258#include <arm_neon.h>
259
/*!
 * \brief Writes the index of the largest element of \p src0 to \p target[0]
 *        (NEON).
 *
 * \param target     Receives the index of the maximum. Left untouched when
 *                   \p num_points is 0.
 * \param src0       Input samples, read four at a time with vld1q_f32.
 * \param num_points Number of floats in \p src0.
 *
 * For ordinary (non-NaN) input, when the maximum occurs more than once the
 * lowest index is reported.
 */
static inline void
volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points > 0) {
        /* The running lane indexes are incremented as floats, and a float
         * represents integers exactly only below 2^24. The vector loop is
         * therefore limited to the first 2^24 points; everything past that is
         * handled by the scalar loop at the end, which uses an integer index. */
        const uint32_t maxVectorPoints = 1u << 24;
        const uint32_t vectorPoints =
            (num_points < maxVectorPoints) ? num_points : maxVectorPoints;
        const uint32_t quarterPoints = vectorPoints / 4;
        uint32_t number = 0;

        const float* inputPtr = src0;
        const float32x4_t indexIncrementValues = vdupq_n_f32(4);
        /* Starts one step behind so the first increment yields 0, 1, 2, 3. */
        const float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
        float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);

        float max = src0[0];
        uint32_t index = 0;
        float32x4_t maxValues = vdupq_n_f32(max);
        uint32x4_t maxValuesIndex = vmovq_n_u32(0);
        uint32x4_t compareResults;
        uint32x4_t currentIndexes_u;
        float32x4_t currentValues;

        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
        __VOLK_ATTR_ALIGNED(16) uint32_t maxIndexesBuffer[4];

        for (; number < quarterPoints; number++) {
            currentValues = vld1q_f32(inputPtr);
            inputPtr += 4;
            currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
            currentIndexes_u = vcvtq_u32_f32(currentIndexes);
            /* Mask is set where the stored maximum survives (current <= max);
             * those lanes keep their index, the others take the new one. */
            compareResults = vcleq_f32(currentValues, maxValues);
            maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
                                       vbicq_u32(currentIndexes_u, compareResults));
            maxValues = vmaxq_f32(currentValues, maxValues);
        }

        /* Reduce the four per-lane maxima to one value/index pair. The
         * indexes are stored as integers directly, with no float round trip. */
        vst1q_f32(maxValuesBuffer, maxValues);
        vst1q_u32(maxIndexesBuffer, maxValuesIndex);
        for (number = 0; number < 4; number++) {
            if (maxValuesBuffer[number] > max) {
                index = maxIndexesBuffer[number];
                max = maxValuesBuffer[number];
            } else if (maxValuesBuffer[number] == max) {
                if (index > maxIndexesBuffer[number]) {
                    index = maxIndexesBuffer[number];
                }
            }
        }

        /* Scalar pass over the points the vector loop did not cover. */
        for (number = quarterPoints * 4; number < num_points; number++) {
            if (src0[number] > max) {
                index = number;
                max = src0[number];
            }
        }
        target[0] = index;
    }
}
318
319#endif /*LV_HAVE_NEON*/
320
321
322#ifdef LV_HAVE_GENERIC
323
/*!
 * \brief Writes the index of the largest element of \p src0 to \p target[0]
 *        (portable scalar implementation).
 *
 * \param target     Receives the index of the maximum. Left untouched when
 *                   \p num_points is 0.
 * \param src0       Input samples.
 * \param num_points Number of floats in \p src0.
 *
 * The comparison is a strict greater-than, so when the maximum occurs more
 * than once the lowest index is reported.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points > 0) {
        float max = src0[0];
        uint32_t index = 0;
        uint32_t i = 1;

        /* Element 0 seeds the search, so scanning starts at 1. */
        for (; i < num_points; ++i) {
            if (src0[i] > max) {
                index = i;
                max = src0[i];
            }
        }
        target[0] = index;
    }
}
342
343#endif /*LV_HAVE_GENERIC*/
344
345
346#endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
347
348
349#ifndef INCLUDED_volk_32f_index_max_32u_u_H
350#define INCLUDED_volk_32f_index_max_32u_u_H
351
352#include <inttypes.h>
353#include <stdio.h>
354#include <volk/volk_common.h>
355
356
357#ifdef LV_HAVE_AVX
358#include <immintrin.h>
359
360static inline void
361volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
362{
363 if (num_points > 0) {
364 uint32_t number = 0;
365 const uint32_t quarterPoints = num_points / 8;
366
367 float* inputPtr = (float*)src0;
368
369 __m256 indexIncrementValues = _mm256_set1_ps(8);
370 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
371
372 float max = src0[0];
373 float index = 0;
374 __m256 maxValues = _mm256_set1_ps(max);
375 __m256 maxValuesIndex = _mm256_setzero_ps();
376 __m256 compareResults;
377 __m256 currentValues;
378
379 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
380 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
381
382 for (; number < quarterPoints; number++) {
383 currentValues = _mm256_loadu_ps(inputPtr);
384 inputPtr += 8;
385 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
386 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
387 maxValuesIndex =
388 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
389 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
390 }
391
392 // Calculate the largest value from the remaining 8 points
393 _mm256_store_ps(maxValuesBuffer, maxValues);
394 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
395
396 for (number = 0; number < 8; number++) {
397 if (maxValuesBuffer[number] > max) {
398 index = maxIndexesBuffer[number];
399 max = maxValuesBuffer[number];
400 } else if (maxValuesBuffer[number] == max) {
401 if (index > maxIndexesBuffer[number])
402 index = maxIndexesBuffer[number];
403 }
404 }
405
406 number = quarterPoints * 8;
407 for (; number < num_points; number++) {
408 if (src0[number] > max) {
409 index = number;
410 max = src0[number];
411 }
412 }
413 target[0] = (uint32_t)index;
414 }
415}
416
417#endif /*LV_HAVE_AVX*/
418
419
420#ifdef LV_HAVE_SSE4_1
421#include <smmintrin.h>
422
423static inline void
424volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
425{
426 if (num_points > 0) {
427 uint32_t number = 0;
428 const uint32_t quarterPoints = num_points / 4;
429
430 float* inputPtr = (float*)src0;
431
432 __m128 indexIncrementValues = _mm_set1_ps(4);
433 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
434
435 float max = src0[0];
436 float index = 0;
437 __m128 maxValues = _mm_set1_ps(max);
438 __m128 maxValuesIndex = _mm_setzero_ps();
439 __m128 compareResults;
440 __m128 currentValues;
441
442 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
443 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
444
445 for (; number < quarterPoints; number++) {
446 currentValues = _mm_loadu_ps(inputPtr);
447 inputPtr += 4;
448 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
449 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
450 maxValuesIndex =
451 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
452 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
453 }
454
455 // Calculate the largest value from the remaining 4 points
456 _mm_store_ps(maxValuesBuffer, maxValues);
457 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
458
459 for (number = 0; number < 4; number++) {
460 if (maxValuesBuffer[number] > max) {
461 index = maxIndexesBuffer[number];
462 max = maxValuesBuffer[number];
463 } else if (maxValuesBuffer[number] == max) {
464 if (index > maxIndexesBuffer[number])
465 index = maxIndexesBuffer[number];
466 }
467 }
468
469 number = quarterPoints * 4;
470 for (; number < num_points; number++) {
471 if (src0[number] > max) {
472 index = number;
473 max = src0[number];
474 }
475 }
476 target[0] = (uint32_t)index;
477 }
478}
479
480#endif /*LV_HAVE_SSE4_1*/
481
482#ifdef LV_HAVE_SSE
483#include <xmmintrin.h>
484
485static inline void
486volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
487{
488 if (num_points > 0) {
489 uint32_t number = 0;
490 const uint32_t quarterPoints = num_points / 4;
491
492 float* inputPtr = (float*)src0;
493
494 __m128 indexIncrementValues = _mm_set1_ps(4);
495 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
496
497 float max = src0[0];
498 float index = 0;
499 __m128 maxValues = _mm_set1_ps(max);
500 __m128 maxValuesIndex = _mm_setzero_ps();
501 __m128 compareResults;
502 __m128 currentValues;
503
504 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
505 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
506
507 for (; number < quarterPoints; number++) {
508 currentValues = _mm_loadu_ps(inputPtr);
509 inputPtr += 4;
510 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
511 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
512 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
513 _mm_andnot_ps(compareResults, maxValuesIndex));
514 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
515 _mm_andnot_ps(compareResults, maxValues));
516 }
517
518 // Calculate the largest value from the remaining 4 points
519 _mm_store_ps(maxValuesBuffer, maxValues);
520 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
521
522 for (number = 0; number < 4; number++) {
523 if (maxValuesBuffer[number] > max) {
524 index = maxIndexesBuffer[number];
525 max = maxValuesBuffer[number];
526 } else if (maxValuesBuffer[number] == max) {
527 if (index > maxIndexesBuffer[number])
528 index = maxIndexesBuffer[number];
529 }
530 }
531
532 number = quarterPoints * 4;
533 for (; number < num_points; number++) {
534 if (src0[number] > max) {
535 index = number;
536 max = src0[number];
537 }
538 }
539 target[0] = (uint32_t)index;
540 }
541}
542
543#endif /*LV_HAVE_SSE*/
544
545#endif /*INCLUDED_volk_32f_index_max_32u_u_H*/