Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_16u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58#ifndef INCLUDED_volk_32f_index_max_16u_a_H
59#define INCLUDED_volk_32f_index_max_16u_a_H
60
61#include <inttypes.h>
62#include <limits.h>
63#include <stdio.h>
64#include <volk/volk_common.h>
65
66#ifdef LV_HAVE_AVX
67#include <immintrin.h>
68
69static inline void
70volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
71{
72 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
73
74 uint32_t number = 0;
75 const uint32_t eighthPoints = num_points / 8;
76
77 float* inputPtr = (float*)src0;
78
79 __m256 indexIncrementValues = _mm256_set1_ps(8);
80 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
81
82 float max = src0[0];
83 float index = 0;
84 __m256 maxValues = _mm256_set1_ps(max);
85 __m256 maxValuesIndex = _mm256_setzero_ps();
86 __m256 compareResults;
87 __m256 currentValues;
88
89 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
90 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
91
92 for (; number < eighthPoints; number++) {
93
94 currentValues = _mm256_load_ps(inputPtr);
95 inputPtr += 8;
96 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
97
98 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
99
100 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
101 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
102 }
103
104 // Calculate the largest value from the remaining 4 points
105 _mm256_store_ps(maxValuesBuffer, maxValues);
106 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
107
108 for (number = 0; number < 8; number++) {
109 if (maxValuesBuffer[number] > max) {
110 index = maxIndexesBuffer[number];
111 max = maxValuesBuffer[number];
112 } else if (maxValuesBuffer[number] == max) {
113 if (index > maxIndexesBuffer[number])
114 index = maxIndexesBuffer[number];
115 }
116 }
117
118 number = eighthPoints * 8;
119 for (; number < num_points; number++) {
120 if (src0[number] > max) {
121 index = number;
122 max = src0[number];
123 }
124 }
125 target[0] = (uint16_t)index;
126}
127
128#endif /*LV_HAVE_AVX*/
129
130#ifdef LV_HAVE_SSE4_1
131#include <smmintrin.h>
132
133static inline void
134volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
135{
136 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
137
138 uint32_t number = 0;
139 const uint32_t quarterPoints = num_points / 4;
140
141 float* inputPtr = (float*)src0;
142
143 __m128 indexIncrementValues = _mm_set1_ps(4);
144 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
145
146 float max = src0[0];
147 float index = 0;
148 __m128 maxValues = _mm_set1_ps(max);
149 __m128 maxValuesIndex = _mm_setzero_ps();
150 __m128 compareResults;
151 __m128 currentValues;
152
153 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
154 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
155
156 for (; number < quarterPoints; number++) {
157
158 currentValues = _mm_load_ps(inputPtr);
159 inputPtr += 4;
160 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
161
162 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
163
164 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
165 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
166 }
167
168 // Calculate the largest value from the remaining 4 points
169 _mm_store_ps(maxValuesBuffer, maxValues);
170 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
171
172 for (number = 0; number < 4; number++) {
173 if (maxValuesBuffer[number] > max) {
174 index = maxIndexesBuffer[number];
175 max = maxValuesBuffer[number];
176 } else if (maxValuesBuffer[number] == max) {
177 if (index > maxIndexesBuffer[number])
178 index = maxIndexesBuffer[number];
179 }
180 }
181
182 number = quarterPoints * 4;
183 for (; number < num_points; number++) {
184 if (src0[number] > max) {
185 index = number;
186 max = src0[number];
187 }
188 }
189 target[0] = (uint16_t)index;
190}
191
192#endif /*LV_HAVE_SSE4_1*/
193
194
195#ifdef LV_HAVE_SSE
196
197#include <xmmintrin.h>
198
199static inline void
200volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
201{
202 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
203
204 uint32_t number = 0;
205 const uint32_t quarterPoints = num_points / 4;
206
207 float* inputPtr = (float*)src0;
208
209 __m128 indexIncrementValues = _mm_set1_ps(4);
210 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
211
212 float max = src0[0];
213 float index = 0;
214 __m128 maxValues = _mm_set1_ps(max);
215 __m128 maxValuesIndex = _mm_setzero_ps();
216 __m128 compareResults;
217 __m128 currentValues;
218
219 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
220 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
221
222 for (; number < quarterPoints; number++) {
223
224 currentValues = _mm_load_ps(inputPtr);
225 inputPtr += 4;
226 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
227
228 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
229
230 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
231 _mm_andnot_ps(compareResults, maxValuesIndex));
232 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
233 _mm_andnot_ps(compareResults, maxValues));
234 }
235
236 // Calculate the largest value from the remaining 4 points
237 _mm_store_ps(maxValuesBuffer, maxValues);
238 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
239
240 for (number = 0; number < 4; number++) {
241 if (maxValuesBuffer[number] > max) {
242 index = maxIndexesBuffer[number];
243 max = maxValuesBuffer[number];
244 } else if (maxValuesBuffer[number] == max) {
245 if (index > maxIndexesBuffer[number])
246 index = maxIndexesBuffer[number];
247 }
248 }
249
250 number = quarterPoints * 4;
251 for (; number < num_points; number++) {
252 if (src0[number] > max) {
253 index = number;
254 max = src0[number];
255 }
256 }
257 target[0] = (uint16_t)index;
258}
259
260#endif /*LV_HAVE_SSE*/
261
262
263#ifdef LV_HAVE_GENERIC
264
/*!
 * \brief Finds the index of the largest value in a float vector (portable C).
 *
 * At most the first USHRT_MAX points are examined. When several points share
 * the maximum value, the lowest of their indices is reported. For an empty
 * input (num_points == 0) src0 is not read and index 0 is reported.
 *
 * \param target     Output; target[0] receives the index of the maximum.
 * \param src0       Input vector of floats.
 * \param num_points Number of floats available in src0.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    if (num_points == 0) {
        /* Nothing to scan: src0[0] does not exist, so do not dereference src0. */
        target[0] = 0;
        return;
    }

    float max = src0[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        /* Strictly greater, so the first occurrence of the maximum wins. */
        if (src0[i] > max) {
            /* i < USHRT_MAX after the clamp above, so the narrowing is lossless. */
            index = (uint16_t)i;
            max = src0[i];
        }
    }
    target[0] = index;
}
283
284#endif /*LV_HAVE_GENERIC*/
285
286
287#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
288
289
290#ifndef INCLUDED_volk_32f_index_max_16u_u_H
291#define INCLUDED_volk_32f_index_max_16u_u_H
292
293#include <inttypes.h>
294#include <limits.h>
295#include <stdio.h>
296#include <volk/volk_common.h>
297
298#ifdef LV_HAVE_AVX
299#include <immintrin.h>
300
301static inline void
302volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
303{
304 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
305
306 uint32_t number = 0;
307 const uint32_t eighthPoints = num_points / 8;
308
309 float* inputPtr = (float*)src0;
310
311 __m256 indexIncrementValues = _mm256_set1_ps(8);
312 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
313
314 float max = src0[0];
315 float index = 0;
316 __m256 maxValues = _mm256_set1_ps(max);
317 __m256 maxValuesIndex = _mm256_setzero_ps();
318 __m256 compareResults;
319 __m256 currentValues;
320
321 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
322 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
323
324 for (; number < eighthPoints; number++) {
325
326 currentValues = _mm256_loadu_ps(inputPtr);
327 inputPtr += 8;
328 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
329
330 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
331
332 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
333 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
334 }
335
336 // Calculate the largest value from the remaining 4 points
337 _mm256_storeu_ps(maxValuesBuffer, maxValues);
338 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
339
340 for (number = 0; number < 8; number++) {
341 if (maxValuesBuffer[number] > max) {
342 index = maxIndexesBuffer[number];
343 max = maxValuesBuffer[number];
344 } else if (maxValuesBuffer[number] == max) {
345 if (index > maxIndexesBuffer[number])
346 index = maxIndexesBuffer[number];
347 }
348 }
349
350 number = eighthPoints * 8;
351 for (; number < num_points; number++) {
352 if (src0[number] > max) {
353 index = number;
354 max = src0[number];
355 }
356 }
357 target[0] = (uint16_t)index;
358}
359
360#endif /*LV_HAVE_AVX*/
361
362#endif /*INCLUDED_volk_32f_index_max_16u_u_H*/