Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_s32f_mod_range_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2017 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
32#ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
33#define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
34
35#ifdef LV_HAVE_GENERIC
36
37static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
38 const float* inputVector,
39 const float lower_bound,
40 const float upper_bound,
41 unsigned int num_points)
42{
43 float* outPtr = outputVector;
44 const float* inPtr;
45 const float distance = upper_bound - lower_bound;
46
47 for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
48 float val = *inPtr;
49 if (val < lower_bound) {
50 float excess = lower_bound - val;
51 signed int count = (int)(excess / distance);
52 *outPtr = val + (count + 1) * distance;
53 } else if (val > upper_bound) {
54 float excess = val - upper_bound;
55 signed int count = (int)(excess / distance);
56 *outPtr = val - (count + 1) * distance;
57 } else
58 *outPtr = val;
59 outPtr++;
60 }
61}
62#endif /* LV_HAVE_GENERIC */
63
64
65#ifdef LV_HAVE_AVX
66#include <xmmintrin.h>
67
68static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
69 const float* inputVector,
70 const float lower_bound,
71 const float upper_bound,
72 unsigned int num_points)
73{
74 const __m256 lower = _mm256_set1_ps(lower_bound);
75 const __m256 upper = _mm256_set1_ps(upper_bound);
76 const __m256 distance = _mm256_sub_ps(upper, lower);
77 __m256 input, output;
78 __m256 is_smaller, is_bigger;
79 __m256 excess, adj;
80
81 const float* inPtr = inputVector;
82 float* outPtr = outputVector;
83 const size_t eight_points = num_points / 8;
84 for (size_t counter = 0; counter < eight_points; counter++) {
85 input = _mm256_loadu_ps(inPtr);
86 // calculate mask: input < lower, input > upper
87 is_smaller = _mm256_cmp_ps(
88 input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
89 is_bigger = _mm256_cmp_ps(
90 input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
91 // find out how far we are out-of-bound – positive values!
92 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
93 excess =
94 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
95 // how many do we have to add? (int(excess/distance+1)*distance)
96 excess = _mm256_div_ps(excess, distance);
97 // round down
98 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
99 // plus 1
100 adj = _mm256_set1_ps(1.0f);
101 excess = _mm256_add_ps(excess, adj);
102 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
103 adj = _mm256_and_ps(adj, is_smaller);
104 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
105 // scale by distance, sign
106 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
107 output = _mm256_add_ps(input, excess);
108 _mm256_storeu_ps(outPtr, output);
109 inPtr += 8;
110 outPtr += 8;
111 }
112
114 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
115}
116static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
117 const float* inputVector,
118 const float lower_bound,
119 const float upper_bound,
120 unsigned int num_points)
121{
122 const __m256 lower = _mm256_set1_ps(lower_bound);
123 const __m256 upper = _mm256_set1_ps(upper_bound);
124 const __m256 distance = _mm256_sub_ps(upper, lower);
125 __m256 input, output;
126 __m256 is_smaller, is_bigger;
127 __m256 excess, adj;
128
129 const float* inPtr = inputVector;
130 float* outPtr = outputVector;
131 const size_t eight_points = num_points / 8;
132 for (size_t counter = 0; counter < eight_points; counter++) {
133 input = _mm256_load_ps(inPtr);
134 // calculate mask: input < lower, input > upper
135 is_smaller = _mm256_cmp_ps(
136 input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
137 is_bigger = _mm256_cmp_ps(
138 input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
139 // find out how far we are out-of-bound – positive values!
140 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
141 excess =
142 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
143 // how many do we have to add? (int(excess/distance+1)*distance)
144 excess = _mm256_div_ps(excess, distance);
145 // round down
146 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
147 // plus 1
148 adj = _mm256_set1_ps(1.0f);
149 excess = _mm256_add_ps(excess, adj);
150 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
151 adj = _mm256_and_ps(adj, is_smaller);
152 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
153 // scale by distance, sign
154 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
155 output = _mm256_add_ps(input, excess);
156 _mm256_store_ps(outPtr, output);
157 inPtr += 8;
158 outPtr += 8;
159 }
160
162 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
163}
164#endif /* LV_HAVE_AVX */
165
166
167#ifdef LV_HAVE_SSE2
168#include <xmmintrin.h>
169
170static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
171 const float* inputVector,
172 const float lower_bound,
173 const float upper_bound,
174 unsigned int num_points)
175{
176 const __m128 lower = _mm_set_ps1(lower_bound);
177 const __m128 upper = _mm_set_ps1(upper_bound);
178 const __m128 distance = _mm_sub_ps(upper, lower);
179 __m128 input, output;
180 __m128 is_smaller, is_bigger;
181 __m128 excess, adj;
182
183 const float* inPtr = inputVector;
184 float* outPtr = outputVector;
185 const size_t quarter_points = num_points / 4;
186 for (size_t counter = 0; counter < quarter_points; counter++) {
187 input = _mm_loadu_ps(inPtr);
188 // calculate mask: input < lower, input > upper
189 is_smaller = _mm_cmplt_ps(input, lower);
190 is_bigger = _mm_cmpgt_ps(input, upper);
191 // find out how far we are out-of-bound – positive values!
192 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
193 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
194 // how many do we have to add? (int(excess/distance+1)*distance)
195 excess = _mm_div_ps(excess, distance);
196 // round down
197 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
198 // plus 1
199 adj = _mm_set_ps1(1.0f);
200 excess = _mm_add_ps(excess, adj);
201 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
202 adj = _mm_and_ps(adj, is_smaller);
203 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
204 // scale by distance, sign
205 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
206 output = _mm_add_ps(input, excess);
207 _mm_storeu_ps(outPtr, output);
208 inPtr += 4;
209 outPtr += 4;
210 }
211
213 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
214}
215static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
216 const float* inputVector,
217 const float lower_bound,
218 const float upper_bound,
219 unsigned int num_points)
220{
221 const __m128 lower = _mm_set_ps1(lower_bound);
222 const __m128 upper = _mm_set_ps1(upper_bound);
223 const __m128 distance = _mm_sub_ps(upper, lower);
224 __m128 input, output;
225 __m128 is_smaller, is_bigger;
226 __m128 excess, adj;
227
228 const float* inPtr = inputVector;
229 float* outPtr = outputVector;
230 const size_t quarter_points = num_points / 4;
231 for (size_t counter = 0; counter < quarter_points; counter++) {
232 input = _mm_load_ps(inPtr);
233 // calculate mask: input < lower, input > upper
234 is_smaller = _mm_cmplt_ps(input, lower);
235 is_bigger = _mm_cmpgt_ps(input, upper);
236 // find out how far we are out-of-bound – positive values!
237 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
238 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
239 // how many do we have to add? (int(excess/distance+1)*distance)
240 excess = _mm_div_ps(excess, distance);
241 // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32
242 // conversion.
243 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
244 // plus 1
245 adj = _mm_set_ps1(1.0f);
246 excess = _mm_add_ps(excess, adj);
247 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
248 adj = _mm_and_ps(adj, is_smaller);
249 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
250 // scale by distance, sign
251 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
252 output = _mm_add_ps(input, excess);
253 _mm_store_ps(outPtr, output);
254 inPtr += 4;
255 outPtr += 4;
256 }
257
259 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
260}
261#endif /* LV_HAVE_SSE2 */
262
263#ifdef LV_HAVE_SSE
264#include <xmmintrin.h>
265
266static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
267 const float* inputVector,
268 const float lower_bound,
269 const float upper_bound,
270 unsigned int num_points)
271{
272 const __m128 lower = _mm_set_ps1(lower_bound);
273 const __m128 upper = _mm_set_ps1(upper_bound);
274 const __m128 distance = _mm_sub_ps(upper, lower);
275 __m128 input, output;
276 __m128 is_smaller, is_bigger;
277 __m128 excess, adj;
278 __m128i rounddown;
279
280 const float* inPtr = inputVector;
281 float* outPtr = outputVector;
282 const size_t quarter_points = num_points / 4;
283 for (size_t counter = 0; counter < quarter_points; counter++) {
284 input = _mm_loadu_ps(inPtr);
285 // calculate mask: input < lower, input > upper
286 is_smaller = _mm_cmplt_ps(input, lower);
287 is_bigger = _mm_cmpgt_ps(input, upper);
288 // find out how far we are out-of-bound – positive values!
289 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
290 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
291 // how many do we have to add? (int(excess/distance+1)*distance)
292 excess = _mm_div_ps(excess, distance);
293 // round down – for some reason
294 rounddown = _mm_cvttps_epi32(excess);
295 excess = _mm_cvtepi32_ps(rounddown);
296 // plus 1
297 adj = _mm_set_ps1(1.0f);
298 excess = _mm_add_ps(excess, adj);
299 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
300 adj = _mm_and_ps(adj, is_smaller);
301 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
302 // scale by distance, sign
303 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
304 output = _mm_add_ps(input, excess);
305 _mm_storeu_ps(outPtr, output);
306 inPtr += 4;
307 outPtr += 4;
308 }
309
311 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
312}
313static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
314 const float* inputVector,
315 const float lower_bound,
316 const float upper_bound,
317 unsigned int num_points)
318{
319 const __m128 lower = _mm_set_ps1(lower_bound);
320 const __m128 upper = _mm_set_ps1(upper_bound);
321 const __m128 distance = _mm_sub_ps(upper, lower);
322 __m128 input, output;
323 __m128 is_smaller, is_bigger;
324 __m128 excess, adj;
325 __m128i rounddown;
326
327 const float* inPtr = inputVector;
328 float* outPtr = outputVector;
329 const size_t quarter_points = num_points / 4;
330 for (size_t counter = 0; counter < quarter_points; counter++) {
331 input = _mm_load_ps(inPtr);
332 // calculate mask: input < lower, input > upper
333 is_smaller = _mm_cmplt_ps(input, lower);
334 is_bigger = _mm_cmpgt_ps(input, upper);
335 // find out how far we are out-of-bound – positive values!
336 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
337 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
338 // how many do we have to add? (int(excess/distance+1)*distance)
339 excess = _mm_div_ps(excess, distance);
340 // round down
341 rounddown = _mm_cvttps_epi32(excess);
342 excess = _mm_cvtepi32_ps(rounddown);
343 // plus 1
344 adj = _mm_set_ps1(1.0f);
345 excess = _mm_add_ps(excess, adj);
346 // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
347 adj = _mm_and_ps(adj, is_smaller);
348 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
349 // scale by distance, sign
350 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
351 output = _mm_add_ps(input, excess);
352 _mm_store_ps(outPtr, output);
353 inPtr += 4;
354 outPtr += 4;
355 }
356
358 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
359}
360#endif /* LV_HAVE_SSE */
361
362
363#endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */