Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_x2_clamp_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
44#ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
45#define INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
46
47#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: copies num_points floats from `in` to `out`,
 * limiting each value to the range bounded by `min` and `max`.
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - value written for samples that compare below it
 * max        - value written for samples that compare above it
 * num_points - number of samples to process
 *
 * The upper bound is tested first, so a sample that is both above `max`
 * and below `min` (only possible when min > max) is written as `max`.
 * A NaN sample fails both comparisons and is copied through unchanged.
 */
static inline void volk_32f_s32f_x2_clamp_32f_generic(float* out,
                                                      const float* in,
                                                      const float min,
                                                      const float max,
                                                      unsigned int num_points)
{
    unsigned int idx;
    for (idx = 0; idx < num_points; idx++) {
        /* Read once so in-place use (out == in) sees the original sample. */
        const float sample = in[idx];
        out[idx] = (sample > max) ? max : ((sample < min) ? min : sample);
    }
}
67#endif /* LV_HAVE_GENERIC */
68
69#if LV_HAVE_AVX2
70#include <immintrin.h>
/*
 * Clamps num_points floats from `in` into `out` using 256-bit vectors and
 * aligned loads/stores (_mm256_load_ps / _mm256_store_ps).
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - value written for samples that compare below it
 * max        - value written for samples that compare above it
 * num_points - number of samples to process
 *
 * Eight samples are handled per iteration; the remaining num_points % 8
 * samples are passed to volk_32f_s32f_x2_clamp_32f_generic.
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const __m256 vmin = _mm256_set1_ps(min);
    const __m256 vmax = _mm256_set1_ps(max);

    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 res = _mm256_load_ps(in);
        /* Both masks are derived from the unmodified input. The ordered
         * compare is false for NaN lanes, so NaN is passed through. */
        const __m256 max_mask = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
        const __m256 min_mask = _mm256_cmp_ps(res, vmin, _CMP_LT_OS);
        /* Lower bound first, upper bound last: when both masks are set
         * (min > max) the upper bound wins, exactly as in the generic
         * kernel that processes the tail of this same call. */
        res = _mm256_blendv_ps(res, vmin, min_mask);
        res = _mm256_blendv_ps(res, vmax, max_mask);
        _mm256_store_ps(out, res);
        in += 8;
        out += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(
        out, in, min, max, num_points - eighth_points * 8);
}
96#endif /* LV_HAVE_AVX2 */
97
98#if LV_HAVE_SSE4_1
99#include <immintrin.h>
/*
 * Clamps num_points floats from `in` into `out` using 128-bit vectors and
 * aligned loads/stores (_mm_load_ps / _mm_store_ps).
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - value written for samples that compare below it
 * max        - value written for samples that compare above it
 * num_points - number of samples to process
 *
 * Four samples are handled per iteration; the remaining num_points % 4
 * samples are passed to volk_32f_s32f_x2_clamp_32f_generic.
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const __m128 vmin = _mm_set1_ps(min);
    const __m128 vmax = _mm_set1_ps(max);

    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;
    for (; number < quarter_points; number++) {
        __m128 res = _mm_load_ps(in);
        /* Both masks are derived from the unmodified input. A less-than
         * compare is false for NaN lanes, so NaN is passed through. */
        const __m128 max_mask = _mm_cmplt_ps(vmax, res);
        const __m128 min_mask = _mm_cmplt_ps(res, vmin);
        /* Lower bound first, upper bound last: when both masks are set
         * (min > max) the upper bound wins, exactly as in the generic
         * kernel that processes the tail of this same call. */
        res = _mm_blendv_ps(res, vmin, min_mask);
        res = _mm_blendv_ps(res, vmax, max_mask);
        _mm_store_ps(out, res);
        in += 4;
        out += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(
        out, in, min, max, num_points - quarter_points * 4);
}
125#endif /* LV_HAVE_SSE4_1 */
126
127#endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H */
128
129#ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
130#define INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
131
132#if LV_HAVE_AVX2
133#include <immintrin.h>
/*
 * Clamps num_points floats from `in` into `out` using 256-bit vectors and
 * unaligned loads/stores (_mm256_loadu_ps / _mm256_storeu_ps).
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - value written for samples that compare below it
 * max        - value written for samples that compare above it
 * num_points - number of samples to process
 *
 * Eight samples are handled per iteration; the remaining num_points % 8
 * samples are passed to volk_32f_s32f_x2_clamp_32f_generic.
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const __m256 vmin = _mm256_set1_ps(min);
    const __m256 vmax = _mm256_set1_ps(max);

    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 res = _mm256_loadu_ps(in);
        /* Both masks are derived from the unmodified input. The ordered
         * compare is false for NaN lanes, so NaN is passed through. */
        const __m256 max_mask = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
        const __m256 min_mask = _mm256_cmp_ps(res, vmin, _CMP_LT_OS);
        /* Lower bound first, upper bound last: when both masks are set
         * (min > max) the upper bound wins, exactly as in the generic
         * kernel that processes the tail of this same call. */
        res = _mm256_blendv_ps(res, vmin, min_mask);
        res = _mm256_blendv_ps(res, vmax, max_mask);
        _mm256_storeu_ps(out, res);
        in += 8;
        out += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(
        out, in, min, max, num_points - eighth_points * 8);
}
159#endif /* LV_HAVE_AVX2 */
160
161#if LV_HAVE_SSE4_1
162#include <immintrin.h>
/*
 * Clamps num_points floats from `in` into `out` using 128-bit vectors and
 * unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps).
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - value written for samples that compare below it
 * max        - value written for samples that compare above it
 * num_points - number of samples to process
 *
 * Four samples are handled per iteration; the remaining num_points % 4
 * samples are passed to volk_32f_s32f_x2_clamp_32f_generic.
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const __m128 vmin = _mm_set1_ps(min);
    const __m128 vmax = _mm_set1_ps(max);

    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;
    for (; number < quarter_points; number++) {
        __m128 res = _mm_loadu_ps(in);
        /* Both masks are derived from the unmodified input. A less-than
         * compare is false for NaN lanes, so NaN is passed through. */
        const __m128 max_mask = _mm_cmplt_ps(vmax, res);
        const __m128 min_mask = _mm_cmplt_ps(res, vmin);
        /* Lower bound first, upper bound last: when both masks are set
         * (min > max) the upper bound wins, exactly as in the generic
         * kernel that processes the tail of this same call. */
        res = _mm_blendv_ps(res, vmin, min_mask);
        res = _mm_blendv_ps(res, vmax, max_mask);
        _mm_storeu_ps(out, res);
        in += 4;
        out += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(
        out, in, min, max, num_points - quarter_points * 4);
}
188#endif /* LV_HAVE_SSE4_1 */
189
190#endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */