Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_expfast_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
52#include <inttypes.h>
53#include <math.h>
54#include <stdio.h>
55
/* Constants for the bit-level exp() approximation used by the SIMD kernels in
 * this file. Those kernels compute (A / Mln2) * x + (B - C), convert the result
 * to a 32-bit integer and reinterpret the integer's bits as a float. */
/* Natural logarithm of 2. */
56#define Mln2 0.6931471805f
/* 2^23. */
57#define A 8388608.0f
/* 127 * 2^23; as an integer this is 0x3F800000, the bit pattern of 1.0f. */
58#define B 1065353216.0f
/* Correction subtracted from B. NOTE(review): presumably tuned to reduce the
 * approximation error -- confirm against the reference the kernel is based on. */
59#define C 60801.0f
60
61
62#ifndef INCLUDED_volk_32f_expfast_32f_a_H
63#define INCLUDED_volk_32f_expfast_32f_a_H
64
65#if LV_HAVE_AVX && LV_HAVE_FMA
66
67#include <immintrin.h>
68
69static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
70 const float* aVector,
71 unsigned int num_points)
72{
73 float* bPtr = bVector;
74 const float* aPtr = aVector;
75
76 unsigned int number = 0;
77 const unsigned int eighthPoints = num_points / 8;
78
79 __m256 aVal, bVal, a, b;
80 __m256i exp;
81 a = _mm256_set1_ps(A / Mln2);
82 b = _mm256_set1_ps(B - C);
83
84 for (; number < eighthPoints; number++) {
85 aVal = _mm256_load_ps(aPtr);
86 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
87 bVal = _mm256_castsi256_ps(exp);
88
89 _mm256_store_ps(bPtr, bVal);
90 aPtr += 8;
91 bPtr += 8;
92 }
93
94 number = eighthPoints * 8;
95 for (; number < num_points; number++) {
96 *bPtr++ = expf(*aPtr++);
97 }
98}
99
100#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
101
102#ifdef LV_HAVE_AVX
103
104#include <immintrin.h>
105
106static inline void
107volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
108{
109 float* bPtr = bVector;
110 const float* aPtr = aVector;
111
112 unsigned int number = 0;
113 const unsigned int eighthPoints = num_points / 8;
114
115 __m256 aVal, bVal, a, b;
116 __m256i exp;
117 a = _mm256_set1_ps(A / Mln2);
118 b = _mm256_set1_ps(B - C);
119
120 for (; number < eighthPoints; number++) {
121 aVal = _mm256_load_ps(aPtr);
122 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123 bVal = _mm256_castsi256_ps(exp);
124
125 _mm256_store_ps(bPtr, bVal);
126 aPtr += 8;
127 bPtr += 8;
128 }
129
130 number = eighthPoints * 8;
131 for (; number < num_points; number++) {
132 *bPtr++ = expf(*aPtr++);
133 }
134}
135
136#endif /* LV_HAVE_AVX for aligned */
137
138#ifdef LV_HAVE_SSE4_1
139#include <smmintrin.h>
140
141static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
142 const float* aVector,
143 unsigned int num_points)
144{
145 float* bPtr = bVector;
146 const float* aPtr = aVector;
147
148 unsigned int number = 0;
149 const unsigned int quarterPoints = num_points / 4;
150
151 __m128 aVal, bVal, a, b;
152 __m128i exp;
153 a = _mm_set1_ps(A / Mln2);
154 b = _mm_set1_ps(B - C);
155
156 for (; number < quarterPoints; number++) {
157 aVal = _mm_load_ps(aPtr);
158 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
159 bVal = _mm_castsi128_ps(exp);
160
161 _mm_store_ps(bPtr, bVal);
162 aPtr += 4;
163 bPtr += 4;
164 }
165
166 number = quarterPoints * 4;
167 for (; number < num_points; number++) {
168 *bPtr++ = expf(*aPtr++);
169 }
170}
171
172#endif /* LV_HAVE_SSE4_1 for aligned */
173
174#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
175
176#ifndef INCLUDED_volk_32f_expfast_32f_u_H
177#define INCLUDED_volk_32f_expfast_32f_u_H
178
179#if LV_HAVE_AVX && LV_HAVE_FMA
180#include <immintrin.h>
181
182static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
183 const float* aVector,
184 unsigned int num_points)
185{
186 float* bPtr = bVector;
187 const float* aPtr = aVector;
188
189 unsigned int number = 0;
190 const unsigned int eighthPoints = num_points / 8;
191
192 __m256 aVal, bVal, a, b;
193 __m256i exp;
194 a = _mm256_set1_ps(A / Mln2);
195 b = _mm256_set1_ps(B - C);
196
197 for (; number < eighthPoints; number++) {
198 aVal = _mm256_loadu_ps(aPtr);
199 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200 bVal = _mm256_castsi256_ps(exp);
201
202 _mm256_storeu_ps(bPtr, bVal);
203 aPtr += 8;
204 bPtr += 8;
205 }
206
207 number = eighthPoints * 8;
208 for (; number < num_points; number++) {
209 *bPtr++ = expf(*aPtr++);
210 }
211}
212
213#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
214
215#ifdef LV_HAVE_AVX
216#include <immintrin.h>
217
218static inline void
219volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
220{
221 float* bPtr = bVector;
222 const float* aPtr = aVector;
223
224 unsigned int number = 0;
225 const unsigned int eighthPoints = num_points / 8;
226
227 __m256 aVal, bVal, a, b;
228 __m256i exp;
229 a = _mm256_set1_ps(A / Mln2);
230 b = _mm256_set1_ps(B - C);
231
232 for (; number < eighthPoints; number++) {
233 aVal = _mm256_loadu_ps(aPtr);
234 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
235 bVal = _mm256_castsi256_ps(exp);
236
237 _mm256_storeu_ps(bPtr, bVal);
238 aPtr += 8;
239 bPtr += 8;
240 }
241
242 number = eighthPoints * 8;
243 for (; number < num_points; number++) {
244 *bPtr++ = expf(*aPtr++);
245 }
246}
247
248#endif /* LV_HAVE_AVX for unaligned */
249
250
251#ifdef LV_HAVE_SSE4_1
252#include <smmintrin.h>
253
254static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
255 const float* aVector,
256 unsigned int num_points)
257{
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
260
261 unsigned int number = 0;
262 const unsigned int quarterPoints = num_points / 4;
263
264 __m128 aVal, bVal, a, b;
265 __m128i exp;
266 a = _mm_set1_ps(A / Mln2);
267 b = _mm_set1_ps(B - C);
268
269 for (; number < quarterPoints; number++) {
270 aVal = _mm_loadu_ps(aPtr);
271 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
272 bVal = _mm_castsi128_ps(exp);
273
274 _mm_storeu_ps(bPtr, bVal);
275 aPtr += 4;
276 bPtr += 4;
277 }
278
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 *bPtr++ = expf(*aPtr++);
282 }
283}
284
285#endif /* LV_HAVE_SSE4_1 for unaligned */
286
287
288#ifdef LV_HAVE_GENERIC
289
290static inline void volk_32f_expfast_32f_generic(float* bVector,
291 const float* aVector,
292 unsigned int num_points)
293{
294 float* bPtr = bVector;
295 const float* aPtr = aVector;
296 unsigned int number = 0;
297
298 for (number = 0; number < num_points; number++) {
299 *bPtr++ = expf(*aPtr++);
300 }
301}
302#endif /* LV_HAVE_GENERIC */
303
304#endif /* INCLUDED_volk_32f_expfast_32f_u_H */