volk_32f_tanh_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

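/*!
 * \b Overview
 *
 * Computes the hyperbolic tangent of each element of the input vector:
 * cVector[i] = tanh(aVector[i]).
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points)
 * \endcode
 *
 * \b Inputs
 * \li aVector: The input vector of floats.
 * \li num_points: The number of float values to process.
 *
 * \b Outputs
 * \li cVector: The output vector of tanh values.
 *
 * \b Example
 * A minimal usage sketch, assuming the standard VOLK dispatcher together with
 * volk_malloc()/volk_free() for aligned buffers:
 * \code
 *   unsigned int N = 10;
 *   unsigned int alignment = volk_get_alignment();
 *   float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       in[ii] = 0.5f * (float)ii - 2.0f; // sample inputs spanning [-2.0, 2.5]
 *   }
 *
 *   volk_32f_tanh_32f(out, in, N);
 *
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       printf("tanh(%+1.2f) = %+1.4f\n", in[ii], out[ii]);
 *   }
 *
 *   volk_free(in);
 *   volk_free(out);
 * \endcode
 */
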
#ifndef INCLUDED_volk_32f_tanh_32f_a_H
#define INCLUDED_volk_32f_tanh_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    for (; number < num_points; number++) {
        *cPtr++ = tanhf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_GENERIC

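/*
 * volk_32f_tanh_32f_series evaluates the rational approximation
 *
 *     tanh(x) ~= x * (135135 + 17325*x^2 + 378*x^4 + x^6)
 *                ------------------------------------------
 *                 (135135 + 62370*x^2 + 3150*x^4 + 28*x^6)
 *
 * obtained by truncating Lambert's continued fraction for tanh(x) (a [7/6]
 * Pade-style approximant about x = 0). Beyond |x| of roughly 4.97 the
 * approximation drifts away from +/-1, so the output is clamped there.
 * The SIMD kernels below evaluate the same two polynomials in Horner form
 * (without the clamp) and hand any leftover tail elements to this kernel.
 */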
static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    for (unsigned int number = 0; number < num_points; number++) {
        if (*aPtr > 4.97)
            *cPtr++ = 1;
        else if (*aPtr <= -4.97)
            *cPtr++ = -1;
        else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; // advance the input pointer on every iteration, including the clamped branches
    }
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b;
    __m128 const1, const2, const3, const4, const5, const6;
    const1 = _mm_set_ps1(135135.0f);
    const2 = _mm_set_ps1(17325.0f);
    const3 = _mm_set_ps1(378.0f);
    const4 = _mm_set_ps1(62370.0f);
    const5 = _mm_set_ps1(3150.0f);
    const6 = _mm_set_ps1(28.0f);
    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        // numerator: x * (135135 + 17325*x^2 + 378*x^4 + x^6), Horner form
        a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        // denominator: 135135 + 62370*x^2 + 3150*x^4 + 28*x^6, Horner form
        b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        _mm_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    // finish any remaining points with the scalar series kernel
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_SSE */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        cVal = _mm256_div_ps(a, b);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32f_tanh_32f_a_H */


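/*
 * The kernels below are the unaligned (_u_) counterparts of the aligned
 * kernels above: they use unaligned loads and stores (_mm_loadu_ps /
 * _mm_storeu_ps and _mm256_loadu_ps / _mm256_storeu_ps) but are otherwise
 * identical.
 */
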
#ifndef INCLUDED_volk_32f_tanh_32f_u_H
#define INCLUDED_volk_32f_tanh_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>


#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b;
    __m128 const1, const2, const3, const4, const5, const6;
    const1 = _mm_set_ps1(135135.0f);
    const2 = _mm_set_ps1(17325.0f);
    const3 = _mm_set_ps1(378.0f);
    const4 = _mm_set_ps1(62370.0f);
    const5 = _mm_set_ps1(3150.0f);
    const6 = _mm_set_ps1(28.0f);
    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        // numerator: x * (135135 + 17325*x^2 + 378*x^4 + x^6), Horner form
        a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        // denominator: 135135 + 62370*x^2 + 3150*x^4 + 28*x^6, Horner form
        b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    // finish any remaining points with the scalar series kernel
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_SSE */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        cVal = _mm256_div_ps(a, b);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32f_tanh_32f_u_H */