Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_invsqrt_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
52#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
53#define INCLUDED_volk_32f_invsqrt_32f_a_H
54
55#include <inttypes.h>
56#include <math.h>
57#include <stdio.h>
58#include <string.h>
59
60static inline float Q_rsqrt(float number)
61{
62 float x2;
63 const float threehalfs = 1.5F;
64 union f32_to_i32 {
65 int32_t i;
66 float f;
67 } u;
68
69 x2 = number * 0.5F;
70 u.f = number;
71 u.i = 0x5f3759df - (u.i >> 1); // what the fuck?
72 u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
73 // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be
74 // removed
75
76 return u.f;
77}
78
79#ifdef LV_HAVE_AVX
80#include <immintrin.h>
81
82static inline void
83volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
84{
85 unsigned int number = 0;
86 const unsigned int eighthPoints = num_points / 8;
87
88 float* cPtr = cVector;
89 const float* aPtr = aVector;
90 __m256 aVal, cVal;
91 for (; number < eighthPoints; number++) {
92 aVal = _mm256_load_ps(aPtr);
93 cVal = _mm256_rsqrt_ps(aVal);
94 _mm256_store_ps(cPtr, cVal);
95 aPtr += 8;
96 cPtr += 8;
97 }
98
99 number = eighthPoints * 8;
100 for (; number < num_points; number++)
101 *cPtr++ = Q_rsqrt(*aPtr++);
102}
103#endif /* LV_HAVE_AVX */
104
105
106#ifdef LV_HAVE_SSE
107#include <xmmintrin.h>
108
109static inline void
110volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
111{
112 unsigned int number = 0;
113 const unsigned int quarterPoints = num_points / 4;
114
115 float* cPtr = cVector;
116 const float* aPtr = aVector;
117
118 __m128 aVal, cVal;
119 for (; number < quarterPoints; number++) {
120
121 aVal = _mm_load_ps(aPtr);
122
123 cVal = _mm_rsqrt_ps(aVal);
124
125 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
126
127 aPtr += 4;
128 cPtr += 4;
129 }
130
131 number = quarterPoints * 4;
132 for (; number < num_points; number++) {
133 *cPtr++ = Q_rsqrt(*aPtr++);
134 }
135}
136#endif /* LV_HAVE_SSE */
137
138
139#ifdef LV_HAVE_NEON
140#include <arm_neon.h>
141
142static inline void
143volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
144{
145 unsigned int number;
146 const unsigned int quarter_points = num_points / 4;
147
148 float* cPtr = cVector;
149 const float* aPtr = aVector;
150 float32x4_t a_val, c_val;
151 for (number = 0; number < quarter_points; ++number) {
152 a_val = vld1q_f32(aPtr);
153 c_val = vrsqrteq_f32(a_val);
154 vst1q_f32(cPtr, c_val);
155 aPtr += 4;
156 cPtr += 4;
157 }
158
159 for (number = quarter_points * 4; number < num_points; number++)
160 *cPtr++ = Q_rsqrt(*aPtr++);
161}
162#endif /* LV_HAVE_NEON */
163
164
165#ifdef LV_HAVE_GENERIC
166
167static inline void volk_32f_invsqrt_32f_generic(float* cVector,
168 const float* aVector,
169 unsigned int num_points)
170{
171 float* cPtr = cVector;
172 const float* aPtr = aVector;
173 unsigned int number = 0;
174 for (number = 0; number < num_points; number++) {
175 *cPtr++ = Q_rsqrt(*aPtr++);
176 }
177}
178#endif /* LV_HAVE_GENERIC */
179
180#ifdef LV_HAVE_AVX
181#include <immintrin.h>
182
183static inline void
184volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
185{
186 unsigned int number = 0;
187 const unsigned int eighthPoints = num_points / 8;
188
189 float* cPtr = cVector;
190 const float* aPtr = aVector;
191 __m256 aVal, cVal;
192 for (; number < eighthPoints; number++) {
193 aVal = _mm256_loadu_ps(aPtr);
194 cVal = _mm256_rsqrt_ps(aVal);
195 _mm256_storeu_ps(cPtr, cVal);
196 aPtr += 8;
197 cPtr += 8;
198 }
199
200 number = eighthPoints * 8;
201 for (; number < num_points; number++)
202 *cPtr++ = Q_rsqrt(*aPtr++);
203}
204#endif /* LV_HAVE_AVX */
205
206#endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */