Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_sqrt_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
52#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53#define INCLUDED_volk_32f_sqrt_32f_a_H
54
55#include <inttypes.h>
56#include <math.h>
57#include <stdio.h>
58
59#ifdef LV_HAVE_SSE
60#include <xmmintrin.h>
61
62static inline void
63volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
64{
65 unsigned int number = 0;
66 const unsigned int quarterPoints = num_points / 4;
67
68 float* cPtr = cVector;
69 const float* aPtr = aVector;
70
71 __m128 aVal, cVal;
72 for (; number < quarterPoints; number++) {
73 aVal = _mm_load_ps(aPtr);
74
75 cVal = _mm_sqrt_ps(aVal);
76
77 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
78
79 aPtr += 4;
80 cPtr += 4;
81 }
82
83 number = quarterPoints * 4;
84 for (; number < num_points; number++) {
85 *cPtr++ = sqrtf(*aPtr++);
86 }
87}
88
89#endif /* LV_HAVE_SSE */
90
91#ifdef LV_HAVE_AVX
92#include <immintrin.h>
93
94static inline void
95volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
96{
97 unsigned int number = 0;
98 const unsigned int eighthPoints = num_points / 8;
99
100 float* cPtr = cVector;
101 const float* aPtr = aVector;
102
103 __m256 aVal, cVal;
104 for (; number < eighthPoints; number++) {
105 aVal = _mm256_load_ps(aPtr);
106
107 cVal = _mm256_sqrt_ps(aVal);
108
109 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
110
111 aPtr += 8;
112 cPtr += 8;
113 }
114
115 number = eighthPoints * 8;
116 for (; number < num_points; number++) {
117 *cPtr++ = sqrtf(*aPtr++);
118 }
119}
120
121#endif /* LV_HAVE_AVX */
122
123
124#ifdef LV_HAVE_NEON
125#include <arm_neon.h>
126
/*!
 * \brief Element-wise square root of a float vector (NEON).
 *
 * Full groups of four floats are processed with NEON; the remaining
 * (num_points % 4) elements are handled one at a time with sqrtf().
 *
 * On AArch64 the vector path uses the real square-root instruction
 * (vsqrtq_f32), so vector lanes and tail lanes are computed to the same
 * accuracy. On 32-bit ARM there is no vector square root, so the vector path
 * falls back to the reciprocal of the reciprocal-square-root estimate; those
 * are hardware *estimates* and the vector lanes are therefore only
 * approximate there.
 *
 * \param cVector    Output buffer, written with sqrt(aVector[i]).
 * \param aVector    Input buffer.
 * \param num_points Number of floats to process.
 */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    for (number = 0; number < quarter_points; number++) {
        const float32x4_t in_vec = vld1q_f32(aPtr);
#if defined(__aarch64__) || defined(_M_ARM64)
        /* AArch64 provides a true vector square root. */
        const float32x4_t out_vec = vsqrtq_f32(in_vec);
#else
        /* ARMv7 NEON: 1 / (1 / sqrt(x)) built from two estimate instructions. */
        const float32x4_t out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
#endif
        vst1q_f32(cPtr, out_vec);
        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for whatever did not fill a complete group of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
149
150#endif /* LV_HAVE_NEON */
151
152
153#ifdef LV_HAVE_GENERIC
154
155static inline void
156volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
157{
158 float* cPtr = cVector;
159 const float* aPtr = aVector;
160 unsigned int number = 0;
161
162 for (number = 0; number < num_points; number++) {
163 *cPtr++ = sqrtf(*aPtr++);
164 }
165}
166
167#endif /* LV_HAVE_GENERIC */
168
169#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
170
171#ifndef INCLUDED_volk_32f_sqrt_32f_u_H
172#define INCLUDED_volk_32f_sqrt_32f_u_H
173
174#include <inttypes.h>
175#include <math.h>
176#include <stdio.h>
177#ifdef LV_HAVE_AVX
178#include <immintrin.h>
179
180static inline void
181volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
182{
183 unsigned int number = 0;
184 const unsigned int eighthPoints = num_points / 8;
185
186 float* cPtr = cVector;
187 const float* aPtr = aVector;
188
189 __m256 aVal, cVal;
190 for (; number < eighthPoints; number++) {
191 aVal = _mm256_loadu_ps(aPtr);
192
193 cVal = _mm256_sqrt_ps(aVal);
194
195 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
196
197 aPtr += 8;
198 cPtr += 8;
199 }
200
201 number = eighthPoints * 8;
202 for (; number < num_points; number++) {
203 *cPtr++ = sqrtf(*aPtr++);
204 }
205}
206
207#endif /* LV_HAVE_AVX */
208#endif /* INCLUDED_volk_32f_sqrt_32f_u_H */