Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_64f_convert_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
51#ifndef INCLUDED_volk_64f_convert_32f_u_H
52#define INCLUDED_volk_64f_convert_32f_u_H
53
54#include <inttypes.h>
55#include <stdio.h>
56
57#ifdef LV_HAVE_AVX512F
58#include <immintrin.h>
59
60static inline void volk_64f_convert_32f_u_avx512f(float* outputVector,
61 const double* inputVector,
62 unsigned int num_points)
63{
64 unsigned int number = 0;
65
66 const unsigned int oneSixteenthPoints = num_points / 16;
67
68 const double* inputVectorPtr = (const double*)inputVector;
69 float* outputVectorPtr = outputVector;
70 __m256 ret1, ret2;
71 __m512d inputVal1, inputVal2;
72
73 for (; number < oneSixteenthPoints; number++) {
74 inputVal1 = _mm512_loadu_pd(inputVectorPtr);
75 inputVectorPtr += 8;
76 inputVal2 = _mm512_loadu_pd(inputVectorPtr);
77 inputVectorPtr += 8;
78
79 ret1 = _mm512_cvtpd_ps(inputVal1);
80 ret2 = _mm512_cvtpd_ps(inputVal2);
81
82 _mm256_storeu_ps(outputVectorPtr, ret1);
83 outputVectorPtr += 8;
84
85 _mm256_storeu_ps(outputVectorPtr, ret2);
86 outputVectorPtr += 8;
87 }
88
89 number = oneSixteenthPoints * 16;
90 for (; number < num_points; number++) {
91 outputVector[number] = (float)(inputVector[number]);
92 }
93}
94#endif /* LV_HAVE_AVX512F */
95
96
97#ifdef LV_HAVE_AVX
98#include <immintrin.h>
99
100static inline void volk_64f_convert_32f_u_avx(float* outputVector,
101 const double* inputVector,
102 unsigned int num_points)
103{
104 unsigned int number = 0;
105
106 const unsigned int oneEightPoints = num_points / 8;
107
108 const double* inputVectorPtr = (const double*)inputVector;
109 float* outputVectorPtr = outputVector;
110 __m128 ret1, ret2;
111 __m256d inputVal1, inputVal2;
112
113 for (; number < oneEightPoints; number++) {
114 inputVal1 = _mm256_loadu_pd(inputVectorPtr);
115 inputVectorPtr += 4;
116 inputVal2 = _mm256_loadu_pd(inputVectorPtr);
117 inputVectorPtr += 4;
118
119 ret1 = _mm256_cvtpd_ps(inputVal1);
120 ret2 = _mm256_cvtpd_ps(inputVal2);
121
122 _mm_storeu_ps(outputVectorPtr, ret1);
123 outputVectorPtr += 4;
124
125 _mm_storeu_ps(outputVectorPtr, ret2);
126 outputVectorPtr += 4;
127 }
128
129 number = oneEightPoints * 8;
130 for (; number < num_points; number++) {
131 outputVector[number] = (float)(inputVector[number]);
132 }
133}
134#endif /* LV_HAVE_AVX */
135
136
137#ifdef LV_HAVE_SSE2
138#include <emmintrin.h>
139
/*
 * Narrow num_points doubles from inputVector into floats in outputVector.
 *
 * SSE2 variant using unaligned loads and stores. Four points are handled
 * per iteration: two 128-bit loads of two doubles each are converted and
 * merged into one 128-bit vector of four floats. The remaining points are
 * converted one element at a time with a plain cast.
 */
static inline void volk_64f_convert_32f_u_sse2(float* outputVector,
                                               const double* inputVector,
                                               unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;
    const unsigned int vectorizedPoints = blockCount * 4;
    const double* src = inputVector;
    float* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; ++block) {
        /* Both loads are issued before the store, as in the reference order. */
        const __m128d firstPair = _mm_loadu_pd(src);
        const __m128d secondPair = _mm_loadu_pd(src + 2);

        /* Each conversion leaves its two floats in the low half of the result. */
        const __m128 lowHalf = _mm_cvtpd_ps(firstPair);
        const __m128 highHalf = _mm_cvtpd_ps(secondPair);

        /* Pack: low half of highHalf becomes the upper half of the output. */
        _mm_storeu_ps(dst, _mm_movelh_ps(lowHalf, highHalf));

        src += 4;
        dst += 4;
    }

    /* Scalar tail: fewer than four points remain. */
    for (idx = vectorizedPoints; idx < num_points; ++idx) {
        outputVector[idx] = (float)inputVector[idx];
    }
}
173#endif /* LV_HAVE_SSE2 */
174
175
176#ifdef LV_HAVE_GENERIC
177
/*
 * Portable reference implementation: narrow num_points doubles from
 * inputVector into floats in outputVector, one element at a time, using
 * an ordinary C cast.
 */
static inline void volk_64f_convert_32f_generic(float* outputVector,
                                                const double* inputVector,
                                                unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        outputVector[idx] = (float)inputVector[idx];
    }
}
190#endif /* LV_HAVE_GENERIC */
191
192
193#endif /* INCLUDED_volk_64f_convert_32f_u_H */
194#ifndef INCLUDED_volk_64f_convert_32f_a_H
195#define INCLUDED_volk_64f_convert_32f_a_H
196
197#include <inttypes.h>
198#include <stdio.h>
199
200#ifdef LV_HAVE_AVX512F
201#include <immintrin.h>
202
203static inline void volk_64f_convert_32f_a_avx512f(float* outputVector,
204 const double* inputVector,
205 unsigned int num_points)
206{
207 unsigned int number = 0;
208
209 const unsigned int oneSixteenthPoints = num_points / 16;
210
211 const double* inputVectorPtr = (const double*)inputVector;
212 float* outputVectorPtr = outputVector;
213 __m256 ret1, ret2;
214 __m512d inputVal1, inputVal2;
215
216 for (; number < oneSixteenthPoints; number++) {
217 inputVal1 = _mm512_load_pd(inputVectorPtr);
218 inputVectorPtr += 8;
219 inputVal2 = _mm512_load_pd(inputVectorPtr);
220 inputVectorPtr += 8;
221
222 ret1 = _mm512_cvtpd_ps(inputVal1);
223 ret2 = _mm512_cvtpd_ps(inputVal2);
224
225 _mm256_store_ps(outputVectorPtr, ret1);
226 outputVectorPtr += 8;
227
228 _mm256_store_ps(outputVectorPtr, ret2);
229 outputVectorPtr += 8;
230 }
231
232 number = oneSixteenthPoints * 16;
233 for (; number < num_points; number++) {
234 outputVector[number] = (float)(inputVector[number]);
235 }
236}
237#endif /* LV_HAVE_AVX512F */
238
239
240#ifdef LV_HAVE_AVX
241#include <immintrin.h>
242
243static inline void volk_64f_convert_32f_a_avx(float* outputVector,
244 const double* inputVector,
245 unsigned int num_points)
246{
247 unsigned int number = 0;
248
249 const unsigned int oneEightPoints = num_points / 8;
250
251 const double* inputVectorPtr = (const double*)inputVector;
252 float* outputVectorPtr = outputVector;
253 __m128 ret1, ret2;
254 __m256d inputVal1, inputVal2;
255
256 for (; number < oneEightPoints; number++) {
257 inputVal1 = _mm256_load_pd(inputVectorPtr);
258 inputVectorPtr += 4;
259 inputVal2 = _mm256_load_pd(inputVectorPtr);
260 inputVectorPtr += 4;
261
262 ret1 = _mm256_cvtpd_ps(inputVal1);
263 ret2 = _mm256_cvtpd_ps(inputVal2);
264
265 _mm_store_ps(outputVectorPtr, ret1);
266 outputVectorPtr += 4;
267
268 _mm_store_ps(outputVectorPtr, ret2);
269 outputVectorPtr += 4;
270 }
271
272 number = oneEightPoints * 8;
273 for (; number < num_points; number++) {
274 outputVector[number] = (float)(inputVector[number]);
275 }
276}
277#endif /* LV_HAVE_AVX */
278
279
280#ifdef LV_HAVE_SSE2
281#include <emmintrin.h>
282
/*
 * Narrow num_points doubles from inputVector into floats in outputVector.
 *
 * SSE2 variant using aligned loads and stores: both inputVector and
 * outputVector are expected on 16-byte boundaries. Four points are handled
 * per iteration: two 128-bit loads of two doubles each are converted and
 * merged into one 128-bit vector of four floats. The remaining points are
 * converted one at a time with a cast.
 */
static inline void volk_64f_convert_32f_a_sse2(float* outputVector,
                                               const double* inputVector,
                                               unsigned int num_points)
{
    const double* src = inputVector;
    float* dst = outputVector;
    unsigned int remaining = num_points;

    while (remaining >= 4) {
        /* Both loads are issued before the store, as in the reference order. */
        const __m128d firstPair = _mm_load_pd(src);
        const __m128d secondPair = _mm_load_pd(src + 2);

        /* Each conversion leaves its two floats in the low half of the result. */
        const __m128 lowHalf = _mm_cvtpd_ps(firstPair);
        const __m128 highHalf = _mm_cvtpd_ps(secondPair);

        /* Pack: low half of highHalf becomes the upper half of the output. */
        _mm_store_ps(dst, _mm_movelh_ps(lowHalf, highHalf));

        src += 4;
        dst += 4;
        remaining -= 4;
    }

    /* Scalar tail: fewer than four points remain. */
    while (remaining > 0) {
        *dst++ = (float)(*src++);
        --remaining;
    }
}
316#endif /* LV_HAVE_SSE2 */
317
318
319#endif /* INCLUDED_volk_64f_convert_32f_a_H */