Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_deinterleave_real_64f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H
59#define INCLUDED_volk_32fc_deinterleave_real_64f_a_H
60
61#include <inttypes.h>
62#include <stdio.h>
63
64#ifdef LV_HAVE_AVX2
65#include <immintrin.h>
66
67static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer,
68 const lv_32fc_t* complexVector,
69 unsigned int num_points)
70{
71 unsigned int number = 0;
72
73 const float* complexVectorPtr = (float*)complexVector;
74 double* iBufferPtr = iBuffer;
75
76 const unsigned int quarterPoints = num_points / 4;
77 __m256 cplxValue;
78 __m128 fVal;
79 __m256d dVal;
80 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
81 for (; number < quarterPoints; number++) {
82
83 cplxValue = _mm256_load_ps(complexVectorPtr);
84 complexVectorPtr += 8;
85
86 // Arrange in i1i2i1i2 format
87 cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
88 fVal = _mm256_extractf128_ps(cplxValue, 0);
89 dVal = _mm256_cvtps_pd(fVal);
90 _mm256_store_pd(iBufferPtr, dVal);
91
92 iBufferPtr += 4;
93 }
94
95 number = quarterPoints * 4;
96 for (; number < num_points; number++) {
97 *iBufferPtr++ = (double)*complexVectorPtr++;
98 complexVectorPtr++;
99 }
100}
101#endif /* LV_HAVE_AVX2 */
102
103#ifdef LV_HAVE_SSE2
104#include <emmintrin.h>
105
106static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer,
107 const lv_32fc_t* complexVector,
108 unsigned int num_points)
109{
110 unsigned int number = 0;
111
112 const float* complexVectorPtr = (float*)complexVector;
113 double* iBufferPtr = iBuffer;
114
115 const unsigned int halfPoints = num_points / 2;
116 __m128 cplxValue, fVal;
117 __m128d dVal;
118 for (; number < halfPoints; number++) {
119
120 cplxValue = _mm_load_ps(complexVectorPtr);
121 complexVectorPtr += 4;
122
123 // Arrange in i1i2i1i2 format
124 fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
125 dVal = _mm_cvtps_pd(fVal);
126 _mm_store_pd(iBufferPtr, dVal);
127
128 iBufferPtr += 2;
129 }
130
131 number = halfPoints * 2;
132 for (; number < num_points; number++) {
133 *iBufferPtr++ = (double)*complexVectorPtr++;
134 complexVectorPtr++;
135 }
136}
137#endif /* LV_HAVE_SSE2 */
138
139#ifdef LV_HAVE_GENERIC
140
141static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer,
142 const lv_32fc_t* complexVector,
143 unsigned int num_points)
144{
145 unsigned int number = 0;
146 const float* complexVectorPtr = (float*)complexVector;
147 double* iBufferPtr = iBuffer;
148 for (number = 0; number < num_points; number++) {
149 *iBufferPtr++ = (double)*complexVectorPtr++;
150 complexVectorPtr++;
151 }
152}
153#endif /* LV_HAVE_GENERIC */
154
155#ifdef LV_HAVE_NEONV8
156#include <arm_neon.h>
157
/*!
 * \brief Copies the first float of every interleaved complex sample (the real
 *        component) into a buffer of doubles. NEON (ARMv8) kernel.
 *
 * \param iBuffer       Destination for num_points doubles.
 * \param complexVector Source samples, accessed as consecutive float pairs.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    /* View the samples as raw floats without dropping the const qualifier. */
    const float* complexVectorPtr = (const float*)complexVector;
    double* iBufferPtr = iBuffer;

    const unsigned int quarter_points = num_points / 4;

    /* Vector path: four complex samples (eight floats) per iteration. */
    for (unsigned int number = 0; number < quarter_points; number++) {
        /* vld4 de-interleaves with stride 4: val[0] = {f0, f4} and
         * val[2] = {f2, f6}, i.e. the even-indexed floats of the block. */
        const float32x2x4_t complexInput = vld4_f32(complexVectorPtr);

        /* Widen both halves from single to double precision. */
        float64x2x2_t iVal;
        iVal.val[0] = vcvt_f64_f32(complexInput.val[0]);
        iVal.val[1] = vcvt_f64_f32(complexInput.val[2]);

        /* vst2q re-interleaves the two registers, so memory receives
         * f0, f2, f4, f6 in their original sample order. */
        vst2q_f64(iBufferPtr, iVal);

        iBufferPtr += 4;
        complexVectorPtr += 8;
    }

    /* Scalar tail: the up to three samples that did not fill a vector. */
    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
        *iBufferPtr++ = (double)complexVectorPtr[0];
        complexVectorPtr += 2; /* skip the second float of the pair */
    }
}
194#endif /* LV_HAVE_NEONV8 */
195
196#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */
197
198#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_u_H
199#define INCLUDED_volk_32fc_deinterleave_real_64f_u_H
200
201#include <inttypes.h>
202#include <stdio.h>
203
204#ifdef LV_HAVE_AVX2
205#include <immintrin.h>
206
207static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer,
208 const lv_32fc_t* complexVector,
209 unsigned int num_points)
210{
211 unsigned int number = 0;
212
213 const float* complexVectorPtr = (float*)complexVector;
214 double* iBufferPtr = iBuffer;
215
216 const unsigned int quarterPoints = num_points / 4;
217 __m256 cplxValue;
218 __m128 fVal;
219 __m256d dVal;
220 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
221 for (; number < quarterPoints; number++) {
222
223 cplxValue = _mm256_loadu_ps(complexVectorPtr);
224 complexVectorPtr += 8;
225
226 // Arrange in i1i2i1i2 format
227 cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
228 fVal = _mm256_extractf128_ps(cplxValue, 0);
229 dVal = _mm256_cvtps_pd(fVal);
230 _mm256_storeu_pd(iBufferPtr, dVal);
231
232 iBufferPtr += 4;
233 }
234
235 number = quarterPoints * 4;
236 for (; number < num_points; number++) {
237 *iBufferPtr++ = (double)*complexVectorPtr++;
238 complexVectorPtr++;
239 }
240}
241#endif /* LV_HAVE_AVX2 */
242
243#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_u_H */