Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_conjugate_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
55#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
56#define INCLUDED_volk_32fc_conjugate_32fc_u_H
57
58#include <float.h>
59#include <inttypes.h>
60#include <stdio.h>
61#include <volk/volk_complex.h>
62
63#ifdef LV_HAVE_AVX
64#include <immintrin.h>
65
66static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
67 const lv_32fc_t* aVector,
68 unsigned int num_points)
69{
70 unsigned int number = 0;
71 const unsigned int quarterPoints = num_points / 4;
72
73 __m256 x;
74 lv_32fc_t* c = cVector;
75 const lv_32fc_t* a = aVector;
76
77 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
78
79 for (; number < quarterPoints; number++) {
80
81 x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
82
83 x = _mm256_xor_ps(x, conjugator); // conjugate register
84
85 _mm256_storeu_ps((float*)c, x); // Store the results back into the C container
86
87 a += 4;
88 c += 4;
89 }
90
91 number = quarterPoints * 4;
92
93 for (; number < num_points; number++) {
94 *c++ = lv_conj(*a++);
95 }
96}
97#endif /* LV_HAVE_AVX */
98
99#ifdef LV_HAVE_SSE3
100#include <pmmintrin.h>
101
102static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
103 const lv_32fc_t* aVector,
104 unsigned int num_points)
105{
106 unsigned int number = 0;
107 const unsigned int halfPoints = num_points / 2;
108
109 __m128 x;
110 lv_32fc_t* c = cVector;
111 const lv_32fc_t* a = aVector;
112
113 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
114
115 for (; number < halfPoints; number++) {
116
117 x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
118
119 x = _mm_xor_ps(x, conjugator); // conjugate register
120
121 _mm_storeu_ps((float*)c, x); // Store the results back into the C container
122
123 a += 2;
124 c += 2;
125 }
126
127 if ((num_points % 2) != 0) {
128 *c = lv_conj(*a);
129 }
130}
131#endif /* LV_HAVE_SSE3 */
132
133#ifdef LV_HAVE_GENERIC
134
135static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
136 const lv_32fc_t* aVector,
137 unsigned int num_points)
138{
139 lv_32fc_t* cPtr = cVector;
140 const lv_32fc_t* aPtr = aVector;
141 unsigned int number = 0;
142
143 for (number = 0; number < num_points; number++) {
144 *cPtr++ = lv_conj(*aPtr++);
145 }
146}
147#endif /* LV_HAVE_GENERIC */
148
149
150#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
151#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
152#define INCLUDED_volk_32fc_conjugate_32fc_a_H
153
154#include <float.h>
155#include <inttypes.h>
156#include <stdio.h>
157#include <volk/volk_complex.h>
158
159#ifdef LV_HAVE_AVX
160#include <immintrin.h>
161
162static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
163 const lv_32fc_t* aVector,
164 unsigned int num_points)
165{
166 unsigned int number = 0;
167 const unsigned int quarterPoints = num_points / 4;
168
169 __m256 x;
170 lv_32fc_t* c = cVector;
171 const lv_32fc_t* a = aVector;
172
173 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
174
175 for (; number < quarterPoints; number++) {
176
177 x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
178
179 x = _mm256_xor_ps(x, conjugator); // conjugate register
180
181 _mm256_store_ps((float*)c, x); // Store the results back into the C container
182
183 a += 4;
184 c += 4;
185 }
186
187 number = quarterPoints * 4;
188
189 for (; number < num_points; number++) {
190 *c++ = lv_conj(*a++);
191 }
192}
193#endif /* LV_HAVE_AVX */
194
195#ifdef LV_HAVE_SSE3
196#include <pmmintrin.h>
197
198static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
199 const lv_32fc_t* aVector,
200 unsigned int num_points)
201{
202 unsigned int number = 0;
203 const unsigned int halfPoints = num_points / 2;
204
205 __m128 x;
206 lv_32fc_t* c = cVector;
207 const lv_32fc_t* a = aVector;
208
209 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
210
211 for (; number < halfPoints; number++) {
212
213 x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
214
215 x = _mm_xor_ps(x, conjugator); // conjugate register
216
217 _mm_store_ps((float*)c, x); // Store the results back into the C container
218
219 a += 2;
220 c += 2;
221 }
222
223 if ((num_points % 2) != 0) {
224 *c = lv_conj(*a);
225 }
226}
227#endif /* LV_HAVE_SSE3 */
228
229#ifdef LV_HAVE_NEON
230#include <arm_neon.h>
231
/*
 * NEON kernel: writes the complex conjugate of each of the num_points
 * elements of aVector into cVector.
 *
 * Four complex values are handled per iteration with a de-interleaving
 * load, a negate of the imaginary vector, and a re-interleaving store.
 * The remaining num_points % 4 values are conjugated with lv_conj().
 */
static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   unsigned int num_points)
{
    const unsigned int vecIters = num_points / 4;
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    float32x4x2_t planes;
    unsigned int i;

    for (i = 0; i < vecIters; i++, src += 4, dst += 4) {
        /* Hint for the next group of four; __VOLK_PREFETCH is defined
         * elsewhere in the project. */
        __VOLK_PREFETCH(src + 4);

        /* De-interleave: val[0] = ar,br,cr,dr and val[1] = ai,bi,ci,di */
        planes = vld2q_f32((const float*)src);

        /* Negate the imaginary parts; real parts pass through untouched. */
        planes.val[1] = vnegq_f32(planes.val[1]);

        /* Re-interleave and write back as re,im pairs. */
        vst2q_f32((float*)dst, planes);
    }

    /* Scalar tail for the points that did not fill a whole group. */
    for (i = vecIters * 4; i < num_points; i++) {
        *dst++ = lv_conj(*src++);
    }
}
260#endif /* LV_HAVE_NEON */
261
262
263#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */