Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_add_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
61#ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
62#define INCLUDED_volk_32fc_x2_add_32fc_u_H
63
64#ifdef LV_HAVE_AVX
65#include <immintrin.h>
66
67static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
68 const lv_32fc_t* aVector,
69 const lv_32fc_t* bVector,
70 unsigned int num_points)
71{
72 unsigned int number = 0;
73 const unsigned int quarterPoints = num_points / 4;
74
75 lv_32fc_t* cPtr = cVector;
76 const lv_32fc_t* aPtr = aVector;
77 const lv_32fc_t* bPtr = bVector;
78
79 __m256 aVal, bVal, cVal;
80 for (; number < quarterPoints; number++) {
81
82 aVal = _mm256_loadu_ps((float*)aPtr);
83 bVal = _mm256_loadu_ps((float*)bPtr);
84
85 cVal = _mm256_add_ps(aVal, bVal);
86
87 _mm256_storeu_ps((float*)cPtr,
88 cVal); // Store the results back into the C container
89
90 aPtr += 4;
91 bPtr += 4;
92 cPtr += 4;
93 }
94
95 number = quarterPoints * 4;
96 for (; number < num_points; number++) {
97 *cPtr++ = (*aPtr++) + (*bPtr++);
98 }
99}
100#endif /* LV_HAVE_AVX */
101
102
103#ifdef LV_HAVE_AVX
104#include <immintrin.h>
105
106static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
107 const lv_32fc_t* aVector,
108 const lv_32fc_t* bVector,
109 unsigned int num_points)
110{
111 unsigned int number = 0;
112 const unsigned int quarterPoints = num_points / 4;
113
114 lv_32fc_t* cPtr = cVector;
115 const lv_32fc_t* aPtr = aVector;
116 const lv_32fc_t* bPtr = bVector;
117
118 __m256 aVal, bVal, cVal;
119 for (; number < quarterPoints; number++) {
120
121 aVal = _mm256_load_ps((float*)aPtr);
122 bVal = _mm256_load_ps((float*)bPtr);
123
124 cVal = _mm256_add_ps(aVal, bVal);
125
126 _mm256_store_ps((float*)cPtr,
127 cVal); // Store the results back into the C container
128
129 aPtr += 4;
130 bPtr += 4;
131 cPtr += 4;
132 }
133
134 number = quarterPoints * 4;
135 for (; number < num_points; number++) {
136 *cPtr++ = (*aPtr++) + (*bPtr++);
137 }
138}
139#endif /* LV_HAVE_AVX */
140
141
142#ifdef LV_HAVE_SSE
143#include <xmmintrin.h>
144
145static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
146 const lv_32fc_t* aVector,
147 const lv_32fc_t* bVector,
148 unsigned int num_points)
149{
150 unsigned int number = 0;
151 const unsigned int halfPoints = num_points / 2;
152
153 lv_32fc_t* cPtr = cVector;
154 const lv_32fc_t* aPtr = aVector;
155 const lv_32fc_t* bPtr = bVector;
156
157 __m128 aVal, bVal, cVal;
158 for (; number < halfPoints; number++) {
159
160 aVal = _mm_loadu_ps((float*)aPtr);
161 bVal = _mm_loadu_ps((float*)bPtr);
162
163 cVal = _mm_add_ps(aVal, bVal);
164
165 _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
166
167 aPtr += 2;
168 bPtr += 2;
169 cPtr += 2;
170 }
171
172 number = halfPoints * 2;
173 for (; number < num_points; number++) {
174 *cPtr++ = (*aPtr++) + (*bPtr++);
175 }
176}
177#endif /* LV_HAVE_SSE */
178
179
180#ifdef LV_HAVE_GENERIC
181
182static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
183 const lv_32fc_t* aVector,
184 const lv_32fc_t* bVector,
185 unsigned int num_points)
186{
187 lv_32fc_t* cPtr = cVector;
188 const lv_32fc_t* aPtr = aVector;
189 const lv_32fc_t* bPtr = bVector;
190 unsigned int number = 0;
191
192 for (number = 0; number < num_points; number++) {
193 *cPtr++ = (*aPtr++) + (*bPtr++);
194 }
195}
196#endif /* LV_HAVE_GENERIC */
197
198
199#ifdef LV_HAVE_SSE
200#include <xmmintrin.h>
201
202static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
203 const lv_32fc_t* aVector,
204 const lv_32fc_t* bVector,
205 unsigned int num_points)
206{
207 unsigned int number = 0;
208 const unsigned int halfPoints = num_points / 2;
209
210 lv_32fc_t* cPtr = cVector;
211 const lv_32fc_t* aPtr = aVector;
212 const lv_32fc_t* bPtr = bVector;
213
214 __m128 aVal, bVal, cVal;
215 for (; number < halfPoints; number++) {
216 aVal = _mm_load_ps((float*)aPtr);
217 bVal = _mm_load_ps((float*)bPtr);
218
219 cVal = _mm_add_ps(aVal, bVal);
220
221 _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
222
223 aPtr += 2;
224 bPtr += 2;
225 cPtr += 2;
226 }
227
228 number = halfPoints * 2;
229 for (; number < num_points; number++) {
230 *cPtr++ = (*aPtr++) + (*bPtr++);
231 }
232}
233#endif /* LV_HAVE_SSE */
234
235
236#ifdef LV_HAVE_NEON
237#include <arm_neon.h>
238
static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    /* Element-wise complex sum, c[i] = a[i] + b[i], NEON path.
     * A 128-bit q register holds four floats, i.e. two lv_32fc_t, so each
     * vaddq_f32 adds two complex points at once. (The original comment
     * claimed four per vadd; it is two.) */
    const unsigned int vec_iters = num_points / 2;

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    float32x4_t av, bv;
    for (unsigned int i = 0; i < vec_iters; i++) {
        // Load two complex values from each input into NEON registers.
        av = vld1q_f32((const float32_t*)(a));
        bv = vld1q_f32((const float32_t*)(b));
        // Hint the next iteration's inputs into cache.
        __VOLK_PREFETCH(a + 2);
        __VOLK_PREFETCH(b + 2);

        // Vector add, then store the two results into the C container.
        vst1q_f32((float*)(c), vaddq_f32(av, bv));

        a += 2;
        b += 2;
        c += 2;
    }

    /* Scalar tail: only taken when num_points is odd. */
    for (unsigned int i = vec_iters * 2; i < num_points; i++) {
        *c++ = (*a++) + (*b++);
    }
}
273
274#endif /* LV_HAVE_NEON */
275
276
#endif /* INCLUDED_volk_32fc_x2_add_32fc_u_H */