Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32fc_accumulator_s32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2019 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
51#ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52#define INCLUDED_volk_32fc_accumulator_s32fc_a_H
53
54#include <inttypes.h>
55#include <volk/volk_common.h>
56
57#ifdef LV_HAVE_GENERIC
/*
 * Portable C accumulator: adds up num_points complex samples read from
 * inputBuffer and writes the total through result.
 *
 * NOTE(review): the line holding the return type, the function name and the
 * first parameter (original line 58) is missing from this listing. The body
 * writes `*result`, so the leading parameter is presumably
 * `lv_32fc_t* result` -- confirm against the upstream header.
 *
 * inputBuffer: complex samples to sum; only read, never written.
 * num_points:  number of complex samples consumed from inputBuffer.
 *
 * When num_points is 0 the loop body never runs and *result receives the
 * value built by lv_cmake(0.f, 0.f).
 */
59 const lv_32fc_t* inputBuffer,
60 unsigned int num_points)
61{
62 const lv_32fc_t* aPtr = inputBuffer;
63 unsigned int number = 0;
64 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
65
/* Add the samples one at a time, in buffer order. */
66 for (; number < num_points; number++) {
67 returnValue += (*aPtr++);
68 }
69 *result = returnValue;
70}
71#endif /* LV_HAVE_GENERIC */
72
73#ifdef LV_HAVE_AVX
74#include <immintrin.h>
75
/*
 * AVX accumulator using unaligned loads: adds up num_points complex samples
 * from inputBuffer and writes the total through result.
 *
 * NOTE(review): the line holding the return type, the function name and the
 * first parameter (original line 76) is missing from this listing. The body
 * writes `*result`, so the leading parameter is presumably
 * `lv_32fc_t* result` -- confirm against the upstream header.
 *
 * inputBuffer: complex samples to sum; loaded with _mm256_loadu_ps, the
 *              unaligned-load intrinsic.
 * num_points:  number of complex samples consumed from inputBuffer.
 *
 * The float* cast and the reduction below assume lv_32fc_t is a packed
 * (real, imag) pair of floats -- confirm in volk_common.h.
 * The samples are summed as four partial sums that are combined at the end,
 * so the order of the floating-point additions differs from a plain
 * sequential loop.
 */
77 const lv_32fc_t* inputBuffer,
78 unsigned int num_points)
79{
80 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
81 unsigned int number = 0;
/* One 256-bit register holds 8 floats, i.e. 4 complex samples. */
82 const unsigned int quarterPoints = num_points / 4;
83
84 const lv_32fc_t* aPtr = inputBuffer;
/* 32-byte aligned scratch area, as required by _mm256_store_ps below. */
85 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
86
87 __m256 accumulator = _mm256_setzero_ps();
88 __m256 aVal = _mm256_setzero_ps();
89
/* Lane-wise accumulation of 4 complex samples per iteration. */
90 for (; number < quarterPoints; number++) {
/* NOTE(review): the cast drops const; (const float*) would be enough. */
91 aVal = _mm256_loadu_ps((float*)aPtr);
92 accumulator = _mm256_add_ps(accumulator, aVal);
93 aPtr += 4;
94 }
95
96 _mm256_store_ps(tempBuffer, accumulator);
97
/* Fold the four partial sums; even slots go in as the first lv_cmake
 * argument and odd slots as the second. */
98 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
99 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
100 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
101 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
102
/* Scalar tail: the num_points % 4 samples the vector loop did not cover. */
103 number = quarterPoints * 4;
104 for (; number < num_points; number++) {
105 returnValue += (*aPtr++);
106 }
107 *result = returnValue;
108}
109#endif /* LV_HAVE_AVX */
110
111#ifdef LV_HAVE_SSE
112#include <xmmintrin.h>
113
/*
 * SSE accumulator using unaligned loads: adds up num_points complex samples
 * from inputBuffer and writes the total through result.
 *
 * NOTE(review): the line holding the return type, the function name and the
 * first parameter (original line 114) is missing from this listing. The body
 * writes `*result`, so the leading parameter is presumably
 * `lv_32fc_t* result` -- confirm against the upstream header.
 *
 * inputBuffer: complex samples to sum; loaded with _mm_loadu_ps, the
 *              unaligned-load intrinsic.
 * num_points:  number of complex samples consumed from inputBuffer.
 *
 * The float* cast and the reduction below assume lv_32fc_t is a packed
 * (real, imag) pair of floats -- confirm in volk_common.h.
 * The samples are summed as two partial sums that are combined at the end,
 * so the order of the floating-point additions differs from a plain
 * sequential loop.
 */
115 const lv_32fc_t* inputBuffer,
116 unsigned int num_points)
117{
118 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
119 unsigned int number = 0;
/* One 128-bit register holds 4 floats, i.e. 2 complex samples. */
120 const unsigned int halfPoints = num_points / 2;
121
122 const lv_32fc_t* aPtr = inputBuffer;
/* 16-byte aligned scratch area, as required by _mm_store_ps below. */
123 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
124
125 __m128 accumulator = _mm_setzero_ps();
126 __m128 aVal = _mm_setzero_ps();
127
/* Lane-wise accumulation of 2 complex samples per iteration. */
128 for (; number < halfPoints; number++) {
/* NOTE(review): the cast drops const; (const float*) would be enough. */
129 aVal = _mm_loadu_ps((float*)aPtr);
130 accumulator = _mm_add_ps(accumulator, aVal);
131 aPtr += 2;
132 }
133
134 _mm_store_ps(tempBuffer, accumulator);
135
/* Fold the two partial sums; even slots go in as the first lv_cmake
 * argument and odd slots as the second. */
136 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
137 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
138
/* Scalar tail: the single leftover sample when num_points is odd. */
139 number = halfPoints * 2;
140 for (; number < num_points; number++) {
141 returnValue += (*aPtr++);
142 }
143 *result = returnValue;
144}
145#endif /* LV_HAVE_SSE */
146
147#ifdef LV_HAVE_AVX
148#include <immintrin.h>
149
/*
 * AVX accumulator using aligned loads: adds up num_points complex samples
 * from inputBuffer and writes the total through result.
 *
 * NOTE(review): the line holding the return type, the function name and the
 * first parameter (original line 150) is missing from this listing. The body
 * writes `*result`, so the leading parameter is presumably
 * `lv_32fc_t* result` -- confirm against the upstream header.
 *
 * inputBuffer: complex samples to sum. They are read with _mm256_load_ps,
 *              the aligned-load intrinsic, so the buffer must start on a
 *              32-byte boundary.
 * num_points:  number of complex samples consumed from inputBuffer.
 *
 * The float* cast and the reduction below assume lv_32fc_t is a packed
 * (real, imag) pair of floats -- confirm in volk_common.h.
 * The samples are summed as four partial sums that are combined at the end,
 * so the order of the floating-point additions differs from a plain
 * sequential loop.
 */
151 const lv_32fc_t* inputBuffer,
152 unsigned int num_points)
153{
154 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
155 unsigned int number = 0;
/* One 256-bit register holds 8 floats, i.e. 4 complex samples. */
156 const unsigned int quarterPoints = num_points / 4;
157
158 const lv_32fc_t* aPtr = inputBuffer;
/* 32-byte aligned scratch area, as required by _mm256_store_ps below. */
159 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
160
161 __m256 accumulator = _mm256_setzero_ps();
162 __m256 aVal = _mm256_setzero_ps();
163
/* Lane-wise accumulation of 4 complex samples per iteration; aPtr advances
 * by 32 bytes each time, so it stays aligned if inputBuffer was. */
164 for (; number < quarterPoints; number++) {
/* NOTE(review): the cast drops const; (const float*) would be enough. */
165 aVal = _mm256_load_ps((float*)aPtr);
166 accumulator = _mm256_add_ps(accumulator, aVal);
167 aPtr += 4;
168 }
169
170 _mm256_store_ps(tempBuffer, accumulator);
171
/* Fold the four partial sums; even slots go in as the first lv_cmake
 * argument and odd slots as the second. */
172 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
173 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
174 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
175 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
176
/* Scalar tail: the num_points % 4 samples the vector loop did not cover. */
177 number = quarterPoints * 4;
178 for (; number < num_points; number++) {
179 returnValue += (*aPtr++);
180 }
181 *result = returnValue;
182}
183#endif /* LV_HAVE_AVX */
184
185#ifdef LV_HAVE_SSE
186#include <xmmintrin.h>
187
/*
 * SSE accumulator using aligned loads: adds up num_points complex samples
 * from inputBuffer and writes the total through result.
 *
 * NOTE(review): the line holding the return type, the function name and the
 * first parameter (original line 188) is missing from this listing. The body
 * writes `*result`, so the leading parameter is presumably
 * `lv_32fc_t* result` -- confirm against the upstream header.
 *
 * inputBuffer: complex samples to sum. They are read with _mm_load_ps, the
 *              aligned-load intrinsic, so the buffer must start on a
 *              16-byte boundary.
 * num_points:  number of complex samples consumed from inputBuffer.
 *
 * The float* cast and the reduction below assume lv_32fc_t is a packed
 * (real, imag) pair of floats -- confirm in volk_common.h.
 * The samples are summed as two partial sums that are combined at the end,
 * so the order of the floating-point additions differs from a plain
 * sequential loop.
 */
189 const lv_32fc_t* inputBuffer,
190 unsigned int num_points)
191{
192 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
193 unsigned int number = 0;
/* One 128-bit register holds 4 floats, i.e. 2 complex samples. */
194 const unsigned int halfPoints = num_points / 2;
195
196 const lv_32fc_t* aPtr = inputBuffer;
/* 16-byte aligned scratch area, as required by _mm_store_ps below. */
197 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
198
199 __m128 accumulator = _mm_setzero_ps();
200 __m128 aVal = _mm_setzero_ps();
201
/* Lane-wise accumulation of 2 complex samples per iteration; aPtr advances
 * by 16 bytes each time, so it stays aligned if inputBuffer was. */
202 for (; number < halfPoints; number++) {
/* NOTE(review): the cast drops const; (const float*) would be enough. */
203 aVal = _mm_load_ps((float*)aPtr);
204 accumulator = _mm_add_ps(accumulator, aVal);
205 aPtr += 2;
206 }
207
208 _mm_store_ps(tempBuffer, accumulator);
209
/* Fold the two partial sums; even slots go in as the first lv_cmake
 * argument and odd slots as the second. */
210 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
211 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
212
/* Scalar tail: the single leftover sample when num_points is odd. */
213 number = halfPoints * 2;
214 for (; number < num_points; number++) {
215 returnValue += (*aPtr++);
216 }
217 *result = returnValue;
218}
219#endif /* LV_HAVE_SSE */
220
221#ifdef LV_HAVE_NEON
222#include <arm_neon.h>
/*
 * NEON accumulator: adds up num_points complex samples from inputBuffer and
 * writes the total through result.
 *
 * NOTE(review): the line holding the return type, the function name and the
 * first parameter (original line 223) is missing from this listing. The body
 * writes `*result`, so the leading parameter is presumably
 * `lv_32fc_t* result` -- confirm against the upstream header.
 *
 * inputBuffer: complex samples to sum; only read, never written.
 * num_points:  number of complex samples consumed from inputBuffer.
 *
 * The main loop is unrolled four times, with each 128-bit load feeding its
 * own accumulator register, so 8 complex samples are consumed per iteration.
 * The float* cast and the reduction below assume lv_32fc_t is a packed
 * (real, imag) pair of floats -- confirm in volk_common.h.
 * The samples are summed as eight partial sums that are combined at the end,
 * so the order of the floating-point additions differs from a plain
 * sequential loop.
 */
224 const lv_32fc_t* inputBuffer,
225 unsigned int num_points)
226{
227 const lv_32fc_t* aPtr = inputBuffer;
228 unsigned int number = 0;
229 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
/* 4 loads per iteration x 2 complex samples per load = 8 samples. */
230 unsigned int eighthPoints = num_points / 8;
231 float32x4_t in_vec;
/* Four independent running sums, one per unrolled load. */
232 float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
233 float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
234 float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
235 float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
/* Scratch area reused to spill each accumulator in turn. */
236 __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];
237
238 for (; number < eighthPoints; number++) {
/* NOTE(review): the casts drop const; (const float*) would be enough. */
239 in_vec = vld1q_f32((float*)aPtr);
240 out_vec0 = vaddq_f32(in_vec, out_vec0);
241 aPtr += 2;
242
243 in_vec = vld1q_f32((float*)aPtr);
244 out_vec1 = vaddq_f32(in_vec, out_vec1);
245 aPtr += 2;
246
247 in_vec = vld1q_f32((float*)aPtr);
248 out_vec2 = vaddq_f32(in_vec, out_vec2);
249 aPtr += 2;
250
251 in_vec = vld1q_f32((float*)aPtr);
252 out_vec3 = vaddq_f32(in_vec, out_vec3);
253 aPtr += 2;
254 }
/* Spill each accumulator and fold its two partial sums into returnValue;
 * even slots go in as the first lv_cmake argument and odd slots as the
 * second. The first fold assigns, the rest add. */
255 vst1q_f32(tempBuffer, out_vec0);
256 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
257 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
258
259 vst1q_f32(tempBuffer, out_vec1);
260 returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
261 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
262
263 vst1q_f32(tempBuffer, out_vec2);
264 returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
265 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
266
267 vst1q_f32(tempBuffer, out_vec3);
268 returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
269 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
270
/* Scalar tail: the num_points % 8 samples the vector loop did not cover. */
271 number = eighthPoints * 8;
272 for (; number < num_points; number++) {
273 returnValue += (*aPtr++);
274 }
275 *result = returnValue;
276}
277#endif /* LV_HAVE_NEON */
278
279#endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */