58#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
59#define INCLUDED_volk_32f_x2_multiply_32f_u_H
/*
 * NOTE(review): the head of this definition (function name and leading parameters) is not
 * visible in this excerpt; only the last parameter and parts of the body are shown.
 * The visible code splits num_points into groups of 4 plus a scalar remainder.
 */
70 unsigned int num_points)
/* Counter shared by the grouped loop and the scalar tail loop below. */
72 unsigned int number = 0;
/* Number of complete groups of 4 elements (integer division). */
73 const unsigned int quarterPoints = num_points / 4;
/* Local cursors initialised from cVector, aVector and bVector; the tail loop advances them. */
75 float* cPtr = cVector;
76 const float* aPtr = aVector;
77 const float* bPtr = bVector;
/* One iteration per group of 4. NOTE(review): the loop body is not shown in this excerpt. */
80 for (; number < quarterPoints; number++) {
/* Restart the counter at the first element index not covered by the groups of 4. */
94 number = quarterPoints * 4;
/* Scalar tail: multiply the remaining num_points % 4 elements one at a time. */
95 for (; number < num_points; number++) {
96 *cPtr++ = (*aPtr++) * (*bPtr++);
101#ifdef LV_HAVE_AVX512F
102#include <immintrin.h>
/*
 * Element-wise product of two float arrays using 512-bit vectors:
 * cVector[i] = aVector[i] * bVector[i].
 *
 * Uses the unaligned load/store intrinsics, so no 64-byte alignment is required by
 * the loads and stores themselves.
 *
 *   cVector     destination for the products
 *   aVector     first input array
 *   bVector     second input array
 *   num_points  number of floats to process
 *
 * NOTE(review): braces and the lines between the store and the tail loop are not
 * visible in this excerpt; confirm the cursors advance by 16 per vector iteration.
 */
104static inline void volk_32f_x2_multiply_32f_u_avx512f(
float* cVector,
105 const float* aVector,
106 const float* bVector,
107 unsigned int num_points)
/* Counter shared by the vector loop and the scalar tail loop. */
109 unsigned int number = 0;
/* Number of complete 16-float vectors in num_points (integer division). */
110 const unsigned int sixteenthPoints = num_points / 16;
/* Local cursors into the output and the two inputs. */
112 float* cPtr = cVector;
113 const float* aPtr = aVector;
114 const float* bPtr = bVector;
116 __m512 aVal, bVal, cVal;
/* Vector loop: one iteration per 16 floats. */
117 for (; number < sixteenthPoints; number++) {
/* Unaligned loads of 16 packed floats from each input. */
119 aVal = _mm512_loadu_ps(aPtr);
120 bVal = _mm512_loadu_ps(bPtr);
/* Lane-wise multiply. */
122 cVal = _mm512_mul_ps(aVal, bVal);
/* Unaligned store of the 16 products. */
124 _mm512_storeu_ps(cPtr, cVal);
/* Restart the counter at the first element index not covered by whole vectors. */
131 number = sixteenthPoints * 16;
/* Scalar tail: multiply the remaining num_points % 16 elements one at a time. */
132 for (; number < num_points; number++) {
133 *cPtr++ = (*aPtr++) * (*bPtr++);
139#include <immintrin.h>
/*
 * NOTE(review): the head of this definition (function name and first parameter) is not
 * visible in this excerpt. The visible body multiplies two float arrays element-wise
 * using 256-bit vectors with unaligned loads/stores, then a scalar remainder loop.
 */
142 const float* aVector,
143 const float* bVector,
144 unsigned int num_points)
/* Counter shared by the vector loop and the scalar tail loop. */
146 unsigned int number = 0;
/* Number of complete 8-float vectors in num_points (integer division). */
147 const unsigned int eighthPoints = num_points / 8;
/* Local cursors into the output and the two inputs. */
149 float* cPtr = cVector;
150 const float* aPtr = aVector;
151 const float* bPtr = bVector;
153 __m256 aVal, bVal, cVal;
/* Vector loop: one iteration per 8 floats. */
154 for (; number < eighthPoints; number++) {
/* Unaligned loads of 8 packed floats from each input. */
156 aVal = _mm256_loadu_ps(aPtr);
157 bVal = _mm256_loadu_ps(bPtr);
/* Lane-wise multiply. */
159 cVal = _mm256_mul_ps(aVal, bVal);
/* Unaligned store of the 8 products. */
/* NOTE(review): no cursor advance is visible after this store in the excerpt; confirm. */
161 _mm256_storeu_ps(cPtr, cVal);
/* Restart the counter at the first element index not covered by whole vectors. */
168 number = eighthPoints * 8;
/* Scalar tail: multiply the remaining num_points % 8 elements one at a time. */
169 for (; number < num_points; number++) {
170 *cPtr++ = (*aPtr++) * (*bPtr++);
176#ifdef LV_HAVE_GENERIC
/*
 * NOTE(review): the head of this definition (function name and first parameter) is not
 * visible in this excerpt. The visible body is a plain scalar loop, with no intrinsics,
 * that writes the element-wise product of two float arrays for num_points elements.
 */
179 const float* aVector,
180 const float* bVector,
181 unsigned int num_points)
/* Local cursors into the output and the two inputs; each advances once per element. */
183 float* cPtr = cVector;
184 const float* aPtr = aVector;
185 const float* bPtr = bVector;
186 unsigned int number = 0;
/* One multiply per element; performs no work when num_points is 0. */
188 for (number = 0; number < num_points; number++) {
189 *cPtr++ = (*aPtr++) * (*bPtr++);
198#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
199#define INCLUDED_volk_32f_x2_multiply_32f_a_H
205#include <xmmintrin.h>
/*
 * NOTE(review): the head of this definition (function name and first parameter) is not
 * visible in this excerpt, and neither is the body of the grouped loop.
 * The visible code splits num_points into groups of 4 plus a scalar remainder.
 */
208 const float* aVector,
209 const float* bVector,
210 unsigned int num_points)
/* Counter shared by the grouped loop and the scalar tail loop below. */
212 unsigned int number = 0;
/* Number of complete groups of 4 elements (integer division). */
213 const unsigned int quarterPoints = num_points / 4;
/* Local cursors into the output and the two inputs. */
215 float* cPtr = cVector;
216 const float* aPtr = aVector;
217 const float* bPtr = bVector;
/* One iteration per group of 4. NOTE(review): the loop body is not shown in this excerpt. */
220 for (; number < quarterPoints; number++) {
/* Restart the counter at the first element index not covered by the groups of 4. */
234 number = quarterPoints * 4;
/* Scalar tail: multiply the remaining num_points % 4 elements one at a time. */
235 for (; number < num_points; number++) {
236 *cPtr++ = (*aPtr++) * (*bPtr++);
241#ifdef LV_HAVE_AVX512F
242#include <immintrin.h>
/*
 * Element-wise product of two float arrays using 512-bit vectors:
 * cVector[i] = aVector[i] * bVector[i].
 *
 * Uses the aligned load/store intrinsics (_mm512_load_ps / _mm512_store_ps), which
 * require the addresses they are given to be 64-byte aligned.
 *
 *   cVector     destination for the products
 *   aVector     first input array
 *   bVector     second input array
 *   num_points  number of floats to process
 *
 * NOTE(review): braces and the lines between the store and the tail loop are not
 * visible in this excerpt; confirm the cursors advance by 16 per vector iteration.
 */
244static inline void volk_32f_x2_multiply_32f_a_avx512f(
float* cVector,
245 const float* aVector,
246 const float* bVector,
247 unsigned int num_points)
/* Counter shared by the vector loop and the scalar tail loop. */
249 unsigned int number = 0;
/* Number of complete 16-float vectors in num_points (integer division). */
250 const unsigned int sixteenthPoints = num_points / 16;
/* Local cursors into the output and the two inputs. */
252 float* cPtr = cVector;
253 const float* aPtr = aVector;
254 const float* bPtr = bVector;
256 __m512 aVal, bVal, cVal;
/* Vector loop: one iteration per 16 floats. */
257 for (; number < sixteenthPoints; number++) {
/* Aligned loads of 16 packed floats from each input. */
259 aVal = _mm512_load_ps(aPtr);
260 bVal = _mm512_load_ps(bPtr);
/* Lane-wise multiply. */
262 cVal = _mm512_mul_ps(aVal, bVal);
/* Aligned store of the 16 products. */
264 _mm512_store_ps(cPtr, cVal);
/* Restart the counter at the first element index not covered by whole vectors. */
271 number = sixteenthPoints * 16;
/* Scalar tail: multiply the remaining num_points % 16 elements one at a time. */
272 for (; number < num_points; number++) {
273 *cPtr++ = (*aPtr++) * (*bPtr++);
280#include <immintrin.h>
/*
 * NOTE(review): the head of this definition (function name and first parameter) is not
 * visible in this excerpt. The visible body multiplies two float arrays element-wise
 * using 256-bit vectors with aligned loads/stores (_mm256_load_ps / _mm256_store_ps
 * require 32-byte aligned addresses), then a scalar remainder loop.
 */
283 const float* aVector,
284 const float* bVector,
285 unsigned int num_points)
/* Counter shared by the vector loop and the scalar tail loop. */
287 unsigned int number = 0;
/* Number of complete 8-float vectors in num_points (integer division). */
288 const unsigned int eighthPoints = num_points / 8;
/* Local cursors into the output and the two inputs. */
290 float* cPtr = cVector;
291 const float* aPtr = aVector;
292 const float* bPtr = bVector;
294 __m256 aVal, bVal, cVal;
/* Vector loop: one iteration per 8 floats. */
295 for (; number < eighthPoints; number++) {
/* Aligned loads of 8 packed floats from each input. */
297 aVal = _mm256_load_ps(aPtr);
298 bVal = _mm256_load_ps(bPtr);
/* Lane-wise multiply. */
300 cVal = _mm256_mul_ps(aVal, bVal);
/* Aligned store of the 8 products. */
/* NOTE(review): no cursor advance is visible after this store in the excerpt; confirm. */
302 _mm256_store_ps(cPtr, cVal);
/* Restart the counter at the first element index not covered by whole vectors. */
309 number = eighthPoints * 8;
/* Scalar tail: multiply the remaining num_points % 8 elements one at a time. */
310 for (; number < num_points; number++) {
311 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * NOTE(review): the head of this definition (function name and first parameter) and the
 * declaration of `number` are not visible in this excerpt. The visible body multiplies
 * two float arrays element-wise with 4-lane NEON vectors, then a scalar remainder loop.
 * Unlike the x86 variants in this file, it works on the parameter pointers directly
 * rather than on local cursor copies.
 */
321 const float* aVector,
322 const float* bVector,
323 unsigned int num_points)
/* Number of complete 4-float vectors in num_points (integer division). */
325 const unsigned int quarter_points = num_points / 4;
327 float32x4_t avec, bvec, cvec;
/* Vector loop: one iteration per 4 floats. */
328 for (number = 0; number < quarter_points; ++number) {
/* Load 4 floats from each input. */
329 avec = vld1q_f32(aVector);
330 bvec = vld1q_f32(bVector);
/* Lane-wise multiply, then store the 4 products. */
331 cvec = vmulq_f32(avec, bvec);
/* NOTE(review): no pointer advance is visible after this store in the excerpt; confirm. */
332 vst1q_f32(cVector, cvec);
/* Scalar tail: multiply the remaining num_points % 4 elements one at a time. */
337 for (number = quarter_points * 4; number < num_points; ++number) {
338 *cVector++ = *aVector++ * *bVector++;
/*
 * Prototype of an externally defined implementation; its body is not part of this file
 * excerpt. It takes the same (cVector, aVector, bVector, num_points) parameter list as
 * the other kernels here and is called by volk_32f_x2_multiply_32f_u_orc.
 * NOTE(review): presumably the ORC-compiled multiply kernel; confirm against the build.
 */
345extern void volk_32f_x2_multiply_32f_a_orc_impl(
float* cVector,
346 const float* aVector,
347 const float* bVector,
348 unsigned int num_points);
/*
 * Thin wrapper that forwards all four arguments unchanged to
 * volk_32f_x2_multiply_32f_a_orc_impl and does no work of its own.
 *
 *   cVector     destination for the products
 *   aVector     first input array
 *   bVector     second input array
 *   num_points  number of floats to process
 *
 * NOTE(review): this wrapper is named `_u_` but calls the `_a_` implementation;
 * presumably that implementation has no alignment requirement, confirm.
 */
350static inline void volk_32f_x2_multiply_32f_u_orc(
float* cVector,
351 const float* aVector,
352 const float* bVector,
353 unsigned int num_points)
355 volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);