Vector Optimized Library of Kernels (VOLK) 3.1.0
Architecture-tuned implementations of math kernels.
File: volk_32f_s32f_multiply_32f.h
Documentation for this file follows.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
56#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
57#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
58
59#include <inttypes.h>
60#include <stdio.h>
61
62#ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable reference kernel: cVector[i] = aVector[i] * scalar.
 *
 * \param cVector    output buffer, must hold num_points floats
 * \param aVector    input buffer of num_points floats
 * \param scalar     multiplier applied to every sample
 * \param num_points number of samples to process (0 is a no-op)
 */
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const float* in = aVector;
    float* out = cVector;

    for (unsigned int i = 0; i < num_points; ++i) {
        out[i] = in[i] * scalar;
    }
}
72#endif /* LV_HAVE_GENERIC */
73
74#ifdef LV_HAVE_SSE
75#include <xmmintrin.h>
76
/*!
 * \brief SSE kernel, unaligned loads/stores: cVector[i] = aVector[i] * scalar.
 *
 * Processes 4 floats per iteration with SSE, then finishes the 0-3
 * leftover samples with a scalar loop.
 *
 * \param cVector    output buffer (no alignment requirement)
 * \param aVector    input buffer (no alignment requirement)
 * \param scalar     multiplier applied to every sample
 * \param num_points number of samples to process
 */
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const __m128 scalarReg = _mm_set_ps1(scalar);
    const unsigned int simdIters = num_points / 4;

    const float* in = aVector;
    float* out = cVector;

    unsigned int i;
    for (i = 0; i < simdIters; ++i) {
        _mm_storeu_ps(out, _mm_mul_ps(_mm_loadu_ps(in), scalarReg));
        in += 4;
        out += 4;
    }

    /* Scalar tail for the remaining num_points % 4 samples. */
    for (i = simdIters * 4; i < num_points; ++i) {
        *out++ = *in++ * scalar;
    }
}
103#endif /* LV_HAVE_SSE */
104
105#ifdef LV_HAVE_AVX
106#include <immintrin.h>
107
/*!
 * \brief AVX kernel, unaligned loads/stores: cVector[i] = aVector[i] * scalar.
 *
 * Processes 8 floats per iteration with AVX, then finishes the 0-7
 * leftover samples with a scalar loop.
 *
 * \param cVector    output buffer (no alignment requirement)
 * \param aVector    input buffer (no alignment requirement)
 * \param scalar     multiplier applied to every sample
 * \param num_points number of samples to process
 */
static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const __m256 scalarReg = _mm256_set1_ps(scalar);
    const unsigned int simdIters = num_points / 8;

    const float* in = aVector;
    float* out = cVector;

    unsigned int i;
    for (i = 0; i < simdIters; ++i) {
        _mm256_storeu_ps(out, _mm256_mul_ps(_mm256_loadu_ps(in), scalarReg));
        in += 8;
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = simdIters * 8; i < num_points; ++i) {
        *out++ = *in++ * scalar;
    }
}
134#endif /* LV_HAVE_AVX */
135
136#ifdef LV_HAVE_RISCV64
137extern void volk_32f_s32f_multiply_32f_sifive_u74(float* cVector,
138 const float* aVector,
139 const float scalar,
140 unsigned int num_points);
141#endif /* LV_HAVE_RISCV64 */
142
143
144#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
145
146
147#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
148#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
149
150#include <inttypes.h>
151#include <stdio.h>
152
153#ifdef LV_HAVE_SSE
154#include <xmmintrin.h>
155
/*!
 * \brief SSE kernel, aligned loads/stores: cVector[i] = aVector[i] * scalar.
 *
 * Processes 4 floats per iteration with SSE, then finishes the 0-3
 * leftover samples with a scalar loop.
 *
 * \param cVector    output buffer, must be 16-byte aligned
 * \param aVector    input buffer, must be 16-byte aligned
 * \param scalar     multiplier applied to every sample
 * \param num_points number of samples to process
 */
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const __m128 scalarReg = _mm_set_ps1(scalar);
    const unsigned int simdIters = num_points / 4;

    const float* in = aVector;
    float* out = cVector;

    unsigned int i;
    for (i = 0; i < simdIters; ++i) {
        _mm_store_ps(out, _mm_mul_ps(_mm_load_ps(in), scalarReg));
        in += 4;
        out += 4;
    }

    /* Scalar tail for the remaining num_points % 4 samples. */
    for (i = simdIters * 4; i < num_points; ++i) {
        *out++ = *in++ * scalar;
    }
}
182#endif /* LV_HAVE_SSE */
183
184#ifdef LV_HAVE_AVX
185#include <immintrin.h>
186
/*!
 * \brief AVX kernel, aligned loads/stores: cVector[i] = aVector[i] * scalar.
 *
 * Processes 8 floats per iteration with AVX, then finishes the 0-7
 * leftover samples with a scalar loop.
 *
 * \param cVector    output buffer, must be 32-byte aligned
 * \param aVector    input buffer, must be 32-byte aligned
 * \param scalar     multiplier applied to every sample
 * \param num_points number of samples to process
 */
static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const __m256 scalarReg = _mm256_set1_ps(scalar);
    const unsigned int simdIters = num_points / 8;

    const float* in = aVector;
    float* out = cVector;

    unsigned int i;
    for (i = 0; i < simdIters; ++i) {
        _mm256_store_ps(out, _mm256_mul_ps(_mm256_load_ps(in), scalarReg));
        in += 8;
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = simdIters * 8; i < num_points; ++i) {
        *out++ = *in++ * scalar;
    }
}
213#endif /* LV_HAVE_AVX */
214
215#ifdef LV_HAVE_NEON
216#include <arm_neon.h>
217
/*!
 * \brief NEON kernel: cVector[i] = aVector[i] * scalar.
 *
 * Processes 4 floats per iteration with NEON's multiply-by-scalar
 * instruction, then finishes the 0-3 leftover samples scalar-wise.
 *
 * \param cVector    output buffer
 * \param aVector    input buffer
 * \param scalar     multiplier applied to every sample
 * \param num_points number of samples to process
 */
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const unsigned int simdIters = num_points / 4;

    const float* in = aVector;
    float* out = cVector;

    unsigned int i;
    for (i = 0; i < simdIters; ++i) {
        /* vmulq_n_f32 multiplies all four lanes by the scalar directly,
         * so no broadcast register is needed. */
        vst1q_f32(out, vmulq_n_f32(vld1q_f32(in), scalar));
        in += 4;
        out += 4;
    }

    /* Scalar tail for the remaining num_points % 4 samples. */
    for (i = simdIters * 4; i < num_points; ++i) {
        *out++ = *in++ * scalar;
    }
}
240#endif /* LV_HAVE_NEON */
241
242
243#ifdef LV_HAVE_ORC
244
/* ORC-generated kernel; defined in the generated ORC translation unit. */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
                                                  const float* src,
                                                  const float scalar,
                                                  unsigned int num_points);

/*!
 * \brief ORC kernel entry point for unaligned buffers.
 *
 * NOTE(review): this simply forwards to the aligned ORC implementation,
 * which presumably tolerates unaligned pointers — confirm against the
 * generated ORC code.
 */
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}
257
258#endif /* LV_HAVE_ORC */
259
260#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */