Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_x2_subtract_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
59#define INCLUDED_volk_32f_x2_subtract_32f_a_H
60
61#include <inttypes.h>
62#include <stdio.h>
63
64
65#ifdef LV_HAVE_GENERIC
66
/*!
 * \brief Portable C kernel: element-wise difference of two float buffers.
 *
 * For every index i in [0, num_points) stores aVector[i] - bVector[i]
 * into cVector[i]. Elements are processed in ascending index order.
 *
 * \param cVector    Output buffer; receives num_points differences.
 * \param aVector    Minuend buffer (values subtracted from).
 * \param bVector    Subtrahend buffer (values that are subtracted).
 * \param num_points Number of elements to process; 0 leaves cVector untouched.
 */
static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx != num_points; ++idx) {
        const float minuend = aVector[idx];
        const float subtrahend = bVector[idx];
        cVector[idx] = minuend - subtrahend;
    }
}
76#endif /* LV_HAVE_GENERIC */
77
78
79#ifdef LV_HAVE_AVX512F
80#include <immintrin.h>
81
/*!
 * \brief AVX-512F kernel (aligned): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 16 floats per iteration with aligned 512-bit loads/stores
 * (_mm512_load_ps / _mm512_store_ps), so all three buffers must satisfy the
 * alignment those intrinsics require. The num_points % 16 trailing elements
 * are handed to volk_32f_x2_subtract_32f_generic.
 *
 * \param cVector    Output buffer; receives num_points differences.
 * \param aVector    Minuend buffer.
 * \param bVector    Subtrahend buffer.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_load_ps(aVector);
        __m512 bVal = _mm512_load_ps(bVector);

        __m512 cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 16;
        bVector += 16;
        cVector += 16;
    }

    /* The pointers now sit just past the last full 16-float chunk; finish the
     * leftover elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - sixteenthPoints * 16);
}
105#endif /* LV_HAVE_AVX512F */
106
107#ifdef LV_HAVE_AVX
108#include <immintrin.h>
109
/*!
 * \brief AVX kernel (aligned): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 8 floats per iteration with aligned 256-bit loads/stores
 * (_mm256_load_ps / _mm256_store_ps), so all three buffers must satisfy the
 * alignment those intrinsics require. The num_points % 8 trailing elements
 * are handed to volk_32f_x2_subtract_32f_generic.
 *
 * \param cVector    Output buffer; receives num_points differences.
 * \param aVector    Minuend buffer.
 * \param bVector    Subtrahend buffer.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        __m256 aVal = _mm256_load_ps(aVector);
        __m256 bVal = _mm256_load_ps(bVector);

        __m256 cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 8;
        bVector += 8;
        cVector += 8;
    }

    /* The pointers now sit just past the last full 8-float chunk; finish the
     * leftover elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - eighthPoints * 8);
}
133#endif /* LV_HAVE_AVX */
134
135#ifdef LV_HAVE_SSE
136#include <xmmintrin.h>
137
/*!
 * \brief SSE kernel (aligned): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 4 floats per iteration with aligned 128-bit loads/stores
 * (_mm_load_ps / _mm_store_ps), so all three buffers must satisfy the
 * alignment those intrinsics require. The num_points % 4 trailing elements
 * are handed to volk_32f_x2_subtract_32f_generic.
 *
 * \param cVector    Output buffer; receives num_points differences.
 * \param aVector    Minuend buffer.
 * \param bVector    Subtrahend buffer.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        __m128 aVal = _mm_load_ps(aVector);
        __m128 bVal = _mm_load_ps(bVector);

        __m128 cVal = _mm_sub_ps(aVal, bVal);

        _mm_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* The pointers now sit just past the last full 4-float chunk; finish the
     * leftover elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - quarterPoints * 4);
}
161#endif /* LV_HAVE_SSE */
162
163
164#ifdef LV_HAVE_NEON
165#include <arm_neon.h>
166
/*!
 * \brief NEON kernel: cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 4 floats per iteration using vld1q_f32 / vsubq_f32 / vst1q_f32.
 * The num_points % 4 trailing elements are handed to
 * volk_32f_x2_subtract_32f_generic.
 *
 * \param cVector    Output buffer; receives num_points differences.
 * \param aVector    Minuend buffer.
 * \param bVector    Subtrahend buffer.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        float32x4_t a_vec = vld1q_f32(aVector);
        float32x4_t b_vec = vld1q_f32(bVector);

        float32x4_t c_vec = vsubq_f32(a_vec, b_vec);

        vst1q_f32(cVector, c_vec);

        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* The pointers now sit just past the last full 4-float chunk; finish the
     * leftover elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - quarterPoints * 4);
}
190#endif /* LV_HAVE_NEON */
191
192
193#ifdef LV_HAVE_ORC
/* Implemented outside this header (presumably ORC-generated code -- confirm
 * against the build); only the prototype is visible here. */
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*!
 * \brief ORC entry point for the float subtraction kernel.
 *
 * Pure forwarder: passes every argument unchanged to
 * volk_32f_x2_subtract_32f_a_orc_impl and does no work of its own.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
206#endif /* LV_HAVE_ORC */
207
208
209#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
210
211
212#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
213#define INCLUDED_volk_32f_x2_subtract_32f_u_H
214
215#include <inttypes.h>
216#include <stdio.h>
217
218#ifdef LV_HAVE_AVX512F
219#include <immintrin.h>
220
/*!
 * \brief AVX-512F kernel (unaligned): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 16 floats per iteration with unaligned 512-bit loads/stores
 * (_mm512_loadu_ps / _mm512_storeu_ps). The num_points % 16 trailing
 * elements are handed to volk_32f_x2_subtract_32f_generic.
 *
 * \param cVector    Output buffer; receives num_points differences.
 * \param aVector    Minuend buffer.
 * \param bVector    Subtrahend buffer.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_loadu_ps(aVector);
        __m512 bVal = _mm512_loadu_ps(bVector);

        __m512 cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_storeu_ps(cVector, cVal); // Store the results back into the C container

        aVector += 16;
        bVector += 16;
        cVector += 16;
    }

    /* The pointers now sit just past the last full 16-float chunk; finish the
     * leftover elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - sixteenthPoints * 16);
}
244#endif /* LV_HAVE_AVX512F */
245
246
247#ifdef LV_HAVE_AVX
248#include <immintrin.h>
249
/*!
 * \brief AVX kernel (unaligned): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 8 floats per iteration with unaligned 256-bit loads/stores
 * (_mm256_loadu_ps / _mm256_storeu_ps). The num_points % 8 trailing
 * elements are handed to volk_32f_x2_subtract_32f_generic.
 *
 * \param cVector    Output buffer; receives num_points differences.
 * \param aVector    Minuend buffer.
 * \param bVector    Subtrahend buffer.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        __m256 aVal = _mm256_loadu_ps(aVector);
        __m256 bVal = _mm256_loadu_ps(bVector);

        __m256 cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_storeu_ps(cVector, cVal); // Store the results back into the C container

        aVector += 8;
        bVector += 8;
        cVector += 8;
    }

    /* The pointers now sit just past the last full 8-float chunk; finish the
     * leftover elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - eighthPoints * 8);
}
273#endif /* LV_HAVE_AVX */
274
275#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */