Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32i_x2_or_32i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
68#ifndef INCLUDED_volk_32i_x2_or_32i_a_H
69#define INCLUDED_volk_32i_x2_or_32i_a_H
70
71#include <inttypes.h>
72#include <stdio.h>
73
74#ifdef LV_HAVE_AVX512F
75#include <immintrin.h>
76
77static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
78 const int32_t* aVector,
79 const int32_t* bVector,
80 unsigned int num_points)
81{
82 unsigned int number = 0;
83 const unsigned int sixteenthPoints = num_points / 16;
84
85 int32_t* cPtr = (int32_t*)cVector;
86 const int32_t* aPtr = (int32_t*)aVector;
87 const int32_t* bPtr = (int32_t*)bVector;
88
89 __m512i aVal, bVal, cVal;
90 for (; number < sixteenthPoints; number++) {
91
92 aVal = _mm512_load_si512(aPtr);
93 bVal = _mm512_load_si512(bPtr);
94
95 cVal = _mm512_or_si512(aVal, bVal);
96
97 _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
98
99 aPtr += 16;
100 bPtr += 16;
101 cPtr += 16;
102 }
103
104 number = sixteenthPoints * 16;
105 for (; number < num_points; number++) {
106 cVector[number] = aVector[number] | bVector[number];
107 }
108}
109#endif /* LV_HAVE_AVX512F */
110
111#ifdef LV_HAVE_AVX2
112#include <immintrin.h>
113
114static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
115 const int32_t* aVector,
116 const int32_t* bVector,
117 unsigned int num_points)
118{
119 unsigned int number = 0;
120 const unsigned int oneEightPoints = num_points / 8;
121
122 int32_t* cPtr = cVector;
123 const int32_t* aPtr = aVector;
124 const int32_t* bPtr = bVector;
125
126 __m256i aVal, bVal, cVal;
127 for (; number < oneEightPoints; number++) {
128
129 aVal = _mm256_load_si256((__m256i*)aPtr);
130 bVal = _mm256_load_si256((__m256i*)bPtr);
131
132 cVal = _mm256_or_si256(aVal, bVal);
133
134 _mm256_store_si256((__m256i*)cPtr,
135 cVal); // Store the results back into the C container
136
137 aPtr += 8;
138 bPtr += 8;
139 cPtr += 8;
140 }
141
142 number = oneEightPoints * 8;
143 for (; number < num_points; number++) {
144 cVector[number] = aVector[number] | bVector[number];
145 }
146}
147#endif /* LV_HAVE_AVX2 */
148
149
150#ifdef LV_HAVE_SSE
151#include <xmmintrin.h>
152
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * SSE variant. The integer buffers are viewed as packed floats so that the
 * single-precision intrinsics from <xmmintrin.h> can be used; _mm_or_ps is a
 * pure bitwise OR, so the integer bit patterns are combined unchanged.
 * The aligned _mm_load_ps / _mm_store_ps intrinsics are used, so all three
 * buffers are expected to be 16-byte aligned.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    /* Float-typed views of the same memory, required by the _ps intrinsics. */
    float* out = (float*)cVector;
    const float* in_a = (const float*)aVector;
    const float* in_b = (const float*)bVector;

    /* Number of elements covered by whole 4-lane (128-bit) blocks. */
    const unsigned int simd_end = (num_points / 4) * 4;
    unsigned int idx = 0;

    while (idx < simd_end) {
        const __m128 lhs = _mm_load_ps(in_a + idx);
        const __m128 rhs = _mm_load_ps(in_b + idx);
        _mm_store_ps(out + idx, _mm_or_ps(lhs, rhs));
        idx += 4;
    }

    /* Scalar clean-up for the num_points % 4 trailing elements. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
184#endif /* LV_HAVE_SSE */
185
186
187#ifdef LV_HAVE_NEON
188#include <arm_neon.h>
189
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * NEON variant: four elements at a time via vld1q_s32 / vorrq_s32 /
 * vst1q_s32, followed by a scalar loop for the remainder.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points)
{
    /* Number of elements covered by whole 4-lane (128-bit) blocks. */
    const unsigned int simd_end = (num_points / 4) * 4;
    unsigned int idx = 0;

    for (; idx < simd_end; idx += 4) {
        const int32x4_t lhs = vld1q_s32(aVector + idx);
        const int32x4_t rhs = vld1q_s32(bVector + idx);
        vst1q_s32(cVector + idx, vorrq_s32(lhs, rhs));
    }

    /* Scalar clean-up for the num_points % 4 trailing elements. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
217#endif /* LV_HAVE_NEON */
218
219
220#ifdef LV_HAVE_GENERIC
221
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Portable scalar implementation with no SIMD intrinsics.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
236#endif /* LV_HAVE_GENERIC */
237
238
239#ifdef LV_HAVE_ORC
/*
 * Implementation symbol that is only declared here; its definition is not
 * part of this header (presumably emitted by the ORC code generator, given
 * the LV_HAVE_ORC guard -- confirm against the build).
 */
extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
                                          const int32_t* aVector,
                                          const int32_t* bVector,
                                          unsigned int num_points);

/*
 * Element-wise bitwise OR of two int32 vectors, delegated unchanged to
 * volk_32i_x2_or_32i_a_orc_impl.
 *
 * NOTE(review): this wrapper is named "_u_orc" while the symbol it forwards
 * to is named "_a_orc_impl"; the forwarding itself adds no alignment handling.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
252#endif /* LV_HAVE_ORC */
253
254
255#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
256
257
258#ifndef INCLUDED_volk_32i_x2_or_32i_u_H
259#define INCLUDED_volk_32i_x2_or_32i_u_H
260
261#include <inttypes.h>
262#include <stdio.h>
263
264#ifdef LV_HAVE_AVX512F
265#include <immintrin.h>
266
267static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
268 const int32_t* aVector,
269 const int32_t* bVector,
270 unsigned int num_points)
271{
272 unsigned int number = 0;
273 const unsigned int sixteenthPoints = num_points / 16;
274
275 int32_t* cPtr = (int32_t*)cVector;
276 const int32_t* aPtr = (int32_t*)aVector;
277 const int32_t* bPtr = (int32_t*)bVector;
278
279 __m512i aVal, bVal, cVal;
280 for (; number < sixteenthPoints; number++) {
281
282 aVal = _mm512_loadu_si512(aPtr);
283 bVal = _mm512_loadu_si512(bPtr);
284
285 cVal = _mm512_or_si512(aVal, bVal);
286
287 _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
288
289 aPtr += 16;
290 bPtr += 16;
291 cPtr += 16;
292 }
293
294 number = sixteenthPoints * 16;
295 for (; number < num_points; number++) {
296 cVector[number] = aVector[number] | bVector[number];
297 }
298}
299#endif /* LV_HAVE_AVX512F */
300
301#ifdef LV_HAVE_AVX2
302#include <immintrin.h>
303
304static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
305 const int32_t* aVector,
306 const int32_t* bVector,
307 unsigned int num_points)
308{
309 unsigned int number = 0;
310 const unsigned int oneEightPoints = num_points / 8;
311
312 int32_t* cPtr = cVector;
313 const int32_t* aPtr = aVector;
314 const int32_t* bPtr = bVector;
315
316 __m256i aVal, bVal, cVal;
317 for (; number < oneEightPoints; number++) {
318
319 aVal = _mm256_loadu_si256((__m256i*)aPtr);
320 bVal = _mm256_loadu_si256((__m256i*)bPtr);
321
322 cVal = _mm256_or_si256(aVal, bVal);
323
324 _mm256_storeu_si256((__m256i*)cPtr,
325 cVal); // Store the results back into the C container
326
327 aPtr += 8;
328 bPtr += 8;
329 cPtr += 8;
330 }
331
332 number = oneEightPoints * 8;
333 for (; number < num_points; number++) {
334 cVector[number] = aVector[number] | bVector[number];
335 }
336}
337#endif /* LV_HAVE_AVX2 */
338
339
340#endif /* INCLUDED_volk_32i_x2_or_32i_u_H */