/*
 * Vector Optimized Library of Kernels (VOLK) 3.1.0
 * Architecture-tuned implementations of math kernels.
 *
 * File: volk_64f_x2_max_64f.h
 * See the generated documentation for this file.
 */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
58#ifndef INCLUDED_volk_64f_x2_max_64f_a_H
59#define INCLUDED_volk_64f_x2_max_64f_a_H
60
61#include <inttypes.h>
62#include <stdio.h>
63
64#ifdef LV_HAVE_AVX512F
65#include <immintrin.h>
66
/*
 * Element-wise maximum of two double-precision vectors (AVX-512F, aligned).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points).
 *
 * cVector    - output buffer; num_points doubles are written
 * aVector    - first input buffer; num_points doubles are read
 * bVector    - second input buffer; num_points doubles are read
 * num_points - number of elements to process
 *
 * The vector loop uses _mm512_load_pd / _mm512_store_pd, which require
 * 64-byte aligned addresses, so all three buffers must be 64-byte aligned.
 */
static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector,
                                                 const double* aVector,
                                                 const double* bVector,
                                                 unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    /* Main loop: eight doubles per iteration. */
    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m512d aVal = _mm512_load_pd(aPtr);
        const __m512d bVal = _mm512_load_pd(bPtr);
        const __m512d cVal = _mm512_max_pd(aVal, bVal);

        _mm512_store_pd(cPtr, cVal); /* write the eight results to the output */

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the remaining num_points % 8 elements. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        const double a = *aPtr++;
        const double b = *bPtr++;
        /* b is stored whenever a > b is false, which includes NaN operands. */
        *cPtr++ = (a > b ? a : b);
    }
}
101#endif /* LV_HAVE_AVX512F */
102
103
104#ifdef LV_HAVE_AVX
105#include <immintrin.h>
106
/*
 * Element-wise maximum of two double-precision vectors (AVX, aligned).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points).
 *
 * cVector    - output buffer; num_points doubles are written
 * aVector    - first input buffer; num_points doubles are read
 * bVector    - second input buffer; num_points doubles are read
 * num_points - number of elements to process
 *
 * The vector loop uses _mm256_load_pd / _mm256_store_pd, which require
 * 32-byte aligned addresses, so all three buffers must be 32-byte aligned.
 */
static inline void volk_64f_x2_max_64f_a_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    /* Main loop: four doubles per iteration. */
    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m256d aVal = _mm256_load_pd(aPtr);
        const __m256d bVal = _mm256_load_pd(bPtr);
        const __m256d cVal = _mm256_max_pd(aVal, bVal);

        _mm256_store_pd(cPtr, cVal); /* write the four results to the output */

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the remaining num_points % 4 elements. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        const double a = *aPtr++;
        const double b = *bPtr++;
        /* b is stored whenever a > b is false, which includes NaN operands. */
        *cPtr++ = (a > b ? a : b);
    }
}
141#endif /* LV_HAVE_AVX */
142
143
144#ifdef LV_HAVE_SSE2
145#include <emmintrin.h>
146
/*
 * Element-wise maximum of two double-precision vectors (SSE2, aligned).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points).
 *
 * cVector    - output buffer; num_points doubles are written
 * aVector    - first input buffer; num_points doubles are read
 * bVector    - second input buffer; num_points doubles are read
 * num_points - number of elements to process
 *
 * The vector loop uses _mm_load_pd / _mm_store_pd, which require
 * 16-byte aligned addresses, so all three buffers must be 16-byte aligned.
 */
static inline void volk_64f_x2_max_64f_a_sse2(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    const unsigned int halfPoints = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    /* Main loop: two doubles per iteration. */
    for (unsigned int number = 0; number < halfPoints; number++) {
        const __m128d aVal = _mm_load_pd(aPtr);
        const __m128d bVal = _mm_load_pd(bPtr);
        const __m128d cVal = _mm_max_pd(aVal, bVal);

        _mm_store_pd(cPtr, cVal); /* write the two results to the output */

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    /* Scalar tail: at most one element when num_points is odd. */
    for (unsigned int number = halfPoints * 2; number < num_points; number++) {
        const double a = *aPtr++;
        const double b = *bPtr++;
        /* b is stored whenever a > b is false, which includes NaN operands. */
        *cPtr++ = (a > b ? a : b);
    }
}
181#endif /* LV_HAVE_SSE2 */
182
183
184#ifdef LV_HAVE_GENERIC
185
/*
 * Element-wise maximum of two double-precision vectors (portable C).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points).
 *
 * cVector    - output buffer; num_points doubles are written
 * aVector    - first input buffer; num_points doubles are read
 * bVector    - second input buffer; num_points doubles are read
 * num_points - number of elements to process
 */
static inline void volk_64f_x2_max_64f_generic(double* cVector,
                                               const double* aVector,
                                               const double* bVector,
                                               unsigned int num_points)
{
    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        const double a = *aPtr++;
        const double b = *bPtr++;
        /* b is stored whenever a > b is false, which includes NaN operands. */
        *cPtr++ = (a > b ? a : b);
    }
}
202#endif /* LV_HAVE_GENERIC */
203
204
205#endif /* INCLUDED_volk_64f_x2_max_64f_a_H */
206
207
208#ifndef INCLUDED_volk_64f_x2_max_64f_u_H
209#define INCLUDED_volk_64f_x2_max_64f_u_H
210
211#include <inttypes.h>
212#include <stdio.h>
213
214#ifdef LV_HAVE_AVX512F
215#include <immintrin.h>
216
/*
 * Element-wise maximum of two double-precision vectors (AVX-512F, unaligned).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points).
 *
 * cVector    - output buffer; num_points doubles are written
 * aVector    - first input buffer; num_points doubles are read
 * bVector    - second input buffer; num_points doubles are read
 * num_points - number of elements to process
 *
 * The vector loop uses _mm512_loadu_pd / _mm512_storeu_pd, so the buffers
 * do not need any particular alignment.
 */
static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector,
                                                 const double* aVector,
                                                 const double* bVector,
                                                 unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    /* Main loop: eight doubles per iteration. */
    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m512d aVal = _mm512_loadu_pd(aPtr);
        const __m512d bVal = _mm512_loadu_pd(bPtr);
        const __m512d cVal = _mm512_max_pd(aVal, bVal);

        _mm512_storeu_pd(cPtr, cVal); /* write the eight results to the output */

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the remaining num_points % 8 elements. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        const double a = *aPtr++;
        const double b = *bPtr++;
        /* b is stored whenever a > b is false, which includes NaN operands. */
        *cPtr++ = (a > b ? a : b);
    }
}
251#endif /* LV_HAVE_AVX512F */
252
253
254#ifdef LV_HAVE_AVX
255#include <immintrin.h>
256
/*
 * Element-wise maximum of two double-precision vectors (AVX, unaligned).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points).
 *
 * cVector    - output buffer; num_points doubles are written
 * aVector    - first input buffer; num_points doubles are read
 * bVector    - second input buffer; num_points doubles are read
 * num_points - number of elements to process
 *
 * The vector loop uses _mm256_loadu_pd / _mm256_storeu_pd, so the buffers
 * do not need any particular alignment.
 */
static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    /* Main loop: four doubles per iteration. */
    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m256d aVal = _mm256_loadu_pd(aPtr);
        const __m256d bVal = _mm256_loadu_pd(bPtr);
        const __m256d cVal = _mm256_max_pd(aVal, bVal);

        _mm256_storeu_pd(cPtr, cVal); /* write the four results to the output */

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the remaining num_points % 4 elements. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        const double a = *aPtr++;
        const double b = *bPtr++;
        /* b is stored whenever a > b is false, which includes NaN operands. */
        *cPtr++ = (a > b ? a : b);
    }
}
291#endif /* LV_HAVE_AVX */
292
293
294#endif /* INCLUDED_volk_64f_x2_max_64f_u_H */