Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_64f_multiply_64f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58#ifndef INCLUDED_volk_32f_64f_multiply_64f_H
59#define INCLUDED_volk_32f_64f_multiply_64f_H
60
61#include <inttypes.h>
62
63
64#ifdef LV_HAVE_GENERIC
65
/*
 * Portable reference kernel.
 *
 * For every index i in [0, num_points) the float aVector[i] is widened to
 * double and multiplied by bVector[i]; the product is stored in cVector[i].
 * The widening happens before the multiply, so the product is computed in
 * double precision.
 *
 * cVector    - output buffer, indices 0 .. num_points-1 are written
 * aVector    - single-precision input, num_points elements are read
 * bVector    - double-precision input, num_points elements are read
 * num_points - number of elements to process; 0 leaves cVector untouched
 */
static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
                                                     const float* aVector,
                                                     const double* bVector,
                                                     unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        cVector[number] = ((double)aVector[number]) * bVector[number];
    }
}
80
81#endif /* LV_HAVE_GENERIC */
82
83/*
84 * Unaligned versions
85 */
86
87
88#ifdef LV_HAVE_AVX
89
90#include <immintrin.h>
91#include <xmmintrin.h>
92
93static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
94 const float* aVector,
95 const double* bVector,
96 unsigned int num_points)
97{
98 unsigned int number = 0;
99 const unsigned int eighth_points = num_points / 8;
100
101 double* cPtr = cVector;
102 const float* aPtr = aVector;
103 const double* bPtr = bVector;
104
105 __m256 aVal;
106 __m128 aVal1, aVal2;
107 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
108 for (; number < eighth_points; number++) {
109
110 aVal = _mm256_loadu_ps(aPtr);
111 bVal1 = _mm256_loadu_pd(bPtr);
112 bVal2 = _mm256_loadu_pd(bPtr + 4);
113
114 aVal1 = _mm256_extractf128_ps(aVal, 0);
115 aVal2 = _mm256_extractf128_ps(aVal, 1);
116
117 aDbl1 = _mm256_cvtps_pd(aVal1);
118 aDbl2 = _mm256_cvtps_pd(aVal2);
119
120 cVal1 = _mm256_mul_pd(aDbl1, bVal1);
121 cVal2 = _mm256_mul_pd(aDbl2, bVal2);
122
123 _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
124 _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container
125
126 aPtr += 8;
127 bPtr += 8;
128 cPtr += 8;
129 }
130
131 number = eighth_points * 8;
132 for (; number < num_points; number++) {
133 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
134 }
135}
136
137#endif /* LV_HAVE_AVX */
138
139
140#ifdef LV_HAVE_AVX
141
142#include <immintrin.h>
143#include <xmmintrin.h>
144
145static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
146 const float* aVector,
147 const double* bVector,
148 unsigned int num_points)
149{
150 unsigned int number = 0;
151 const unsigned int eighth_points = num_points / 8;
152
153 double* cPtr = cVector;
154 const float* aPtr = aVector;
155 const double* bPtr = bVector;
156
157 __m256 aVal;
158 __m128 aVal1, aVal2;
159 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
160 for (; number < eighth_points; number++) {
161
162 aVal = _mm256_load_ps(aPtr);
163 bVal1 = _mm256_load_pd(bPtr);
164 bVal2 = _mm256_load_pd(bPtr + 4);
165
166 aVal1 = _mm256_extractf128_ps(aVal, 0);
167 aVal2 = _mm256_extractf128_ps(aVal, 1);
168
169 aDbl1 = _mm256_cvtps_pd(aVal1);
170 aDbl2 = _mm256_cvtps_pd(aVal2);
171
172 cVal1 = _mm256_mul_pd(aDbl1, bVal1);
173 cVal2 = _mm256_mul_pd(aDbl2, bVal2);
174
175 _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
176 _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container
177
178 aPtr += 8;
179 bPtr += 8;
180 cPtr += 8;
181 }
182
183 number = eighth_points * 8;
184 for (; number < num_points; number++) {
185 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
186 }
187}
188
189#endif /* LV_HAVE_AVX */
190
191
192#endif /* INCLUDED_volk_32f_64f_multiply_64f_H */