Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_power_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
58#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
59#define INCLUDED_volk_32f_s32f_power_32f_a_H
60
61#include <inttypes.h>
62#include <math.h>
63#include <stdio.h>
64
65#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
#include <tmmintrin.h>
67
68#ifdef LV_HAVE_LIB_SIMDMATH
69#include <simdmath.h>
70#endif /* LV_HAVE_LIB_SIMDMATH */
71
72static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector,
73 const float* aVector,
74 const float power,
75 unsigned int num_points)
76{
77 unsigned int number = 0;
78
79 float* cPtr = cVector;
80 const float* aPtr = aVector;
81
82#ifdef LV_HAVE_LIB_SIMDMATH
83 const unsigned int quarterPoints = num_points / 4;
84 __m128 vPower = _mm_set_ps1(power);
85 __m128 zeroValue = _mm_setzero_ps();
86 __m128 signMask;
87 __m128 negatedValues;
88 __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
89 __m128 onesMask = _mm_set_ps1(1);
90
91 __m128 aVal, cVal;
92 for (; number < quarterPoints; number++) {
93
94 aVal = _mm_load_ps(aPtr);
95 signMask = _mm_cmplt_ps(aVal, zeroValue);
96 negatedValues = _mm_sub_ps(zeroValue, aVal);
97 aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
98
99 // powf4 doesn't support negative values in the base, so we mask them off and then
100 // apply the negative after
101 cVal = powf4(aVal, vPower); // Takes each input value to the specified power
102
103 cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
104
105 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
106
107 aPtr += 4;
108 cPtr += 4;
109 }
110
111 number = quarterPoints * 4;
112#endif /* LV_HAVE_LIB_SIMDMATH */
113
114 for (; number < num_points; number++) {
115 *cPtr++ = powf((*aPtr++), power);
116 }
117}
118
119#endif /* LV_HAVE_SSE4_1 */
120
121
122#ifdef LV_HAVE_SSE
123#include <xmmintrin.h>
124
125#ifdef LV_HAVE_LIB_SIMDMATH
126#include <simdmath.h>
127#endif /* LV_HAVE_LIB_SIMDMATH */
128
129static inline void volk_32f_s32f_power_32f_a_sse(float* cVector,
130 const float* aVector,
131 const float power,
132 unsigned int num_points)
133{
134 unsigned int number = 0;
135
136 float* cPtr = cVector;
137 const float* aPtr = aVector;
138
139#ifdef LV_HAVE_LIB_SIMDMATH
140 const unsigned int quarterPoints = num_points / 4;
141 __m128 vPower = _mm_set_ps1(power);
142 __m128 zeroValue = _mm_setzero_ps();
143 __m128 signMask;
144 __m128 negatedValues;
145 __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
146 __m128 onesMask = _mm_set_ps1(1);
147
148 __m128 aVal, cVal;
149 for (; number < quarterPoints; number++) {
150
151 aVal = _mm_load_ps(aPtr);
152 signMask = _mm_cmplt_ps(aVal, zeroValue);
153 negatedValues = _mm_sub_ps(zeroValue, aVal);
154 aVal =
155 _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues));
156
157 // powf4 doesn't support negative values in the base, so we mask them off and then
158 // apply the negative after
159 cVal = powf4(aVal, vPower); // Takes each input value to the specified power
160
161 cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask),
162 _mm_and_ps(signMask, negativeOneToPower)),
163 cVal);
164
165 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
166
167 aPtr += 4;
168 cPtr += 4;
169 }
170
171 number = quarterPoints * 4;
172#endif /* LV_HAVE_LIB_SIMDMATH */
173
174 for (; number < num_points; number++) {
175 *cPtr++ = powf((*aPtr++), power);
176 }
177}
178
179#endif /* LV_HAVE_SSE */
180
181
182#ifdef LV_HAVE_GENERIC
183
184static inline void volk_32f_s32f_power_32f_generic(float* cVector,
185 const float* aVector,
186 const float power,
187 unsigned int num_points)
188{
189 float* cPtr = cVector;
190 const float* aPtr = aVector;
191 unsigned int number = 0;
192
193 for (number = 0; number < num_points; number++) {
194 *cPtr++ = powf((*aPtr++), power);
195 }
196}
197#endif /* LV_HAVE_GENERIC */
198
199
200#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */