Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_common.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11#ifndef INCLUDED_LIBVOLK_COMMON_H
12#define INCLUDED_LIBVOLK_COMMON_H
13
// Cross-platform attribute macros
//
// Exactly one branch below is selected. The tests run in this order:
// anything defining _MSC_VER, then Clang, then GCC, then a fallback for
// every other compiler. Each branch defines the same set of macros:
//   __VOLK_ATTR_ALIGNED(x)  alignment request for a declaration
//   __VOLK_ATTR_UNUSED      marks an entity as possibly unused
//   __VOLK_ATTR_INLINE      forced-inline request
//   __VOLK_ATTR_DEPRECATED  marks an entity as deprecated
//   __VOLK_ATTR_EXPORT      symbol export / default visibility
//   __VOLK_ATTR_IMPORT      symbol import / default visibility
//   __VOLK_PREFETCH(addr)   prefetch hint (expands to nothing if unsupported)
//   __VOLK_ASM              inline-assembly keyword
//   __VOLK_VOLATILE         volatile qualifier for inline assembly
#if defined(_MSC_VER)
#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
#define __VOLK_ATTR_UNUSED
#define __VOLK_ATTR_INLINE __forceinline
#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
#define __VOLK_ATTR_EXPORT __declspec(dllexport)
#define __VOLK_ATTR_IMPORT __declspec(dllimport)
#define __VOLK_PREFETCH(addr)
#define __VOLK_ASM __asm
#define __VOLK_VOLATILE
#elif defined(__clang__)
// AppleClang also defines __GNUC__, so do this check first. These
// will probably be the same as for __GNUC__, but let's keep them
// separate just to be safe.
#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
#define __VOLK_ATTR_UNUSED __attribute__((unused))
#define __VOLK_ATTR_INLINE __attribute__((always_inline))
#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
#define __VOLK_ASM __asm__
#define __VOLK_VOLATILE __volatile__
#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
#elif defined(__GNUC__)
#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
#define __VOLK_ATTR_UNUSED __attribute__((unused))
#define __VOLK_ATTR_INLINE __attribute__((always_inline))
#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
#define __VOLK_ASM __asm__
#define __VOLK_VOLATILE __volatile__
// The visibility attribute is only used from GCC 4 onwards.
#if __GNUC__ >= 4
#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
#else
#define __VOLK_ATTR_EXPORT
#define __VOLK_ATTR_IMPORT
#endif
#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
#else
// Unknown compiler: attributes and the prefetch hint expand to nothing.
#define __VOLK_ATTR_ALIGNED(x)
#define __VOLK_ATTR_UNUSED
#define __VOLK_ATTR_INLINE
#define __VOLK_ATTR_DEPRECATED
#define __VOLK_ATTR_EXPORT
#define __VOLK_ATTR_IMPORT
#define __VOLK_PREFETCH(addr)
#define __VOLK_ASM __asm__
#define __VOLK_VOLATILE __volatile__
#endif
76
// Ignore annoying warnings in MSVC
#ifdef _MSC_VER
// C4244: 'conversion' conversion from 'type1' to 'type2', possible loss of data
#pragma warning(disable : 4244)
// C4305: 'identifier' : truncation from 'type1' to 'type2'
#pragma warning(disable : 4305)
#endif

// C-linkage declaration macros
// FIXME: due to the usage of complex.h, require gcc for c-linkage
//
// __VOLK_DECL_BEGIN / __VOLK_DECL_END open and close an extern "C" block when
// compiled as C++ by a compiler that defines __GNUC__; otherwise both expand
// to nothing.
#if defined(__cplusplus) && defined(__GNUC__)
#define __VOLK_DECL_BEGIN extern "C" {
#define __VOLK_DECL_END }
#else
#define __VOLK_DECL_BEGIN
#define __VOLK_DECL_END
#endif

// Define VOLK_API for library symbols
// http://gcc.gnu.org/wiki/Visibility
//
// VOLK_API is the export attribute when volk_EXPORTS is defined (presumably
// set by the build while compiling the library itself - confirm against the
// build files) and the import attribute otherwise.
#if defined(volk_EXPORTS)
#define VOLK_API __VOLK_ATTR_EXPORT
#else
#define VOLK_API __VOLK_ATTR_IMPORT
#endif
107
// The bit128 and bit256 unions used by some kernels
111#include <stdint.h>
112
113#ifdef LV_HAVE_SSE
114#ifdef _WIN32
115#include <intrin.h>
116#else
117#include <x86intrin.h>
118#endif
119#endif
120
/*
 * 128-bit value viewable as integer lanes of several widths, as four floats,
 * as two doubles or, when the matching LV_HAVE_* macro is defined, as the
 * SSE vector types. All members share the same storage.
 *
 * The vector members mirror the ones in union bit256. Their types come from
 * the intrinsics header that this file includes when LV_HAVE_SSE is defined.
 */
union bit128 {
    uint8_t i8[16];
    uint16_t i16[8];
    uint32_t i[4];
    float f[4];
    double d[2];

#ifdef LV_HAVE_SSE
    __m128 float_vec;
#endif

#ifdef LV_HAVE_SSE2
    __m128i int_vec;
    __m128d double_vec;
#endif
};
137
/*
 * 256-bit value viewable as integer lanes of several widths, as eight floats,
 * as four doubles or, when LV_HAVE_AVX is defined, as the AVX vector types.
 * All members share the same storage.
 */
union bit256 {
    /* integer lanes */
    uint8_t i8[32];
    uint16_t i16[16];
    uint32_t i[8];

    /* floating-point lanes */
    float f[8];
    double d[4];

#if defined(LV_HAVE_AVX)
    /* whole-register views */
    __m256 float_vec;
    __m256i int_vec;
    __m256d double_vec;
#endif
};
151
/* Reinterpret a pointer expression as a pointer to union bit128 / bit256. */
#define bit128_p(x) ((union bit128 *)(x))
#define bit256_p(x) ((union bit256 *)(x))
154
156// log2f
158#include <math.h>
159// +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
160static inline float log2f_non_ieee(float f)
161{
162 float const result = log2f(f);
163 return isinf(result) ? copysignf(127.0f, result) : result;
164}
165
// Constant used to do log10 calculations as faster log2:
//   10 * log10(x) == volk_log2to10factor * log2(x)
// Precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr.
// The literal has no suffix, so the constant has type double.
#define volk_log2to10factor (0x1.815182p+1) // 3.01029995663981209120
171
173// arctan(x) polynomial expansion
175static inline float volk_arctan_poly(const float x)
176{
177 /*
178 * arctan(x) polynomial expansion on the interval [-1, 1]
179 * Maximum relative error < 6.6e-7
180 */
181 const float a1 = +0x1.ffffeap-1f;
182 const float a3 = -0x1.55437p-2f;
183 const float a5 = +0x1.972be6p-3f;
184 const float a7 = -0x1.1436ap-3f;
185 const float a9 = +0x1.5785aap-4f;
186 const float a11 = -0x1.2f3004p-5f;
187 const float a13 = +0x1.01a37cp-7f;
188
189 const float x_times_x = x * x;
190 float arctan = a13;
191 arctan = fmaf(x_times_x, arctan, a11);
192 arctan = fmaf(x_times_x, arctan, a9);
193 arctan = fmaf(x_times_x, arctan, a7);
194 arctan = fmaf(x_times_x, arctan, a5);
195 arctan = fmaf(x_times_x, arctan, a3);
196 arctan = fmaf(x_times_x, arctan, a1);
197 arctan *= x;
198
199 return arctan;
200}
/*
 * arctan(x)
 *
 * For |x| < 1 the polynomial approximation is evaluated directly. Otherwise
 * the identity
 *   arctan(x) + arctan(1 / x) == sign(x) * pi / 2
 * is used so that the polynomial is only evaluated at 1 / x, whose magnitude
 * does not exceed 1.
 */
static inline float volk_arctan(const float x)
{
    const float half_pi = 0x1.921fb6p0f;
    const int within_unit_interval = fabsf(x) < 1.f;

    return within_unit_interval
               ? volk_arctan_poly(x)
               : copysignf(half_pi, x) - volk_arctan_poly(1.f / x);
}
// arctan2(y, x)
static inline float volk_atan2(const float y, const float x)
{
    /*
     *                / arctan(y / x)        if x > 0
     *                | arctan(y / x) + PI   if x < 0 and y >= 0
     * atan2(y, x) =  | arctan(y / x) - PI   if x < 0 and y < 0
     *                | sign(y) * PI / 2     if x = 0
     *                \ undefined            if x = 0 and y = 0
     *
     * Conventions of this implementation:
     *  - x == 0 and y == 0 (either sign of zero for x) returns a zero that
     *    carries the sign of y: volk_atan2(0.f, 0.f) == 0.f and
     *    volk_atan2(-0.f, 0.f) == -0.f.
     *  - If both arguments are infinite the quotient is NaN and NaN is
     *    returned.
     */
    const float pi = 0x1.921fb6p1f;
    const float pi_2 = 0x1.921fb6p0f;

    if (fabsf(x) == 0.f) {
        return (fabsf(y) == 0.f) ? copysignf(0.f, y) : copysignf(pi_2, y);
    }

    // Divide the smaller magnitude by the larger one so that the polynomial
    // argument stays inside [-1, 1].
    const int swap = fabsf(x) < fabsf(y);
    const float input = swap ? (x / y) : (y / x);
    float result = volk_arctan_poly(input);

    if (swap) {
        // arctan(y / x) == sign(x / y) * pi / 2 - arctan(x / y).
        // The sign must come from the sign bit, not from a comparison with
        // zero: x / y can be -0.0f (quotient underflow, or infinite y), and
        // -0.0f >= 0.f is true, which would select the wrong quadrant.
        result = copysignf(pi_2, input) - result;
    }
    if (x < 0.f) {
        result += copysignf(pi, y);
    }
    return result;
}
246
247#endif /*INCLUDED_LIBVOLK_COMMON_H*/