Vector Optimized Library of Kernels
3.1.0
Architecture-tuned implementations of math kernels
volk_sse_intrinsics.h
Go to the documentation of this file.
1
/* -*- c++ -*- */
2
/*
3
* Copyright 2015 Free Software Foundation, Inc.
4
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5
*
6
* This file is part of VOLK
7
*
8
* SPDX-License-Identifier: LGPL-3.0-or-later
9
*/
10
11
/*
12
* This file is intended to hold helper functions composed of SSE intrinsics.
13
* They should be used in VOLK kernels to avoid copy-pasta.
14
*/
15
16
#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
18
#include <xmmintrin.h>
19
20
/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * Evaluates the odd polynomial
 *   x * (c1 + c3*x^2 + c5*x^4 + c7*x^6 + c9*x^8 + c11*x^10 + c13*x^12)
 * on each of the four packed floats using Horner's method in x^2.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients of the odd powers x^1 .. x^13, broadcast to all lanes. */
    const __m128 c1 = _mm_set1_ps(+0x1.ffffeap-1f);
    const __m128 c3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 c5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 c7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 c9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 c11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 c13 = _mm_set1_ps(+0x1.01a37cp-7f);

    const __m128 x_sq = _mm_mul_ps(x, x);

    /* Horner scheme in x^2: start at the highest-order coefficient and */
    /* fold in one lower-order coefficient per step.                    */
    __m128 poly = c13;
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c11);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c9);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c7);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c5);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c3);
    poly = _mm_add_ps(_mm_mul_ps(x_sq, poly), c1);

    /* The final multiply by x restores the odd symmetry of arctan. */
    return _mm_mul_ps(x, poly);
}
56
57
/*
 * Squared magnitude of four complex values stored as interleaved I/Q floats.
 *
 * cplxValue1 holds (i1, q1, i2, q2) and cplxValue2 holds (i3, q3, i4, q4).
 * Returns (i1^2 + q1^2, i2^2 + q2^2, i3^2 + q3^2, i4^2 + q4^2).
 */
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    /* De-interleave: even lanes of both inputs -> i1 i2 i3 i4 */
    const __m128 in_phase =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    /* De-interleave: odd lanes of both inputs -> q1 q2 q3 q4 */
    const __m128 quadrature =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));

    const __m128 in_phase_sq = _mm_mul_ps(in_phase, in_phase);
    const __m128 quadrature_sq = _mm_mul_ps(quadrature, quadrature);

    /* |z|^2 = I^2 + Q^2 */
    return _mm_add_ps(in_phase_sq, quadrature_sq);
}
68
69
static
inline
__m128
_mm_magnitude_ps
(
__m128
cplxValue1,
__m128
cplxValue2)
70
{
71
return
_mm_sqrt_ps
(
_mm_magnitudesquared_ps
(cplxValue1, cplxValue2));
72
}
73
74
static
inline
__m128
_mm_scaled_norm_dist_ps_sse
(
const
__m128
symbols0,
75
const
__m128
symbols1,
76
const
__m128
points0,
77
const
__m128
points1,
78
const
__m128
scalar)
79
{
80
// calculate scalar * |x - y|^2
81
const
__m128
diff0 =
_mm_sub_ps
(symbols0, points0);
82
const
__m128
diff1 =
_mm_sub_ps
(symbols1, points1);
83
const
__m128
norms =
_mm_magnitudesquared_ps
(diff0, diff1);
84
return
_mm_mul_ps
(norms, scalar);
85
}
86
87
/*
 * Per-lane update: returns sq_acc + rec * (aux * val - acc)^2.
 *
 * All arguments are passed by value; nothing the caller holds is modified.
 */
static inline __m128 _mm_accumulate_square_sum_ps(
    __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
{
    const __m128 scaled_val = _mm_mul_ps(aux, val);
    const __m128 deviation = _mm_sub_ps(scaled_val, acc);
    const __m128 deviation_sq = _mm_mul_ps(deviation, deviation);
    const __m128 weighted = _mm_mul_ps(deviation_sq, rec);
    return _mm_add_ps(sq_acc, weighted);
}
96
97
#endif
/* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
include
volk
volk_sse_intrinsics.h
Generated by
1.9.4