Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_16i_convert_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
40#ifndef INCLUDED_volk_16i_convert_8i_u_H
41#define INCLUDED_volk_16i_convert_8i_u_H
42
43#include <inttypes.h>
44#include <stdio.h>
45
46#ifdef LV_HAVE_AVX2
47#include <immintrin.h>
48
/*!
 * \brief Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (right shift by 8). AVX2, unaligned buffers.
 *
 * The bulk of the data is processed 32 samples at a time with unaligned
 * loads/stores; the remaining (num_points % 32) samples are handled by a
 * scalar loop.
 *
 * \param outputVector destination buffer, receives num_points int8_t values
 * \param inputVector  source buffer, num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    /* Keep the const qualifier: the input is only ever read. */
    const int16_t* inputPtr = inputVector;
    __m256i inputVal1;
    __m256i inputVal2;
    __m256i ret;

    for (; number < thirtysecondPoints; number++) {

        // Load 32 input values as two vectors of 16
        inputVal1 = _mm256_loadu_si256((const __m256i*)inputPtr);
        inputPtr += 16;
        inputVal2 = _mm256_loadu_si256((const __m256i*)inputPtr);
        inputPtr += 16;

        // Arithmetic shift leaves each 16-bit lane in [-128, 127], so the
        // saturating pack below never actually clamps.
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        // The pack works per 128-bit lane, producing the 64-bit groups in the
        // order {in1.lo, in2.lo, in1.hi, in2.hi}. Mask 0xD8 selects the
        // groups 0, 2, 1, 3 and thereby restores the linear sample order.
        // (0xD8 is the portable spelling of the binary constant 11011000.)
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);

        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a whole 32-sample block
    number = thirtysecondPoints * 32;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
86#endif /* LV_HAVE_AVX2 */
87
88
89#ifdef LV_HAVE_SSE2
90#include <emmintrin.h>
91
/*!
 * \brief Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (right shift by 8). SSE2, unaligned buffers.
 *
 * The bulk of the data is processed 16 samples at a time with unaligned
 * loads/stores; the remaining (num_points % 16) samples are handled by a
 * scalar loop.
 *
 * \param outputVector destination buffer, receives num_points int8_t values
 * \param inputVector  source buffer, num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    /* Keep the const qualifier: the input is only ever read. */
    const int16_t* inputPtr = inputVector;
    __m128i inputVal1;
    __m128i inputVal2;
    __m128i ret;

    for (; number < sixteenthPoints; number++) {

        // Load 16 input values as two vectors of 8
        inputVal1 = _mm_loadu_si128((const __m128i*)inputPtr);
        inputPtr += 8;
        inputVal2 = _mm_loadu_si128((const __m128i*)inputPtr);
        inputPtr += 8;

        // Arithmetic shift leaves each 16-bit lane in [-128, 127], so the
        // saturating pack below never actually clamps.
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, ret);

        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a whole 16-sample block
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
128#endif /* LV_HAVE_SSE2 */
129
130
131#ifdef LV_HAVE_GENERIC
132
/*!
 * \brief Portable scalar kernel: converts 16-bit signed samples to 8-bit
 * signed samples by shifting each value right by 8 bits.
 *
 * \param outputVector destination buffer, receives num_points int8_t values
 * \param inputVector  source buffer, num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
                                               const int16_t* inputVector,
                                               unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        const int16_t sample = inputVector[idx];
        /* Drop the low byte; what remains is the sample's upper byte. */
        outputVector[idx] = (int8_t)(sample >> 8);
    }
}
145#endif /* LV_HAVE_GENERIC */
146
147
148#endif /* INCLUDED_volk_16i_convert_8i_u_H */
149#ifndef INCLUDED_volk_16i_convert_8i_a_H
150#define INCLUDED_volk_16i_convert_8i_a_H
151
152#include <inttypes.h>
153#include <stdio.h>
154
155#ifdef LV_HAVE_AVX2
156#include <immintrin.h>
157
/*!
 * \brief Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (right shift by 8). AVX2, aligned buffers.
 *
 * The bulk of the data is processed 32 samples at a time with the aligned
 * load/store intrinsics, so both buffers must be 32-byte aligned. The
 * remaining (num_points % 32) samples are handled by a scalar loop.
 *
 * \param outputVector destination buffer, receives num_points int8_t values
 * \param inputVector  source buffer, num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    /* Keep the const qualifier: the input is only ever read. */
    const int16_t* inputPtr = inputVector;
    __m256i inputVal1;
    __m256i inputVal2;
    __m256i ret;

    for (; number < thirtysecondPoints; number++) {

        // Load 32 input values as two vectors of 16
        inputVal1 = _mm256_load_si256((const __m256i*)inputPtr);
        inputPtr += 16;
        inputVal2 = _mm256_load_si256((const __m256i*)inputPtr);
        inputPtr += 16;

        // Arithmetic shift leaves each 16-bit lane in [-128, 127], so the
        // saturating pack below never actually clamps.
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        // The pack works per 128-bit lane, producing the 64-bit groups in the
        // order {in1.lo, in2.lo, in1.hi, in2.hi}. Mask 0xD8 selects the
        // groups 0, 2, 1, 3 and thereby restores the linear sample order.
        // (0xD8 is the portable spelling of the binary constant 11011000.)
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, ret);

        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a whole 32-sample block
    number = thirtysecondPoints * 32;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
195#endif /* LV_HAVE_AVX2 */
196
197
198#ifdef LV_HAVE_SSE2
199#include <emmintrin.h>
200
/*!
 * \brief Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (right shift by 8). SSE2, aligned buffers.
 *
 * The bulk of the data is processed 16 samples at a time with the aligned
 * load/store intrinsics, so both buffers must be 16-byte aligned. The
 * remaining (num_points % 16) samples are handled by a scalar loop.
 *
 * \param outputVector destination buffer, receives num_points int8_t values
 * \param inputVector  source buffer, num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    /* Keep the const qualifier: the input is only ever read. */
    const int16_t* inputPtr = inputVector;
    __m128i inputVal1;
    __m128i inputVal2;
    __m128i ret;

    for (; number < sixteenthPoints; number++) {

        // Load 16 input values as two vectors of 8
        inputVal1 = _mm_load_si128((const __m128i*)inputPtr);
        inputPtr += 8;
        inputVal2 = _mm_load_si128((const __m128i*)inputPtr);
        inputPtr += 8;

        // Arithmetic shift leaves each 16-bit lane in [-128, 127], so the
        // saturating pack below never actually clamps.
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, ret);

        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a whole 16-sample block
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
237#endif /* LV_HAVE_SSE2 */
238
239
240#ifdef LV_HAVE_NEON
241#include <arm_neon.h>
242
/*!
 * \brief NEON kernel: converts 16-bit signed samples to 8-bit signed samples
 * by shifting each value right by 8 bits.
 *
 * Whole blocks of 16 samples go through the vector path; the remaining
 * (num_points % 16) samples are converted one at a time.
 *
 * \param outputVector destination buffer, receives num_points int8_t values
 * \param inputVector  source buffer, num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
                                            const int16_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const unsigned int leftover = num_points - full_blocks * 16;
    const int16_t* src = inputVector;
    int8_t* dst = outputVector;
    unsigned int i;

    for (i = 0; i < full_blocks; ++i) {
        /* fetch 16 samples as two 8-lane vectors */
        const int16x8_t first = vld1q_s16(src);
        const int16x8_t second = vld1q_s16(src + 8);
        /* shift right by 8 and narrow each lane to 8 bits */
        const int8x8_t first_narrow = vshrn_n_s16(first, 8);
        const int8x8_t second_narrow = vshrn_n_s16(second, 8);
        /* join both halves and emit 16 output bytes at once */
        vst1q_s8(dst, vcombine_s8(first_narrow, second_narrow));
        src += 16;
        dst += 16;
    }

    /* scalar path for the samples past the last full block */
    for (i = 0; i < leftover; ++i) {
        dst[i] = (int8_t)(src[i] >> 8);
    }
}
276#endif /* LV_HAVE_NEON */
277
278
279#endif /* INCLUDED_volk_16i_convert_8i_a_H */