Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_8i_convert_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
40#ifndef INCLUDED_volk_8i_convert_16i_u_H
41#define INCLUDED_volk_8i_convert_16i_u_H
42
43#include <inttypes.h>
44#include <stdio.h>
45
46#ifdef LV_HAVE_AVX2
47#include <immintrin.h>
48
/*!
 * \brief Widens signed 8-bit samples to signed 16-bit, scaling each by 256 (AVX2).
 *
 * The vector portion uses the unaligned load/store intrinsics
 * (_mm_loadu_si128 / _mm256_storeu_si256).
 *
 * \param outputVector destination; receives num_points int16_t values
 * \param inputVector  source; num_points int8_t samples are read
 * \param num_points   number of samples to convert
 */
static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    /* Largest multiple of 16 not exceeding num_points; processed 16 at a time. */
    const unsigned int simd_points = (num_points / 16) * 16;
    unsigned int idx = 0;

    for (; idx < simd_points; idx += 16) {
        const __m128i bytes = _mm_loadu_si128((const __m128i*)(inputVector + idx));
        /* Sign-extend 16 x int8 to 16 x int16, then shift left 8 bits (x256). */
        const __m256i words = _mm256_slli_epi16(_mm256_cvtepi8_epi16(bytes), 8);
        _mm256_storeu_si256((__m256i*)(outputVector + idx), words);
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    for (; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
76#endif /* LV_HAVE_AVX2 */
77
78
79#ifdef LV_HAVE_SSE4_1
80#include <smmintrin.h>
81
/*!
 * \brief Widens signed 8-bit samples to signed 16-bit, scaling each by 256 (SSE4.1).
 *
 * The vector portion uses the unaligned load/store intrinsics
 * (_mm_loadu_si128 / _mm_storeu_si128).
 *
 * \param outputVector destination; receives num_points int16_t values
 * \param inputVector  source; num_points int8_t samples are read
 * \param num_points   number of samples to convert
 */
static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    /* Largest multiple of 16 not exceeding num_points; processed 16 at a time. */
    const unsigned int simd_points = (num_points / 16) * 16;
    unsigned int idx = 0;

    for (; idx < simd_points; idx += 16) {
        const __m128i bytes = _mm_loadu_si128((const __m128i*)(inputVector + idx));

        /* _mm_cvtepi8_epi16 only widens the low 8 bytes, so the upper 8 bytes
         * are first moved down with a byte-wise right shift of the register. */
        const __m128i low_words = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
        const __m128i high_words =
            _mm_slli_epi16(_mm_cvtepi8_epi16(_mm_srli_si128(bytes, 8)), 8);

        _mm_storeu_si128((__m128i*)(outputVector + idx), low_words);
        _mm_storeu_si128((__m128i*)(outputVector + idx + 8), high_words);
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    for (; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
117#endif /* LV_HAVE_SSE4_1 */
118
119
120#ifdef LV_HAVE_GENERIC
121
/*!
 * \brief Portable scalar conversion of signed 8-bit samples to signed 16-bit,
 *        scaling each sample by 256.
 *
 * \param outputVector destination; receives num_points int16_t values
 * \param inputVector  source; num_points int8_t samples are read
 * \param num_points   number of samples to convert
 */
static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
                                               const int8_t* inputVector,
                                               unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        /* int8 range times 256 is [-32768, 32512], which always fits in int16_t. */
        outputVector[idx] = (int16_t)(((int16_t)inputVector[idx]) * 256);
    }
}
134#endif /* LV_HAVE_GENERIC */
135
136
#endif /* INCLUDED_volk_8i_convert_16i_u_H */
138
139
140#ifndef INCLUDED_volk_8i_convert_16i_a_H
141#define INCLUDED_volk_8i_convert_16i_a_H
142
143#include <inttypes.h>
144#include <stdio.h>
145
146#ifdef LV_HAVE_AVX2
147#include <immintrin.h>
148
/*!
 * \brief Widens signed 8-bit samples to signed 16-bit, scaling each by 256 (AVX2).
 *
 * The vector portion uses the aligned load/store intrinsics
 * (_mm_load_si128 / _mm256_store_si256), so inputVector must be 16-byte
 * aligned and outputVector 32-byte aligned.
 *
 * \param outputVector destination; receives num_points int16_t values
 * \param inputVector  source; num_points int8_t samples are read
 * \param num_points   number of samples to convert
 */
static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    /* Largest multiple of 16 not exceeding num_points; processed 16 at a time. */
    const unsigned int simd_points = (num_points / 16) * 16;
    unsigned int idx = 0;

    for (; idx < simd_points; idx += 16) {
        const __m128i bytes = _mm_load_si128((const __m128i*)(inputVector + idx));
        /* Sign-extend 16 x int8 to 16 x int16, then shift left 8 bits (x256). */
        const __m256i words = _mm256_slli_epi16(_mm256_cvtepi8_epi16(bytes), 8);
        _mm256_store_si256((__m256i*)(outputVector + idx), words);
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    for (; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
176#endif /* LV_HAVE_AVX2 */
177
178
179#ifdef LV_HAVE_SSE4_1
180#include <smmintrin.h>
181
/*!
 * \brief Widens signed 8-bit samples to signed 16-bit, scaling each by 256 (SSE4.1).
 *
 * The vector portion uses the aligned load/store intrinsics
 * (_mm_load_si128 / _mm_store_si128), so both inputVector and outputVector
 * must be 16-byte aligned.
 *
 * \param outputVector destination; receives num_points int16_t values
 * \param inputVector  source; num_points int8_t samples are read
 * \param num_points   number of samples to convert
 */
static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    /* Largest multiple of 16 not exceeding num_points; processed 16 at a time. */
    const unsigned int simd_points = (num_points / 16) * 16;
    unsigned int idx = 0;

    for (; idx < simd_points; idx += 16) {
        const __m128i bytes = _mm_load_si128((const __m128i*)(inputVector + idx));

        /* _mm_cvtepi8_epi16 only widens the low 8 bytes, so the upper 8 bytes
         * are first moved down with a byte-wise right shift of the register. */
        const __m128i low_words = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
        const __m128i high_words =
            _mm_slli_epi16(_mm_cvtepi8_epi16(_mm_srli_si128(bytes, 8)), 8);

        _mm_store_si128((__m128i*)(outputVector + idx), low_words);
        _mm_store_si128((__m128i*)(outputVector + idx + 8), high_words);
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    for (; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
217#endif /* LV_HAVE_SSE4_1 */
218
219
220#ifdef LV_HAVE_NEON
221#include <arm_neon.h>
222
/*!
 * \brief Widens signed 8-bit samples to signed 16-bit, scaling each by 256 (NEON).
 *
 * \param outputVector destination; receives num_points int16_t values
 * \param inputVector  source; num_points int8_t samples are read
 * \param num_points   number of samples to convert
 */
static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
                                            const int8_t* inputVector,
                                            unsigned int num_points)
{
    /* Largest multiple of 8 not exceeding num_points; processed 8 at a time. */
    const unsigned int simd_points = (num_points / 8) * 8;
    unsigned int idx = 0;

    for (; idx < simd_points; idx += 8) {
        /* Load 8 x int8, sign-extend to 8 x int16 with a widening move,
         * then shift left 8 bits (x256). */
        const int16x8_t widened = vmovl_s8(vld1_s8(inputVector + idx));
        vst1q_s16(outputVector + idx, vshlq_n_s16(widened, 8));
    }

    /* Scalar tail for the remaining (num_points % 8) samples. */
    for (; idx < num_points; ++idx) {
        outputVector[idx] = ((int16_t)(inputVector[idx])) * 256;
    }
}
253#endif /* LV_HAVE_NEON */
254
255
256#ifdef LV_HAVE_ORC
/* Declared here and defined outside this header
 * (presumably the ORC-generated kernel -- confirm against the build). */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all arguments unchanged to
 *        volk_8i_convert_16i_a_orc_impl.
 *
 * \param outputVector destination buffer passed through to the implementation
 * \param inputVector  source buffer passed through to the implementation
 * \param num_points   sample count passed through to the implementation
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
267#endif /* LV_HAVE_ORC */
268
269
#endif /* INCLUDED_volk_8i_convert_16i_a_H */