Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_32u_reverse_32u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
30#ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
32 int b00 : 1;
33 int b01 : 1;
34 int b02 : 1;
35 int b03 : 1;
36 int b04 : 1;
37 int b05 : 1;
38 int b06 : 1;
39 int b07 : 1;
40 int b08 : 1;
41 int b09 : 1;
42 int b10 : 1;
43 int b11 : 1;
44 int b12 : 1;
45 int b13 : 1;
46 int b14 : 1;
47 int b15 : 1;
48 int b16 : 1;
49 int b17 : 1;
50 int b18 : 1;
51 int b19 : 1;
52 int b20 : 1;
53 int b21 : 1;
54 int b22 : 1;
55 int b23 : 1;
56 int b24 : 1;
57 int b25 : 1;
58 int b26 : 1;
59 int b27 : 1;
60 int b28 : 1;
61 int b29 : 1;
62 int b30 : 1;
63 int b31 : 1;
64};
65struct char_split {
66 uint8_t b00 : 1;
67 uint8_t b01 : 1;
68 uint8_t b02 : 1;
69 uint8_t b03 : 1;
70 uint8_t b04 : 1;
71 uint8_t b05 : 1;
72 uint8_t b06 : 1;
73 uint8_t b07 : 1;
74};
75
76// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
77// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
78static const unsigned char BitReverseTable256[] = {
79 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
80 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
81 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
82 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
83 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
84 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
85 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
86 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
87 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
88 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
89 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
90 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
91 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
92 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
93 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
94 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
95 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
96 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
97 0x3F, 0xBF, 0x7F, 0xFF
98};
99#ifdef LV_HAVE_GENERIC
100static inline void
101volk_32u_reverse_32u_generic(uint32_t* out, const uint32_t* in, unsigned int num_points)
102{
103 const struct dword_split* in_ptr = (const struct dword_split*)in;
104 struct dword_split* out_ptr = (struct dword_split*)out;
105 unsigned int number = 0;
106 for (; number < num_points; ++number) {
107 out_ptr->b00 = in_ptr->b31;
108 out_ptr->b01 = in_ptr->b30;
109 out_ptr->b02 = in_ptr->b29;
110 out_ptr->b03 = in_ptr->b28;
111 out_ptr->b04 = in_ptr->b27;
112 out_ptr->b05 = in_ptr->b26;
113 out_ptr->b06 = in_ptr->b25;
114 out_ptr->b07 = in_ptr->b24;
115 out_ptr->b08 = in_ptr->b23;
116 out_ptr->b09 = in_ptr->b22;
117 out_ptr->b10 = in_ptr->b21;
118 out_ptr->b11 = in_ptr->b20;
119 out_ptr->b12 = in_ptr->b19;
120 out_ptr->b13 = in_ptr->b18;
121 out_ptr->b14 = in_ptr->b17;
122 out_ptr->b15 = in_ptr->b16;
123 out_ptr->b16 = in_ptr->b15;
124 out_ptr->b17 = in_ptr->b14;
125 out_ptr->b18 = in_ptr->b13;
126 out_ptr->b19 = in_ptr->b12;
127 out_ptr->b20 = in_ptr->b11;
128 out_ptr->b21 = in_ptr->b10;
129 out_ptr->b22 = in_ptr->b09;
130 out_ptr->b23 = in_ptr->b08;
131 out_ptr->b24 = in_ptr->b07;
132 out_ptr->b25 = in_ptr->b06;
133 out_ptr->b26 = in_ptr->b05;
134 out_ptr->b27 = in_ptr->b04;
135 out_ptr->b28 = in_ptr->b03;
136 out_ptr->b29 = in_ptr->b02;
137 out_ptr->b30 = in_ptr->b01;
138 out_ptr->b31 = in_ptr->b00;
139 ++in_ptr;
140 ++out_ptr;
141 }
142}
143#endif /* LV_HAVE_GENERIC */
144
145#ifdef LV_HAVE_GENERIC
146static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
147 const uint32_t* in,
148 unsigned int num_points)
149{
150 const uint32_t* in_ptr = in;
151 uint32_t* out_ptr = out;
152 unsigned int number = 0;
153 for (; number < num_points; ++number) {
154 const struct char_split* in8 = (const struct char_split*)in_ptr;
155 struct char_split* out8 = (struct char_split*)out_ptr;
156
157 out8[3].b00 = in8[0].b07;
158 out8[3].b01 = in8[0].b06;
159 out8[3].b02 = in8[0].b05;
160 out8[3].b03 = in8[0].b04;
161 out8[3].b04 = in8[0].b03;
162 out8[3].b05 = in8[0].b02;
163 out8[3].b06 = in8[0].b01;
164 out8[3].b07 = in8[0].b00;
165
166 out8[2].b00 = in8[1].b07;
167 out8[2].b01 = in8[1].b06;
168 out8[2].b02 = in8[1].b05;
169 out8[2].b03 = in8[1].b04;
170 out8[2].b04 = in8[1].b03;
171 out8[2].b05 = in8[1].b02;
172 out8[2].b06 = in8[1].b01;
173 out8[2].b07 = in8[1].b00;
174
175 out8[1].b00 = in8[2].b07;
176 out8[1].b01 = in8[2].b06;
177 out8[1].b02 = in8[2].b05;
178 out8[1].b03 = in8[2].b04;
179 out8[1].b04 = in8[2].b03;
180 out8[1].b05 = in8[2].b02;
181 out8[1].b06 = in8[2].b01;
182 out8[1].b07 = in8[2].b00;
183
184 out8[0].b00 = in8[3].b07;
185 out8[0].b01 = in8[3].b06;
186 out8[0].b02 = in8[3].b05;
187 out8[0].b03 = in8[3].b04;
188 out8[0].b04 = in8[3].b03;
189 out8[0].b05 = in8[3].b02;
190 out8[0].b06 = in8[3].b01;
191 out8[0].b07 = in8[3].b00;
192 ++in_ptr;
193 ++out_ptr;
194 }
195}
196#endif /* LV_HAVE_GENERIC */
197
198// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
199// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
200#ifdef LV_HAVE_GENERIC
201static inline void
202volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
203{
204 const uint32_t* in_ptr = in;
205 uint32_t* out_ptr = out;
206 unsigned int number = 0;
207 for (; number < num_points; ++number) {
208 *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) |
209 (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
210 (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
211 (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
212 ++in_ptr;
213 ++out_ptr;
214 }
215}
216#endif /* LV_HAVE_GENERIC */
217
218// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
219// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
220#ifdef LV_HAVE_GENERIC
221static inline void
222volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
223{
224 const uint32_t* in_ptr = in;
225 uint32_t* out_ptr = out;
226 const uint8_t* in8;
227 uint8_t* out8;
228 unsigned int number = 0;
229 for (; number < num_points; ++number) {
230 in8 = (const uint8_t*)in_ptr;
231 out8 = (uint8_t*)out_ptr;
232 out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
233 out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
234 out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
235 out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
236 ++in_ptr;
237 ++out_ptr;
238 }
239}
240#endif /* LV_HAVE_GENERIC */
241
242#ifdef LV_HAVE_GENERIC
243// Current gr-pager implementation
244static inline void
245volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
246{
247 const uint32_t* in_ptr = in;
248 uint32_t* out_ptr = out;
249 const uint8_t* in8;
250 uint8_t* out8;
251 unsigned int number = 0;
252 for (; number < num_points; ++number) {
253 in8 = (const uint8_t*)in_ptr;
254 out8 = (uint8_t*)out_ptr;
255 out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
256 out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
257 out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
258 out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
259 ++in_ptr;
260 ++out_ptr;
261 }
262}
263#endif /* LV_HAVE_GENERIC */
264
265// After lengthy thought and quite a bit of whiteboarding:
266#ifdef LV_HAVE_GENERIC
267static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
268 const uint32_t* in,
269 unsigned int num_points)
270{
271 const uint32_t* in_ptr = in;
272 uint32_t* out_ptr = out;
273 unsigned int number = 0;
274 for (; number < num_points; ++number) {
275 uint32_t tmp = *in_ptr;
276 /* permute uint16:
277 The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
278 */
279 tmp = (tmp << 16) | (tmp >> 16);
280 /* permute bytes:
281 shift up by 1 B first, then only consider even bytes, and OR with the unshifted
282 even bytes
283 */
284 tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
285 /* permute 4bit tuples:
286 Same idea, but the "consideration" mask expression becomes unwieldy
287 */
288 tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
289 ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
290 /* permute 2bit tuples:
291 Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
292 3; we need those every 4b, which coincides with a hex digit!
293 */
294 tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
295 /* permute odd/even:
296 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) =
297 0x05!
298 */
299 tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
300
301 *out_ptr = tmp;
302 ++in_ptr;
303 ++out_ptr;
304 }
305}
306#endif /* LV_HAVE_GENERIC */
307#ifdef LV_HAVE_GENERIC
308static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
309 const uint32_t* in,
310 unsigned int num_points)
311{
312 // same stuff as top_down, inverted order (permutation matrices don't care, you know!)
313 const uint32_t* in_ptr = in;
314 uint32_t* out_ptr = out;
315 unsigned int number = 0;
316 for (; number < num_points; ++number) {
317 uint32_t tmp = *in_ptr;
318 tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
319 tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
320 tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
321 ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
322 tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
323 tmp = (tmp << 16) | (tmp >> 16);
324
325 *out_ptr = tmp;
326 ++in_ptr;
327 ++out_ptr;
328 }
329}
330#endif /* LV_HAVE_GENERIC */
331
332#ifdef LV_HAVE_NEONV8
333#include <arm_neon.h>
334
335static inline void
336volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
337{
338 const uint32_t* in_ptr = in;
339 uint32_t* out_ptr = out;
340
341 const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
342
343 const unsigned int quarterPoints = num_points / 4;
344 unsigned int number = 0;
345 for (; number < quarterPoints; ++number) {
346 __VOLK_PREFETCH(in_ptr + 4);
347 uint32x4_t x = vld1q_u32(in_ptr);
348 uint32x4_t z =
349 vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
350 vst1q_u32(out_ptr, z);
351 in_ptr += 4;
352 out_ptr += 4;
353 }
354 number = quarterPoints * 4;
355 for (; number < num_points; ++number) {
356 *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) |
357 (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
358 (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
359 (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
360 ++in_ptr;
361 ++out_ptr;
362 }
363}
364
365#endif /* LV_HAVE_NEONV8 */
366
367#ifdef LV_HAVE_NEON
368#include <arm_neon.h>
369
370#if defined(__aarch64__)
371#define DO_RBIT \
372 __VOLK_ASM("rbit %w[result], %w[value]" \
373 : [result] "=r"(*out_ptr) \
374 : [value] "r"(*in_ptr) \
375 :); \
376 in_ptr++; \
377 out_ptr++;
378#else
379#define DO_RBIT \
380 __VOLK_ASM("rbit %[result], %[value]" \
381 : [result] "=r"(*out_ptr) \
382 : [value] "r"(*in_ptr) \
383 :); \
384 in_ptr++; \
385 out_ptr++;
386#endif
387
388static inline void
389volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
390{
391
392 const uint32_t* in_ptr = in;
393 uint32_t* out_ptr = out;
394 const unsigned int eighthPoints = num_points / 8;
395 unsigned int number = 0;
396 for (; number < eighthPoints; ++number) {
397 __VOLK_PREFETCH(in_ptr + 8);
398 DO_RBIT;
399 DO_RBIT;
400 DO_RBIT;
401 DO_RBIT;
402 DO_RBIT;
403 DO_RBIT;
404 DO_RBIT;
405 DO_RBIT;
406 }
407 number = eighthPoints * 8;
408 for (; number < num_points; ++number) {
409 DO_RBIT;
410 }
411}
412#undef DO_RBIT
413#endif /* LV_HAVE_NEON */
414
415
416#endif /* INCLUDED_volk_32u_reverse_32u_u_H */