#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>
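
/* Overview comment (added for this excerpt): these kernels polar-encode a
 * frame of frame_size values, stored one byte per bit. The input bits are
 * expected in 'temp'; the encoder runs log2(frame_size) butterfly stages,
 * each combining pairs of bytes (XOR results into the first half of a branch,
 * pass-through bytes into the second half), and leaves the codeword in
 * 'frame'. */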
static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    // integer log2 of a power of two: each mask tests one bit of the exponent
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}
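
/* Worked example (added for this excerpt): for val = 1u << 10 = 1024, mask
 * b[i] reports whether bit i of the set bit's position is 1; here bits 1 and
 * 3 fire, so the helper returns 2 + 8 = 10 = log2(1024). */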
static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}
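
/* Illustration (added for this excerpt): with num_branches = 1 and
 * frame_half = 2, one stage reads temp = [u0 u1 u2 u3] and writes
 * frame = [u0^u1, u2^u3, u1, u3]; XORed pairs fill the first half of the
 * branch, the odd-indexed inputs pass through to the second half. */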
static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage) {
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
}
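
/* Usage sketch (illustration added for this excerpt; assumes the caller owns
 * both buffers and frame_size is a power of two):
 *
 *     unsigned char frame[8];                              // encoded output
 *     unsigned char temp[8] = { 1, 0, 1, 1, 0, 0, 1, 0 };  // input bits, one byte each
 *     volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 8);
 *     // 'frame' now holds the polar-encoded codeword; 'temp' is scratch and
 *     // is overwritten during encoding.
 */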
static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    if (frame_size < 16) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    __m128i r_frame0, r_temp0, shifted;

    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
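    /* Explanatory comment (added for this excerpt): shuffle_separate moves the
     * even-indexed bytes of a 16-byte register into its low half and the
     * odd-indexed bytes into its high half, de-interleaving the (even, odd)
     * pairs produced by one butterfly stage. */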
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 16) {

        frame_ptr += frame_half;

    memcpy(temp, frame, sizeof(unsigned char) * frame_size);
    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
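    /* Explanatory comment (added for this excerpt): this permutation reorders
     * the 16 bytes of a branch so the remaining butterfly stages can be
     * completed inside a single register using byte shifts of 8, 4, 2 and 1
     * combined with per-stage masks (see the analogous AVX2 tail loop below). */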
    for (branch = 0; branch < num_branches; ++branch) {
#include <immintrin.h>
static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    if (frame_size < 32) {
        /* small frames: fall back to the scalar kernel */
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    __m256i r_frame1, r_temp1;

    const __m256i shuffle_separate = _mm256_setr_epi8(0,

    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
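    /* Note (added for this excerpt): the __m128i temporaries and
     * shuffle_separate128 appear to serve the remainder path taken when fewer
     * than 32 bytes of a branch are left, cf. the (frame_half - bit) check in
     * the loop below; the main path works on full 32-byte registers. */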
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 32) {
            if ((frame_half - bit) <
            r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
            temp_ptr += 32;
            r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
            temp_ptr += 32;

            shifted = _mm256_srli_si256(r_temp0, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp0 = _mm256_xor_si256(shifted, r_temp0);
            r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

            shifted = _mm256_srli_si256(r_temp1, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp1 = _mm256_xor_si256(shifted, r_temp1);
            r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

            r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
            r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
            r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
            r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

            _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
            _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
            frame_ptr += 32;
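            /* Explanatory comment (added for this excerpt): each pass handles
             * 64 input bytes. The shift-mask-XOR computes u_even ^ u_odd in
             * the even byte slots, shuffle_separate de-interleaves even/odd
             * bytes per 128-bit lane, and the unpack/permute steps gather the
             * 32 XOR results for frame_ptr and the 32 pass-through bytes for
             * frame_ptr + frame_half. */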
        }
        frame_ptr += frame_half;
    }

    memcpy(temp, frame, sizeof(unsigned char) * frame_size);
    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
    const __m256i mask_stage2 = _mm256_set_epi8(0x0,

    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
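    /* Explanatory comment (added for this excerpt): at this point each branch
     * is 16 bytes, so one 32-byte register covers two branches and the loop
     * above runs num_branches / 2 times. The remaining butterfly stages are
     * finished entirely in-register: after the shuffle_stage4 pre-permutation,
     * byte shifts of 8, 4, 2 and 1 (per 128-bit lane), masked by
     * mask_stage4..mask_stage1, XOR each byte with its partner for that
     * stage. */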
#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#include <string.h>
#include <tmmintrin.h>
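
/* Note (added for this excerpt): the _a_ (aligned) kernels below mirror the
 * unaligned _u_ kernels above; the visible difference is the use of aligned
 * load/store intrinsics (_mm256_load_si256 / _mm256_store_si256) where the
 * unaligned variants use _mm256_loadu_si256 / _mm256_storeu_si256. */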
static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    if (frame_size < 16) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    __m128i r_frame0, r_temp0, shifted;

    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 16) {

        frame_ptr += frame_half;

    memcpy(temp, frame, sizeof(unsigned char) * frame_size);
    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    for (branch = 0; branch < num_branches; ++branch) {
#include <immintrin.h>
static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    if (frame_size < 32) {
        /* small frames: fall back to the scalar kernel */
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    __m256i r_frame1, r_temp1;

    const __m256i shuffle_separate = _mm256_setr_epi8(0,

    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 32) {
            if ((frame_half - bit) <
            r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
            temp_ptr += 32;
            r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
            temp_ptr += 32;

            shifted = _mm256_srli_si256(r_temp0, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp0 = _mm256_xor_si256(shifted, r_temp0);
            r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

            shifted = _mm256_srli_si256(r_temp1, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp1 = _mm256_xor_si256(shifted, r_temp1);
            r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

            r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
            r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
            r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
            r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

            _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
            _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
            frame_ptr += 32;
        }
        frame_ptr += frame_half;
    }

    memcpy(temp, frame, sizeof(unsigned char) * frame_size);
    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }