61#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
62#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
static inline float llr_odd(const float la, const float lb)
{
    /* Min-sum ("box-plus") approximation of the odd-stage LLR update:
     * the result carries the product of the two input signs and the
     * smaller of the two magnitudes. */
    const float mag_a = fabsf(la);
    const float mag_b = fabsf(lb);
    const float min_mag = (mag_a < mag_b) ? mag_a : mag_b;
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * min_mag;
}
/* NOTE(review): extraction fragment — the function header naming this
 * routine is not visible (the parameter list suggests a stage-wise
 * llr_odd driver), and the pointer/loop-variable declarations, the
 * loop-update statements, and the closing braces are also missing from
 * this view.  Comments below describe only the visible lines. */
74 float* llrs,
/* min_stage: lowest stage index (inclusive) to process. */
int min_stage,
/* depth: one past the highest stage index; iteration starts at depth - 1. */
const int depth,
/* frame_size: stride (in floats) between consecutive stages inside llrs. */
const int frame_size,
/* row: starting row offset within each stage. */
const int row)
76 int loop_stage = depth - 1;
/* Each stage produces 2^loop_stage butterfly outputs. */
79 int stage_size = 0x01 << loop_stage;
/* Walk stages from the deepest (depth - 1) down toward min_stage; the
 * loop_stage decrement is not visible in this fragment — presumably it
 * follows the inner loop together with a stage_size halving (TODO confirm). */
82 while (min_stage <= loop_stage) {
/* Destination: current stage; source: the next (deeper) stage. */
83 dst_llr_ptr = llrs + loop_stage * frame_size + row;
84 src_llr_ptr = dst_llr_ptr + frame_size;
85 for (el = 0; el < stage_size; el++) {
/* Combine each adjacent LLR pair with the min-sum odd update. */
86 *dst_llr_ptr++ =
llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
/* llr_even: LLR update for the even (lower) butterfly input.
 * f is the previously decided partial-sum bit steering the combination;
 * the function body is not visible in this fragment — confirm semantics
 * against the full source. */
95static inline float llr_even(
const float la,
const float lb,
const unsigned char f)
/* even_u_values: gathers bits from u into u_even.  The return-type line
 * and the loop body are missing from this fragment; the visible loop
 * header steps i over odd indices 1, 3, 5, ... below u_num. */
106even_u_values(
unsigned char* u_even,
const unsigned char* u,
const int u_num)
110 for (i = 1; i < u_num; i += 2) {
/* Fragment of a second bit-preparation loop (the name u_xor suggests an
 * odd-XOR-even helper — TODO confirm): each output byte is the XOR of an
 * adjacent pair of input bits.  The pointer advance and closing brace are
 * not visible here. */
120 for (i = 1; i < u_num; i += 2) {
121 *u_xor++ = *u ^ *(u + 1);
/* Fragment: computes how many stages above `row` can be updated in one
 * pass (the function header is not visible; presumably a
 * max-stage-depth-for-row helper — TODO confirm).  A row belongs to the
 * upper half of a stage while row % stage_size < half_stage_size; the
 * loop widens the stage until that no longer holds.  The statements that
 * advance max_stage_depth and stage_size are missing from this view. */
128 int max_stage_depth = 0;
129 int half_stage_size = 0x01;
130 int stage_size = half_stage_size << 1;
131 while (max_stage_depth < (frame_exp - 1)) {
/* Stop as soon as `row` falls into the lower half of the current stage. */
132 if (!(row % stage_size < half_stage_size)) {
135 half_stage_size <<= 1;
139 return max_stage_depth;
142#ifdef LV_HAVE_GENERIC
/* Generic (scalar) polar-decoder butterfly.  NOTE(review): the function
 * header, several declarations, the recursive call names, and the closing
 * braces are missing from this extraction fragment; comments describe
 * only the visible logic. */
151 const int frame_size = 0x01 << frame_exp;
152 const int next_stage = stage + 1;
154 const int half_stage_size = 0x01 << stage;
155 const int stage_size = half_stage_size << 1;
/* Upper half of a stage takes the llr_odd path; lower half takes llr_even. */
157 const bool is_upper_stage_half = row % stage_size < half_stage_size;
/* LLRs of the next (deeper) stage start one frame further into llrs. */
160 float* next_llrs = llrs + frame_size;
161 float* call_row_llr = llrs + row;
/* Locate the butterfly partner rows inside this row's stage section. */
163 const int section = row - (row % stage_size);
164 const int jump_size = ((row % half_stage_size) << 1) % stage_size;
166 const int next_upper_row = section + jump_size;
167 const int next_lower_row = next_upper_row + 1;
169 const float* upper_right_llr_ptr = next_llrs + next_upper_row;
170 const float* lower_right_llr_ptr = next_llrs + next_lower_row;
/* Lower half: the even update consumes an already-decided bit f. */
172 if (!is_upper_stage_half) {
173 const int u_pos = u_num >> stage;
174 const unsigned char f = u[u_pos - 1];
175 *call_row_llr =
llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
/* Upper half: first recurse into the deeper stage for both partner rows
 * (the recursive call names are not visible in this fragment). */
179 if (frame_exp > next_stage) {
180 unsigned char* u_half = u + frame_size;
183 next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
187 next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
/* Then combine the two refreshed deeper-stage LLRs with the odd update. */
190 *call_row_llr =
llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
197#include <immintrin.h>
/* AVX-accelerated butterfly.  NOTE(review): extraction fragment — the
 * function header, several declarations, the vector arithmetic inside
 * the loops, and the closing braces are not visible; comments describe
 * only the visible lines. */
207 const int frame_size = 0x01 << frame_exp;
/* Base case: only the even update for this row is required. */
209 const float* next_llrs = llrs + frame_size + row;
210 *(llrs + row) =
llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
/* SIMD only pays off for stages with at least 8 lanes (2^3). */
215 if (max_stage_depth < 3) {
220 int loop_stage = max_stage_depth;
221 int stage_size = 0x01 << loop_stage;
226 __m256 src0, src1, dst;
/* Scratch areas placed behind the caller-provided u buffer; the trailing
 * stage_size bits are copied out — presumably for a re-encode step whose
 * call is not visible in this fragment (TODO confirm). */
231 unsigned char* u_target = u + frame_size;
232 unsigned char* u_temp = u + 2 * frame_size;
233 memcpy(u_temp, u + u_num - stage_size,
sizeof(
unsigned char) * stage_size);
237 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
238 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
/* First pass: vectorized even update, 8 outputs per iteration. */
243 for (p = 0; p < stage_size; p += 8) {
247 src0 = _mm256_loadu_ps(src_llr_ptr);
248 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
253 _mm256_storeu_ps(dst_llr_ptr, dst);
/* Remaining stages use the vectorized odd update down to stage 2;
 * stages below that are handled outside this visible span. */
261 const int min_stage = stage > 2 ? stage : 2;
266 while (min_stage < loop_stage) {
267 dst_llr_ptr = llrs + loop_stage * frame_size + row;
268 src_llr_ptr = dst_llr_ptr + frame_size;
269 for (el = 0; el < stage_size; el += 8) {
270 src0 = _mm256_loadu_ps(src_llr_ptr);
272 src1 = _mm256_loadu_ps(src_llr_ptr);
277 _mm256_storeu_ps(dst_llr_ptr, dst);
292#include <immintrin.h>
/* AVX2-accelerated butterfly.  NOTE(review): extraction fragment — most
 * of the parameter list, declarations, vector arithmetic, and closing
 * braces are not visible.  The visible structure mirrors the AVX variant
 * above, with the bit re-encode done by the AVX2 encode-frame kernel. */
295static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(
float* llrs,
302 const int frame_size = 0x01 << frame_exp;
/* Base case: single even update for this row. */
304 const float* next_llrs = llrs + frame_size + row;
305 *(llrs + row) =
llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
/* SIMD only pays off for stages with at least 8 lanes (2^3). */
310 if (max_stage_depth < 3) {
315 int loop_stage = max_stage_depth;
316 int stage_size = 0x01 << loop_stage;
321 __m256 src0, src1, dst;
/* Scratch areas placed behind the caller-provided u buffer. */
326 unsigned char* u_target = u + frame_size;
327 unsigned char* u_temp = u + 2 * frame_size;
328 memcpy(u_temp, u + u_num - stage_size,
sizeof(
unsigned char) * stage_size);
/* Re-encode the trailing stage_size bits so the even updates see the
 * required partial sums. */
330 volk_8u_x2_encodeframepolar_8u_u_avx2(u_target, u_temp, stage_size);
332 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
333 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
/* First pass: vectorized even update, 8 outputs per iteration. */
338 for (p = 0; p < stage_size; p += 8) {
342 src0 = _mm256_loadu_ps(src_llr_ptr);
343 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
348 _mm256_storeu_ps(dst_llr_ptr, dst);
/* Remaining stages use the vectorized odd update down to stage 2. */
356 const int min_stage = stage > 2 ? stage : 2;
361 while (min_stage < loop_stage) {
362 dst_llr_ptr = llrs + loop_stage * frame_size + row;
363 src_llr_ptr = dst_llr_ptr + frame_size;
364 for (el = 0; el < stage_size; el += 8) {
365 src0 = _mm256_loadu_ps(src_llr_ptr);
367 src1 = _mm256_loadu_ps(src_llr_ptr);
372 _mm256_storeu_ps(dst_llr_ptr, dst);