#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H


#include <math.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
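/*
 * The rotator multiplies each input sample by a unit-magnitude phasor and then
 * advances that phasor by *phase_inc. Because the phasor is updated with
 * single-precision multiplies, its magnitude slowly drifts away from 1.0, so
 * every ROTATOR_RELOAD points the kernels rescale it back to the unit circle.
 * ROTATOR_RELOAD_2 and ROTATOR_RELOAD_4 are the corresponding iteration counts
 * for loops that consume 2 or 4 complex samples per pass.
 *
 * Minimal usage sketch (an illustration only, assuming the usual VOLK
 * dispatcher generated for this kernel plus <volk/volk.h> and <math.h>):
 *
 *   unsigned int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f)); // rotate 0.1 rad per sample
 *   lv_32fc_t phase = lv_cmake(1.0f, 0.0f);                 // starting phase
 *   // ... fill in[] with samples ...
 *   volk_32fc_s32fc_x2_rotator2_32fc(out, in, &phase_inc, &phase, N);
 *   volk_free(in);
 *   volk_free(out);
 */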
static inline void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t* phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    unsigned int i = 0, j = 0;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= *phase_inc;
        }
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= *phase_inc;
    }
}
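/* The NEON kernel below works on four complex samples per step. vld2q_f32
   deinterleaves the data so the four real parts sit in val[0] and the four
   imaginary parts in val[1] of each float32x4x2_t; vst2q_f32 re-interleaves
   them on store. */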
#include <arm_neon.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t* phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
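    /* Seed the four phase lanes with phase, phase*inc, phase*inc^2 and
       phase*inc^3; after the loop incr holds phase_inc^4, the amount every
       lane advances per 4-sample vector. */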
    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (*phase_inc);
    }

    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);

        vst2q_f32((float*)outputVectorPtr, output_vec);

        inputVectorPtr += 4;
        outputVectorPtr += 4;
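        /* Every ROTATOR_RELOAD points, rescale both phase lanes by the
           reciprocal magnitude so the phasor stays on the unit circle. */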
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
        input_vec = vld2q_f32((float*)inputVectorPtr);

        vst2q_f32((float*)outputVectorPtr, output_vec);

        inputVectorPtr += 4;
        outputVectorPtr += 4;

    phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
    phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);

    vst2q_f32((float*)phasePtr, phase_vec);
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (*phase_inc);
    }

    (*phase) = phasePtr[0];
}
#include <smmintrin.h>
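/* The SSE4.1 kernels keep two complex floats per __m128, so each vector
   iteration advances the phase register by phase_inc squared and a reload
   block of ROTATOR_RELOAD points spans ROTATOR_RELOAD_2 iterations. */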
static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t* phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#include <smmintrin.h>
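/* Unaligned variant: identical math to the aligned kernel above, but the
   buffers are accessed with unaligned SSE loads and stores. */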
static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t* phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#include <immintrin.h>
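/* The AVX kernels hold four interleaved complex floats (re, im, re, im, ...)
   per __m256. The four phase copies are offset by successive powers of the
   increment, so each 4-sample step multiplies the phase register by
   phase_inc^4. The last num_points % 4 samples are handed to the generic
   kernel. */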
static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector,
                                                          const lv_32fc_t* inVector,
                                                          const lv_32fc_t* phase_inc,
                                                          lv_32fc_t* phase,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }
    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        aVal = _mm256_load_ps((float*)aPtr);

        _mm256_store_ps((float*)cPtr, z);
        aVal = _mm256_load_ps((float*)aPtr);

        _mm256_store_ps((float*)cPtr, z);
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];

    volk_32fc_s32fc_x2_rotator2_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}
#include <immintrin.h>
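/* Unaligned AVX variant: same algorithm as above, with _mm256_loadu_ps /
   _mm256_storeu_ps for data that is not guaranteed to be 32-byte aligned. */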
static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector,
                                                          const lv_32fc_t* inVector,
                                                          const lv_32fc_t* phase_inc,
                                                          lv_32fc_t* phase,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }
    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        _mm256_storeu_ps((float*)cPtr, z);
        aVal = _mm256_loadu_ps((float*)aPtr);

        _mm256_storeu_ps((float*)cPtr, z);
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];

    volk_32fc_s32fc_x2_rotator2_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
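/* AVX + FMA complex multiply: the real and imaginary parts of the phase are
   duplicated with moveldup/movehdup, the sample's re/im pairs are swapped
   with a 0xB1 shuffle, and _mm256_fmaddsub_ps combines the two partial
   products (subtracting in the even/real lanes, adding in the odd/imaginary
   lanes). The same sequence advances phase_Val by inc_Val each iteration. */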
static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                              const lv_32fc_t* inVector,
                                                              const lv_32fc_t* phase_inc,
                                                              lv_32fc_t* phase,
                                                              unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);
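        /* Renormalize phase_Val: hadd builds each pair's squared magnitude,
           the 0xD8 shuffle places it in both the real and imaginary slots,
           then sqrt and divide scale the phasor back to unit length. */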
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);
    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);
    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
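/* Unaligned AVX + FMA variant: the same fmaddsub-based rotator, using
   unaligned loads and stores for the input, output, and phase buffers. */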
static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                              const lv_32fc_t* inVector,
                                                              const lv_32fc_t* phase_inc,
                                                              lv_32fc_t* phase,
                                                              unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);
    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}