66#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
67#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
76 const unsigned int num_points)
79 for (
unsigned int i = 0; i < num_points; ++i) {
84 diff = symbol - *points++;
96volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(
float* target,
100 unsigned int num_points)
102 const unsigned int num_bytes = num_points * 8;
105 __m256 xmm_points0, xmm_points1, xmm_result;
107 const unsigned int bound = num_bytes >> 6;
110 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
111 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
114 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
118 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
120 for (
unsigned int i = 0; i < bound; ++i) {
121 xmm_points0 = _mm256_load_ps((
float*)points);
122 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
127 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
129 _mm256_store_ps(target, xmm_result);
133 if (num_bytes >> 5 & 1) {
134 xmm_points0 = _mm256_load_ps((
float*)points);
136 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
140 xmm6 = _mm256_mul_ps(xmm4, xmm4);
142 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
145 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
147 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
152 if (num_bytes >> 4 & 1) {
176#include <immintrin.h>
184 unsigned int num_points)
186 const int eightsPoints = num_points / 8;
187 const int remainder = num_points - 8 * eightsPoints;
189 __m256 xmm_points0, xmm_points1, xmm_result;
192 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
195 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
197 for (
int i = 0; i < eightsPoints; ++i) {
198 xmm_points0 = _mm256_load_ps((
float*)points);
199 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
203 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
205 _mm256_store_ps(target, xmm_result);
217#include <pmmintrin.h>
225 unsigned int num_points)
227 __m128 xmm_points0, xmm_points1, xmm_result;
235 const int quarterPoints = num_points / 4;
236 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237 const int leftovers1 = num_points % 2;
245 for (
int i = 0; i < quarterPoints; ++i) {
252 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
258 for (
int i = 0; i < leftovers0; ++i) {
262 xmm_points0 =
_mm_sub_ps(xmm_symbol, xmm_points0);
263 xmm_points0 =
_mm_mul_ps(xmm_points0, xmm_points0);
264 xmm_points0 =
_mm_hadd_ps(xmm_points0, xmm_points0);
265 xmm_result =
_mm_mul_ps(xmm_points0, xmm_scalar);
278#include <xmmintrin.h>
284 unsigned int num_points)
289 for (
unsigned i = 0; i < num_points / 4; ++i) {
294 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
303#ifdef LV_HAVE_GENERIC
309 unsigned int num_points)
320#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
321#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
327#include <immintrin.h>
331volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(
float* target,
335 unsigned int num_points)
337 const unsigned int num_bytes = num_points * 8;
340 __m256 xmm_points0, xmm_points1, xmm_result;
342 const unsigned int bound = num_bytes >> 6;
345 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
346 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
349 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
350 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
353 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
355 for (
unsigned int i = 0; i < bound; ++i) {
356 xmm_points0 = _mm256_loadu_ps((
float*)points);
357 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
362 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
364 _mm256_storeu_ps(target, xmm_result);
368 if (num_bytes >> 5 & 1) {
369 xmm_points0 = _mm256_loadu_ps((
float*)points);
371 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
375 xmm6 = _mm256_mul_ps(xmm4, xmm4);
377 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
378 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
380 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
382 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
387 if (num_bytes >> 4 & 1) {
411#include <immintrin.h>
419 unsigned int num_points)
421 const int eightsPoints = num_points / 8;
422 const int remainder = num_points - 8 * eightsPoints;
424 __m256 xmm_points0, xmm_points1, xmm_result;
427 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
430 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
432 for (
int i = 0; i < eightsPoints; ++i) {
433 xmm_points0 = _mm256_loadu_ps((
float*)points);
434 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
438 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
440 _mm256_storeu_ps(target, xmm_result);
452#include <pmmintrin.h>
460 unsigned int num_points)
462 __m128 xmm_points0, xmm_points1, xmm_result;
470 const int quarterPoints = num_points / 4;
471 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
472 const int leftovers1 = num_points % 2;
480 for (
int i = 0; i < quarterPoints; ++i) {
487 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
493 for (
int i = 0; i < leftovers0; ++i) {
497 xmm_points0 =
_mm_sub_ps(xmm_symbol, xmm_points0);
498 xmm_points0 =
_mm_mul_ps(xmm_points0, xmm_points0);
499 xmm_points0 =
_mm_hadd_ps(xmm_points0, xmm_points0);
500 xmm_result =
_mm_mul_ps(xmm_points0, xmm_scalar);
513#include <xmmintrin.h>
519 unsigned int num_points)
524 for (
unsigned i = 0; i < num_points / 4; ++i) {
529 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);