Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
volk_8u_x4_conv_k7_r2_8u.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
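/*
 * Overview, inferred from the kernel name and the constants used below: this
 * kernel performs the add-compare-select (butterfly) stage of a Viterbi decoder
 * for a rate-1/2, constraint-length-7 convolutional code (64 trellis states).
 * X and Y hold the 64 per-state path metrics before and after a step, syms
 * holds the received soft symbols (two per decoded bit), dec receives one
 * packed decision bit per state per step, and Branchtab holds the 64
 * precomputed branch-metric bytes.
 */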
45#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
46#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
47
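/* One step's worth of survivor decisions: a single bit per trellis state
 * (64 bits in total), viewable as bytes, 16-bit or 32-bit words so each
 * protokernel can pack decisions with whatever store width suits it. */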
48typedef union {
49 unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
50 unsigned int w[64 /*NUMSTATES*/ / 32];
51 unsigned short s[64 /*NUMSTATES*/ / 16];
52 unsigned char c[64 /*NUMSTATES*/ / 8];
53#ifdef _MSC_VER
54} decision_t;
55#else
56} decision_t __attribute__((aligned(16)));
57#endif
58
59
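/* Subtract the smallest of the 64 path metrics from all of them so the
 * unsigned 8-bit metrics stay away from saturation.  The threshold parameter
 * is currently unused because the guarding check is commented out below. */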
60static inline void renormalize(unsigned char* X, unsigned char threshold)
61{
62 int NUMSTATES = 64;
63 int i;
64
65 unsigned char min = X[0];
66 // if(min > threshold) {
67 for (i = 0; i < NUMSTATES; i++)
68 if (min > X[i])
69 min = X[i];
70 for (i = 0; i < NUMSTATES; i++)
71 X[i] -= min;
72 //}
73}
74
75
76// One add-compare-select butterfly; used by the generic kernel and for the
76// odd trailing bit in the SIMD kernels below
77static inline void BFLY(int i,
78 int s,
79 unsigned char* syms,
80 unsigned char* Y,
81 unsigned char* X,
82 decision_t* d,
83 unsigned char* Branchtab)
84{
85 int j;
86 unsigned int decision0, decision1;
87 unsigned char metric, m0, m1, m2, m3;
88
89 int NUMSTATES = 64;
90 int RATE = 2;
91 int METRICSHIFT = 2;
92 int PRECISIONSHIFT = 2;
93
94 metric = 0;
95 for (j = 0; j < RATE; j++)
96 metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
97 metric = metric >> PRECISIONSHIFT;
98
99 unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
100
101 m0 = X[i] + metric;
102 m1 = X[i + NUMSTATES / 2] + (max - metric);
103 m2 = X[i] + (max - metric);
104 m3 = X[i + NUMSTATES / 2] + metric;
105
106 decision0 = (signed int)(m0 - m1) > 0;
107 decision1 = (signed int)(m2 - m3) > 0;
108
109 Y[2 * i] = decision0 ? m1 : m0;
110 Y[2 * i + 1] = decision1 ? m3 : m2;
111
112 d->w[i / (sizeof(unsigned int) * 8 / 2) +
113 s * (sizeof(decision_t) / sizeof(unsigned int))] |=
114 (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
115}
116
117
118//#if LV_HAVE_AVX2
119//
120//#include <immintrin.h>
121//#include <stdio.h>
122//
123// static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
124// unsigned char* X,
125// unsigned char* syms,
126// unsigned char* dec,
127// unsigned int framebits,
128// unsigned int excess,
129// unsigned char* Branchtab)
130//{
131// unsigned int i9;
132// for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
133// unsigned char a75, a81;
134// int a73, a92;
135// int s20, s21;
136// unsigned char *a80, *b6;
137// int *a110, *a91, *a93;
138// __m256i *a112, *a71, *a72, *a77, *a83, *a95;
139// __m256i a86, a87;
140// __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
141// m26,
142// s18, s19, s22, s23, s24, s25, t13, t14, t15;
143// a71 = ((__m256i*)X);
144// s18 = *(a71);
145// a72 = (a71 + 1);
146// s19 = *(a72);
147// s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
148// s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
149// s18 = s22;
150// a73 = (4 * i9);
151// b6 = (syms + a73);
152// a75 = *(b6);
153// a76 = _mm256_set1_epi8(a75);
154// a77 = ((__m256i*)Branchtab);
155// a78 = *(a77);
156// a79 = _mm256_xor_si256(a76, a78);
157// a80 = (b6 + 1);
158// a81 = *(a80);
159// a82 = _mm256_set1_epi8(a81);
160// a83 = (a77 + 1);
161// a84 = *(a83);
162// a85 = _mm256_xor_si256(a82, a84);
163// t13 = _mm256_avg_epu8(a79, a85);
164// a86 = ((__m256i)t13);
165// a87 = _mm256_srli_epi16(a86, 2);
166// a88 = ((__m256i)a87);
167// t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
168// t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
169// m23 = _mm256_adds_epu8(s18, t14);
170// m24 = _mm256_adds_epu8(s19, t15);
171// m25 = _mm256_adds_epu8(s18, t15);
172// m26 = _mm256_adds_epu8(s19, t14);
173// a89 = _mm256_min_epu8(m24, m23);
174// d9 = _mm256_cmpeq_epi8(a89, m24);
175// a90 = _mm256_min_epu8(m26, m25);
176// d10 = _mm256_cmpeq_epi8(a90, m26);
177// s22 = _mm256_unpacklo_epi8(d9, d10);
178// s23 = _mm256_unpackhi_epi8(d9, d10);
179// s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
180// a91 = ((int*)dec);
181// a92 = (4 * i9);
182// a93 = (a91 + a92);
183// *(a93) = s20;
184// s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
185// a110 = (a93 + 1);
186// *(a110) = s21;
187// s22 = _mm256_unpacklo_epi8(a89, a90);
188// s23 = _mm256_unpackhi_epi8(a89, a90);
189// a95 = ((__m256i*)Y);
190// s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
191// *(a95) = s24;
192// s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
193// a112 = (a95 + 1);
194// *(a112) = s23;
195// if ((((unsigned char*)Y)[0] > 210)) {
196// __m256i m5, m6;
197// m5 = ((__m256i*)Y)[0];
198// m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
199// __m256i m7;
200// m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
201// m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
202// ((__m256i)m7)));
203// m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
204// ((__m256i)m7)));
205// m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
206// ((__m256i)m7)));
207// m7 = _mm256_unpacklo_epi8(m7, m7);
208// m7 = _mm256_shufflelo_epi16(m7, 0);
209// m6 = _mm256_unpacklo_epi64(m7, m7);
210// m6 = _mm256_permute2x128_si256(
211// m6, m6, 0); // copy lower half of m6 to upper half, since above ops
212// // operate on 128 bit lanes
213// ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
214// ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
215// }
216// unsigned char a188, a194;
217// int a205;
218// int s48, s54;
219// unsigned char *a187, *a193;
220// int *a204, *a206, *a223, *b16;
221// __m256i *a184, *a185, *a190, *a196, *a208, *a225;
222// __m256i a199, a200;
223// __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
224// m40,
225// m41, m42, s46, s47, s50, s51, t25, t26, t27;
226// a184 = ((__m256i*)Y);
227// s46 = *(a184);
228// a185 = (a184 + 1);
229// s47 = *(a185);
230// s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
231// s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
232// s46 = s50;
233// a187 = (b6 + 2);
234// a188 = *(a187);
235// a189 = _mm256_set1_epi8(a188);
236// a190 = ((__m256i*)Branchtab);
237// a191 = *(a190);
238// a192 = _mm256_xor_si256(a189, a191);
239// a193 = (b6 + 3);
240// a194 = *(a193);
241// a195 = _mm256_set1_epi8(a194);
242// a196 = (a190 + 1);
243// a197 = *(a196);
244// a198 = _mm256_xor_si256(a195, a197);
245// t25 = _mm256_avg_epu8(a192, a198);
246// a199 = ((__m256i)t25);
247// a200 = _mm256_srli_epi16(a199, 2);
248// a201 = ((__m256i)a200);
249// t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
250// t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
251// m39 = _mm256_adds_epu8(s46, t26);
252// m40 = _mm256_adds_epu8(s47, t27);
253// m41 = _mm256_adds_epu8(s46, t27);
254// m42 = _mm256_adds_epu8(s47, t26);
255// a202 = _mm256_min_epu8(m40, m39);
256// d17 = _mm256_cmpeq_epi8(a202, m40);
257// a203 = _mm256_min_epu8(m42, m41);
258// d18 = _mm256_cmpeq_epi8(a203, m42);
259// s24 = _mm256_unpacklo_epi8(d17, d18);
260// s25 = _mm256_unpackhi_epi8(d17, d18);
261// s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
262// a204 = ((int*)dec);
263// a205 = (4 * i9);
264// b16 = (a204 + a205);
265// a206 = (b16 + 2);
266// *(a206) = s48;
267// s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
268// a223 = (b16 + 3);
269// *(a223) = s54;
270// s50 = _mm256_unpacklo_epi8(a202, a203);
271// s51 = _mm256_unpackhi_epi8(a202, a203);
272// s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
273// s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
274// a208 = ((__m256i*)X);
275// *(a208) = s25;
276// a225 = (a208 + 1);
277// *(a225) = s51;
278//
279// if ((((unsigned char*)X)[0] > 210)) {
280// __m256i m12, m13;
281// m12 = ((__m256i*)X)[0];
282// m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
283// __m256i m14;
284// m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
285// m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
286// ((__m256i)m14)));
287// m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
288// ((__m256i)m14)));
289// m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
290// ((__m256i)m14)));
291// m14 = _mm256_unpacklo_epi8(m14, m14);
292// m14 = _mm256_shufflelo_epi16(m14, 0);
293// m13 = _mm256_unpacklo_epi64(m14, m14);
294// m13 = _mm256_permute2x128_si256(m13, m13, 0);
295// ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
296// ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
297// }
298// }
299//
300// renormalize(X, 210);
301//
302// unsigned int j;
303// for (j = 0; j < (framebits + excess) % 2; ++j) {
304// int i;
305// for (i = 0; i < 64 / 2; i++) {
306// BFLY(i,
307// (((framebits + excess) >> 1) << 1) + j,
308// syms,
309// Y,
310// X,
311// (decision_t*)dec,
312// Branchtab);
313// }
314//
315// renormalize(Y, 210);
316// }
317// /*skip*/
318//}
319//
320//#endif /*LV_HAVE_AVX2*/
321
322
323#if LV_HAVE_SSE3
324
325#include <emmintrin.h>
326#include <mmintrin.h>
327#include <pmmintrin.h>
328#include <stdio.h>
329#include <xmmintrin.h>
330
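/* SSE3 protokernel.  The body is straight-line code (apparently generated by
 * Spiral, hence the _spiral suffix): each iteration consumes four soft symbols
 * (two decoded bits), forms branch metrics via XOR with Branchtab followed by
 * an average and a shift, runs the add-compare-select with saturating 8-bit
 * arithmetic, packs the decision bits into dec with movemask, and renormalizes
 * the metrics whenever they approach saturation (the "> 210" checks).  Any odd
 * trailing bit is handled after the loop with the generic BFLY helper. */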
331static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
332 unsigned char* X,
333 unsigned char* syms,
334 unsigned char* dec,
335 unsigned int framebits,
336 unsigned int excess,
337 unsigned char* Branchtab)
338{
339 unsigned int i9;
340 for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
341 unsigned char a75, a81;
342 int a73, a92;
343 short int s20, s21, s26, s27;
344 unsigned char *a74, *a80, *b6;
345 short int *a110, *a111, *a91, *a93, *a94;
346 __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
347 __m128i a105, a106, a86, a87;
348 __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
349 a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
350 s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
351 a71 = ((__m128i*)X);
352 s18 = *(a71);
353 a72 = (a71 + 2);
354 s19 = *(a72);
355 a73 = (4 * i9);
356 a74 = (syms + a73);
357 a75 = *(a74);
358 a76 = _mm_set1_epi8(a75);
359 a77 = ((__m128i*)Branchtab);
360 a78 = *(a77);
361 a79 = _mm_xor_si128(a76, a78);
362 b6 = (a73 + syms);
363 a80 = (b6 + 1);
364 a81 = *(a80);
365 a82 = _mm_set1_epi8(a81);
366 a83 = (a77 + 2);
367 a84 = *(a83);
368 a85 = _mm_xor_si128(a82, a84);
369 t13 = _mm_avg_epu8(a79, a85);
370 a86 = ((__m128i)t13);
371 a87 = _mm_srli_epi16(a86, 2);
372 a88 = ((__m128i)a87);
373 t14 = _mm_and_si128(
374 a88,
375 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
376 t15 = _mm_subs_epu8(
377 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
378 t14);
379 m23 = _mm_adds_epu8(s18, t14);
380 m24 = _mm_adds_epu8(s19, t15);
381 m25 = _mm_adds_epu8(s18, t15);
382 m26 = _mm_adds_epu8(s19, t14);
383 a89 = _mm_min_epu8(m24, m23);
384 d9 = _mm_cmpeq_epi8(a89, m24);
385 a90 = _mm_min_epu8(m26, m25);
386 d10 = _mm_cmpeq_epi8(a90, m26);
387 s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
388 a91 = ((short int*)dec);
389 a92 = (8 * i9);
390 a93 = (a91 + a92);
391 *(a93) = s20;
392 s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
393 a94 = (a93 + 1);
394 *(a94) = s21;
395 s22 = _mm_unpacklo_epi8(a89, a90);
396 s23 = _mm_unpackhi_epi8(a89, a90);
397 a95 = ((__m128i*)Y);
398 *(a95) = s22;
399 a96 = (a95 + 1);
400 *(a96) = s23;
401 a97 = (a71 + 1);
402 s24 = *(a97);
403 a98 = (a71 + 3);
404 s25 = *(a98);
405 a99 = (a77 + 1);
406 a100 = *(a99);
407 a101 = _mm_xor_si128(a76, a100);
408 a102 = (a77 + 3);
409 a103 = *(a102);
410 a104 = _mm_xor_si128(a82, a103);
411 t16 = _mm_avg_epu8(a101, a104);
412 a105 = ((__m128i)t16);
413 a106 = _mm_srli_epi16(a105, 2);
414 a107 = ((__m128i)a106);
415 t17 = _mm_and_si128(
416 a107,
417 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
418 t18 = _mm_subs_epu8(
419 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
420 t17);
421 m27 = _mm_adds_epu8(s24, t17);
422 m28 = _mm_adds_epu8(s25, t18);
423 m29 = _mm_adds_epu8(s24, t18);
424 m30 = _mm_adds_epu8(s25, t17);
425 a108 = _mm_min_epu8(m28, m27);
426 d11 = _mm_cmpeq_epi8(a108, m28);
427 a109 = _mm_min_epu8(m30, m29);
428 d12 = _mm_cmpeq_epi8(a109, m30);
429 s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
430 a110 = (a93 + 2);
431 *(a110) = s26;
432 s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
433 a111 = (a93 + 3);
434 *(a111) = s27;
435 s28 = _mm_unpacklo_epi8(a108, a109);
436 s29 = _mm_unpackhi_epi8(a108, a109);
437 a112 = (a95 + 2);
438 *(a112) = s28;
439 a113 = (a95 + 3);
440 *(a113) = s29;
441 if ((((unsigned char*)Y)[0] > 210)) {
442 __m128i m5, m6;
443 m5 = ((__m128i*)Y)[0];
444 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
445 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
446 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
447 __m128i m7;
448 m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
449 m7 =
450 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
451 m7 =
452 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
453 m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
454 m7 = _mm_unpacklo_epi8(m7, m7);
455 m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
456 m6 = _mm_unpacklo_epi64(m7, m7);
457 ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
458 ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
459 ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
460 ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
461 }
462 unsigned char a188, a194;
463 int a186, a205;
464 short int s48, s49, s54, s55;
465 unsigned char *a187, *a193, *b15;
466 short int *a204, *a206, *a207, *a223, *a224, *b16;
467 __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
468 *a225, *a226;
469 __m128i a199, a200, a218, a219;
470 __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
471 a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
472 m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
473 a184 = ((__m128i*)Y);
474 s46 = *(a184);
475 a185 = (a184 + 2);
476 s47 = *(a185);
477 a186 = (4 * i9);
478 b15 = (a186 + syms);
479 a187 = (b15 + 2);
480 a188 = *(a187);
481 a189 = _mm_set1_epi8(a188);
482 a190 = ((__m128i*)Branchtab);
483 a191 = *(a190);
484 a192 = _mm_xor_si128(a189, a191);
485 a193 = (b15 + 3);
486 a194 = *(a193);
487 a195 = _mm_set1_epi8(a194);
488 a196 = (a190 + 2);
489 a197 = *(a196);
490 a198 = _mm_xor_si128(a195, a197);
491 t25 = _mm_avg_epu8(a192, a198);
492 a199 = ((__m128i)t25);
493 a200 = _mm_srli_epi16(a199, 2);
494 a201 = ((__m128i)a200);
495 t26 = _mm_and_si128(
496 a201,
497 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
498 t27 = _mm_subs_epu8(
499 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
500 t26);
501 m39 = _mm_adds_epu8(s46, t26);
502 m40 = _mm_adds_epu8(s47, t27);
503 m41 = _mm_adds_epu8(s46, t27);
504 m42 = _mm_adds_epu8(s47, t26);
505 a202 = _mm_min_epu8(m40, m39);
506 d17 = _mm_cmpeq_epi8(a202, m40);
507 a203 = _mm_min_epu8(m42, m41);
508 d18 = _mm_cmpeq_epi8(a203, m42);
509 s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
510 a204 = ((short int*)dec);
511 a205 = (8 * i9);
512 b16 = (a204 + a205);
513 a206 = (b16 + 4);
514 *(a206) = s48;
515 s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
516 a207 = (b16 + 5);
517 *(a207) = s49;
518 s50 = _mm_unpacklo_epi8(a202, a203);
519 s51 = _mm_unpackhi_epi8(a202, a203);
520 a208 = ((__m128i*)X);
521 *(a208) = s50;
522 a209 = (a208 + 1);
523 *(a209) = s51;
524 a210 = (a184 + 1);
525 s52 = *(a210);
526 a211 = (a184 + 3);
527 s53 = *(a211);
528 a212 = (a190 + 1);
529 a213 = *(a212);
530 a214 = _mm_xor_si128(a189, a213);
531 a215 = (a190 + 3);
532 a216 = *(a215);
533 a217 = _mm_xor_si128(a195, a216);
534 t28 = _mm_avg_epu8(a214, a217);
535 a218 = ((__m128i)t28);
536 a219 = _mm_srli_epi16(a218, 2);
537 a220 = ((__m128i)a219);
538 t29 = _mm_and_si128(
539 a220,
540 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
541 t30 = _mm_subs_epu8(
542 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
543 t29);
544 m43 = _mm_adds_epu8(s52, t29);
545 m44 = _mm_adds_epu8(s53, t30);
546 m45 = _mm_adds_epu8(s52, t30);
547 m46 = _mm_adds_epu8(s53, t29);
548 a221 = _mm_min_epu8(m44, m43);
549 d19 = _mm_cmpeq_epi8(a221, m44);
550 a222 = _mm_min_epu8(m46, m45);
551 d20 = _mm_cmpeq_epi8(a222, m46);
552 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
553 a223 = (b16 + 6);
554 *(a223) = s54;
555 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
556 a224 = (b16 + 7);
557 *(a224) = s55;
558 s56 = _mm_unpacklo_epi8(a221, a222);
559 s57 = _mm_unpackhi_epi8(a221, a222);
560 a225 = (a208 + 2);
561 *(a225) = s56;
562 a226 = (a208 + 3);
563 *(a226) = s57;
564 if ((((unsigned char*)X)[0] > 210)) {
565 __m128i m12, m13;
566 m12 = ((__m128i*)X)[0];
567 m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
568 m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
569 m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
570 __m128i m14;
571 m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
572 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
573 ((__m128i)m14)));
574 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
575 ((__m128i)m14)));
576 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
577 ((__m128i)m14)));
578 m14 = _mm_unpacklo_epi8(m14, m14);
579 m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
580 m13 = _mm_unpacklo_epi64(m14, m14);
581 ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
582 ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
583 ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
584 ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
585 }
586 }
587
588 renormalize(X, 210);
589
590 /*int ch;
591 for(ch = 0; ch < 64; ch++) {
592 printf("%d,", X[ch]);
593 }
594 printf("\n");*/
595
596 unsigned int j;
597 for (j = 0; j < (framebits + excess) % 2; ++j) {
598 int i;
599 for (i = 0; i < 64 / 2; i++) {
600 BFLY(i,
601 (((framebits + excess) >> 1) << 1) + j,
602 syms,
603 Y,
604 X,
605 (decision_t*)dec,
606 Branchtab);
607 }
608
609
610 renormalize(Y, 210);
611
612 /*printf("\n");
613 for(ch = 0; ch < 64; ch++) {
614 printf("%d,", Y[ch]);
615 }
616 printf("\n");*/
617 }
618 /*skip*/
619}
620
621#endif /*LV_HAVE_SSE3*/
622
623#if LV_HAVE_NEON
624
625#include "volk/sse2neon.h"
626
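/* NEON protokernel: the same code as the SSE3 version above, with the SSE
 * intrinsics mapped to NEON by the bundled sse2neon.h translation header. */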
627static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
628 unsigned char* X,
629 unsigned char* syms,
630 unsigned char* dec,
631 unsigned int framebits,
632 unsigned int excess,
633 unsigned char* Branchtab)
634{
635 unsigned int i9;
636 for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
637 unsigned char a75, a81;
638 int a73, a92;
639 short int s20, s21, s26, s27;
640 unsigned char *a74, *a80, *b6;
641 short int *a110, *a111, *a91, *a93, *a94;
642 __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
643 __m128i a105, a106, a86, a87;
644 __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
645 a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
646 s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
647 a71 = ((__m128i*)X);
648 s18 = *(a71);
649 a72 = (a71 + 2);
650 s19 = *(a72);
651 a73 = (4 * i9);
652 a74 = (syms + a73);
653 a75 = *(a74);
654 a76 = _mm_set1_epi8(a75);
655 a77 = ((__m128i*)Branchtab);
656 a78 = *(a77);
657 a79 = _mm_xor_si128(a76, a78);
658 b6 = (a73 + syms);
659 a80 = (b6 + 1);
660 a81 = *(a80);
661 a82 = _mm_set1_epi8(a81);
662 a83 = (a77 + 2);
663 a84 = *(a83);
664 a85 = _mm_xor_si128(a82, a84);
665 t13 = _mm_avg_epu8(a79, a85);
666 a86 = ((__m128i)t13);
667 a87 = _mm_srli_epi16(a86, 2);
668 a88 = ((__m128i)a87);
669 t14 = _mm_and_si128(
670 a88,
671 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
672 t15 = _mm_subs_epu8(
673 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
674 t14);
675 m23 = _mm_adds_epu8(s18, t14);
676 m24 = _mm_adds_epu8(s19, t15);
677 m25 = _mm_adds_epu8(s18, t15);
678 m26 = _mm_adds_epu8(s19, t14);
679 a89 = _mm_min_epu8(m24, m23);
680 d9 = _mm_cmpeq_epi8(a89, m24);
681 a90 = _mm_min_epu8(m26, m25);
682 d10 = _mm_cmpeq_epi8(a90, m26);
683 s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
684 a91 = ((short int*)dec);
685 a92 = (8 * i9);
686 a93 = (a91 + a92);
687 *(a93) = s20;
688 s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
689 a94 = (a93 + 1);
690 *(a94) = s21;
691 s22 = _mm_unpacklo_epi8(a89, a90);
692 s23 = _mm_unpackhi_epi8(a89, a90);
693 a95 = ((__m128i*)Y);
694 *(a95) = s22;
695 a96 = (a95 + 1);
696 *(a96) = s23;
697 a97 = (a71 + 1);
698 s24 = *(a97);
699 a98 = (a71 + 3);
700 s25 = *(a98);
701 a99 = (a77 + 1);
702 a100 = *(a99);
703 a101 = _mm_xor_si128(a76, a100);
704 a102 = (a77 + 3);
705 a103 = *(a102);
706 a104 = _mm_xor_si128(a82, a103);
707 t16 = _mm_avg_epu8(a101, a104);
708 a105 = ((__m128i)t16);
709 a106 = _mm_srli_epi16(a105, 2);
710 a107 = ((__m128i)a106);
711 t17 = _mm_and_si128(
712 a107,
713 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
714 t18 = _mm_subs_epu8(
715 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
716 t17);
717 m27 = _mm_adds_epu8(s24, t17);
718 m28 = _mm_adds_epu8(s25, t18);
719 m29 = _mm_adds_epu8(s24, t18);
720 m30 = _mm_adds_epu8(s25, t17);
721 a108 = _mm_min_epu8(m28, m27);
722 d11 = _mm_cmpeq_epi8(a108, m28);
723 a109 = _mm_min_epu8(m30, m29);
724 d12 = _mm_cmpeq_epi8(a109, m30);
725 s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
726 a110 = (a93 + 2);
727 *(a110) = s26;
728 s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
729 a111 = (a93 + 3);
730 *(a111) = s27;
731 s28 = _mm_unpacklo_epi8(a108, a109);
732 s29 = _mm_unpackhi_epi8(a108, a109);
733 a112 = (a95 + 2);
734 *(a112) = s28;
735 a113 = (a95 + 3);
736 *(a113) = s29;
737 if ((((unsigned char*)Y)[0] > 210)) {
738 __m128i m5, m6;
739 m5 = ((__m128i*)Y)[0];
740 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
741 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
742 m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
743 __m128i m7;
744 m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
745 m7 =
746 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
747 m7 =
748 ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
749 m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
750 m7 = _mm_unpacklo_epi8(m7, m7);
751 m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
752 m6 = _mm_unpacklo_epi64(m7, m7);
753 ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
754 ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
755 ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
756 ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
757 }
758 unsigned char a188, a194;
759 int a186, a205;
760 short int s48, s49, s54, s55;
761 unsigned char *a187, *a193, *b15;
762 short int *a204, *a206, *a207, *a223, *a224, *b16;
763 __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
764 *a225, *a226;
765 __m128i a199, a200, a218, a219;
766 __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
767 a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
768 m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
769 a184 = ((__m128i*)Y);
770 s46 = *(a184);
771 a185 = (a184 + 2);
772 s47 = *(a185);
773 a186 = (4 * i9);
774 b15 = (a186 + syms);
775 a187 = (b15 + 2);
776 a188 = *(a187);
777 a189 = _mm_set1_epi8(a188);
778 a190 = ((__m128i*)Branchtab);
779 a191 = *(a190);
780 a192 = _mm_xor_si128(a189, a191);
781 a193 = (b15 + 3);
782 a194 = *(a193);
783 a195 = _mm_set1_epi8(a194);
784 a196 = (a190 + 2);
785 a197 = *(a196);
786 a198 = _mm_xor_si128(a195, a197);
787 t25 = _mm_avg_epu8(a192, a198);
788 a199 = ((__m128i)t25);
789 a200 = _mm_srli_epi16(a199, 2);
790 a201 = ((__m128i)a200);
791 t26 = _mm_and_si128(
792 a201,
793 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
794 t27 = _mm_subs_epu8(
795 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
796 t26);
797 m39 = _mm_adds_epu8(s46, t26);
798 m40 = _mm_adds_epu8(s47, t27);
799 m41 = _mm_adds_epu8(s46, t27);
800 m42 = _mm_adds_epu8(s47, t26);
801 a202 = _mm_min_epu8(m40, m39);
802 d17 = _mm_cmpeq_epi8(a202, m40);
803 a203 = _mm_min_epu8(m42, m41);
804 d18 = _mm_cmpeq_epi8(a203, m42);
805 s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
806 a204 = ((short int*)dec);
807 a205 = (8 * i9);
808 b16 = (a204 + a205);
809 a206 = (b16 + 4);
810 *(a206) = s48;
811 s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
812 a207 = (b16 + 5);
813 *(a207) = s49;
814 s50 = _mm_unpacklo_epi8(a202, a203);
815 s51 = _mm_unpackhi_epi8(a202, a203);
816 a208 = ((__m128i*)X);
817 *(a208) = s50;
818 a209 = (a208 + 1);
819 *(a209) = s51;
820 a210 = (a184 + 1);
821 s52 = *(a210);
822 a211 = (a184 + 3);
823 s53 = *(a211);
824 a212 = (a190 + 1);
825 a213 = *(a212);
826 a214 = _mm_xor_si128(a189, a213);
827 a215 = (a190 + 3);
828 a216 = *(a215);
829 a217 = _mm_xor_si128(a195, a216);
830 t28 = _mm_avg_epu8(a214, a217);
831 a218 = ((__m128i)t28);
832 a219 = _mm_srli_epi16(a218, 2);
833 a220 = ((__m128i)a219);
834 t29 = _mm_and_si128(
835 a220,
836 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
837 t30 = _mm_subs_epu8(
838 _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
839 t29);
840 m43 = _mm_adds_epu8(s52, t29);
841 m44 = _mm_adds_epu8(s53, t30);
842 m45 = _mm_adds_epu8(s52, t30);
843 m46 = _mm_adds_epu8(s53, t29);
844 a221 = _mm_min_epu8(m44, m43);
845 d19 = _mm_cmpeq_epi8(a221, m44);
846 a222 = _mm_min_epu8(m46, m45);
847 d20 = _mm_cmpeq_epi8(a222, m46);
848 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
849 a223 = (b16 + 6);
850 *(a223) = s54;
851 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
852 a224 = (b16 + 7);
853 *(a224) = s55;
854 s56 = _mm_unpacklo_epi8(a221, a222);
855 s57 = _mm_unpackhi_epi8(a221, a222);
856 a225 = (a208 + 2);
857 *(a225) = s56;
858 a226 = (a208 + 3);
859 *(a226) = s57;
860 if ((((unsigned char*)X)[0] > 210)) {
861 __m128i m12, m13;
862 m12 = ((__m128i*)X)[0];
863 m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
864 m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
865 m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
866 __m128i m14;
867 m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
868 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
869 ((__m128i)m14)));
870 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
871 ((__m128i)m14)));
872 m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
873 ((__m128i)m14)));
874 m14 = _mm_unpacklo_epi8(m14, m14);
875 m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
876 m13 = _mm_unpacklo_epi64(m14, m14);
877 ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
878 ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
879 ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
880 ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
881 }
882 }
883
884 renormalize(X, 210);
885
886 /*int ch;
887 for(ch = 0; ch < 64; ch++) {
888 printf("%d,", X[ch]);
889 }
890 printf("\n");*/
891
892 unsigned int j;
893 for (j = 0; j < (framebits + excess) % 2; ++j) {
894 int i;
895 for (i = 0; i < 64 / 2; i++) {
896 BFLY(i,
897 (((framebits + excess) >> 1) << 1) + j,
898 syms,
899 Y,
900 X,
901 (decision_t*)dec,
902 Branchtab);
903 }
904
905
906 renormalize(Y, 210);
907
908 /*printf("\n");
909 for(ch = 0; ch < 64; ch++) {
910 printf("%d,", Y[ch]);
911 }
912 printf("\n");*/
913 }
914 /*skip*/
915}
916
917#endif /*LV_HAVE_NEON*/
918
919#if LV_HAVE_GENERIC
920
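/* Portable reference kernel: for each of the framebits + excess decoded bits,
 * run all 32 butterflies, renormalize the freshly written metrics, then swap
 * the X and Y metric buffers so the next step reads what this one wrote. */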
921static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
922 unsigned char* X,
923 unsigned char* syms,
924 unsigned char* dec,
925 unsigned int framebits,
926 unsigned int excess,
927 unsigned char* Branchtab)
928{
929 int nbits = framebits + excess;
930 int NUMSTATES = 64;
931 int RENORMALIZE_THRESHOLD = 210;
932
933 int s, i;
934 for (s = 0; s < nbits; s++) {
935 void* tmp;
936 for (i = 0; i < NUMSTATES / 2; i++) {
937 BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
938 }
939
940 renormalize(Y, RENORMALIZE_THRESHOLD);
941
943 tmp = (void*)X;
944 X = Y;
945 Y = (unsigned char*)tmp;
946 }
947}
948
949#endif /* LV_HAVE_GENERIC */
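/*
 * Minimal usage sketch (illustrative only; buffer sizes follow from the
 * constants used above, and the names are placeholders).  The SIMD variants
 * load the metric arrays as 128- or 256-bit vectors, so in practice those
 * buffers should be allocated 16-byte aligned (e.g. with volk_malloc).
 *
 *   unsigned char metrics_a[64];                   // NUMSTATES path metrics
 *   unsigned char metrics_b[64];                   // scratch metrics
 *   unsigned char branchtab[64];                   // RATE * NUMSTATES/2 entries
 *   unsigned char* syms = ...;                     // RATE * (framebits + excess) bytes
 *   unsigned char* dec  = ...;                     // 8 bytes (one decision_t) per bit
 *   volk_8u_x4_conv_k7_r2_8u_generic(metrics_b, metrics_a, syms, dec,
 *                                    framebits, excess, branchtab);
 */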
950
951#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/