73#ifndef SSE2NEON_PRECISE_MINMAX
74#define SSE2NEON_PRECISE_MINMAX (0)
77#ifndef SSE2NEON_PRECISE_DIV
78#define SSE2NEON_PRECISE_DIV (0)
81#ifndef SSE2NEON_PRECISE_SQRT
82#define SSE2NEON_PRECISE_SQRT (0)
85#ifndef SSE2NEON_PRECISE_DP
86#define SSE2NEON_PRECISE_DP (0)
90#if defined(__GNUC__) || defined(__clang__)
91#pragma push_macro("FORCE_INLINE")
92#pragma push_macro("ALIGN_STRUCT")
93#define FORCE_INLINE static inline __attribute__((always_inline))
94#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
95#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
96#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
98#warning "Macro name collisions may happen with an unsupported compiler."
100#define FORCE_INLINE static inline
103#define ALIGN_STRUCT(x) __declspec(align(x))
105#define _sse2neon_likely(x) (x)
106#define _sse2neon_unlikely(x) (x)
111#define _sse2neon_const static const
113#define _sse2neon_const const
122#if defined(__arm__) && __ARM_ARCH == 7
127#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
128#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
130#if !defined(__clang__)
131#pragma GCC push_options
132#pragma GCC target("fpu=neon")
134#elif defined(__aarch64__)
135#if !defined(__clang__)
136#pragma GCC push_options
137#pragma GCC target("+simd")
140#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
142 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
144#if !defined(__clang__)
145#pragma GCC push_options
148#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
153#if !defined(__aarch64__) && (__ARM_ARCH == 8)
154#if defined __has_include && __has_include(<arm_acle.h>)
160#if !defined(__aarch64__)
169#if !defined(__aarch64__)
178#if defined(__GNUC__) && (__GNUC__ <= 9)
179#define __has_builtin(x) HAS##x
180#define HAS__builtin_popcount 1
181#define HAS__builtin_popcountll 1
183#define __has_builtin(x) 0
195#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
196 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
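/* Usage sketch (added for illustration, not part of the original header):
 * _MM_SHUFFLE packs four 2-bit lane selectors into one immediate, highest
 * selector first. For example:
 *     int imm = _MM_SHUFFLE(3, 2, 1, 1);  // (3<<6)|(2<<4)|(1<<2)|1 == 0xE5
 * which the shuffle intrinsics read as "lane 1 to position 0, lane 1 to
 * position 1, lane 2 to position 2, lane 3 to position 3". */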
199#define _MM_FROUND_TO_NEAREST_INT 0x00
200#define _MM_FROUND_TO_NEG_INF 0x01
201#define _MM_FROUND_TO_POS_INF 0x02
202#define _MM_FROUND_TO_ZERO 0x03
203#define _MM_FROUND_CUR_DIRECTION 0x04
204#define _MM_FROUND_NO_EXC 0x08
205#define _MM_FROUND_RAISE_EXC 0x00
206#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
207#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
208#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
209#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
210#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
211#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
212#define _MM_ROUND_NEAREST 0x0000
213#define _MM_ROUND_DOWN 0x2000
214#define _MM_ROUND_UP 0x4000
215#define _MM_ROUND_TOWARD_ZERO 0x6000
217#define _MM_FLUSH_ZERO_MASK 0x8000
218#define _MM_FLUSH_ZERO_ON 0x8000
219#define _MM_FLUSH_ZERO_OFF 0x0000
221#define _MM_DENORMALS_ZERO_MASK 0x0040
222#define _MM_DENORMALS_ZERO_ON 0x0040
223#define _MM_DENORMALS_ZERO_OFF 0x0000
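/* Usage sketch (illustrative, assuming the _MM_SET_ROUNDING_MODE and
 * _MM_SET_FLUSH_ZERO_MODE helpers provided later in this header):
 *     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 * On Arm these constants are translated to FPCR/FPSCR control bits rather
 * than to an x86 MXCSR register. */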
226#define __constrange(a, b) const
239#if defined(__aarch64__)
248#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
249#if (defined(__x86_64__) || defined(__i386__))
250#define __int64 long long
252#define __int64 int64_t
258#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
259#define vreinterpretq_m128_f32(x) (x)
260#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
262#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
263#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
264#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
265#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
267#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
268#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
269#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
270#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
272#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
273#define vreinterpretq_f32_m128(x) (x)
274#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
276#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
277#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
278#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
279#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
281#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
282#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
283#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
284#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
286#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
287#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
288#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
289#define vreinterpretq_m128i_s64(x) (x)
291#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
292#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
293#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
294#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
296#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
297#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
299#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
300#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
301#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
302#define vreinterpretq_s64_m128i(x) (x)
304#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
305#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
306#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
307#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
309#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
310#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
311#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
312#define vreinterpret_m64_s64(x) (x)
314#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
315#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
316#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
317#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
319#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
320#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
321#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
323#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
324#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
325#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
326#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
328#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
329#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
330#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
331#define vreinterpret_s64_m64(x) (x)
333#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
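/* Illustrative note (not from the original header): the vreinterpret*_m64_*
 * and vreinterpretq*_m128*_* helpers above are zero-cost bit-pattern casts
 * between the emulated MMX/SSE types and concrete NEON vector types, e.g.
 *     __m64 m = vreinterpret_m64_u8(vdup_n_u8(0xFF));  // all bytes set
 *     uint8x8_t v = vreinterpret_u8_m64(m);            // back to a NEON type
 * No instructions are emitted; only the static type changes. */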
335#if defined(__aarch64__)
336#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
337#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
339#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
341#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
342#define vreinterpretq_m128d_f64(x) (x)
344#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
346#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
347#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
349#define vreinterpretq_f64_m128d(x) (x)
350#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
352#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
353#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
355#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
356#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
358#define vreinterpretq_m128d_f32(x) (x)
360#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
362#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
363#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
365#define vreinterpretq_f32_m128d(x) (x)
398 uint16_t m128_u16[8];
399 uint32_t m128_u32[4];
400 uint64_t m128_u64[2];
404#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
405#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
406#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
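/* Usage sketch (illustrative): the SIMDVec accessors read one element of an
 * __m128i through a union-style overlay, as used by scalar fallback paths:
 *     __m128i v = _mm_set1_epi32(7);
 *     uint32_t x = vreinterpretq_nth_u32_m128i(v, 2);  // x == 7
 * These are sse2neon helpers, not SSE intrinsics. */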
409#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
410#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
411#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
412#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
446#if defined(__GNUC__) && !defined(__clang__) && \
447 ((__GNUC__ <= 12 && defined(__arm__)) || \
448 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
449 (__GNUC__ <= 9 && defined(__aarch64__)))
453 ret.val[0] = vld1q_u8(p + 0);
454 ret.val[1] = vld1q_u8(p + 16);
455 ret.val[2] = vld1q_u8(p + 32);
456 ret.val[3] = vld1q_u8(p + 48);
463 return vld1q_u8_x4(p);
564#if defined(__aarch64__)
590 float32x2_t a21 = vget_high_f32(
592 float32x2_t b03 = vget_low_f32(
599 float32x2_t a03 = vget_low_f32(
601 float32x2_t b21 = vget_high_f32(
664 float32x2_t a02 = vset_lane_f32(a0, a22, 1);
682 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
689 float32_t b2 = vgetq_lane_f32(b, 2);
691 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
698 float32_t b2 = vgetq_lane_f32(b, 2);
700 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
714#if defined(__ARM_FEATURE_CRYPTO) && \
715 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
719 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
720 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
721 return vreinterpretq_u64_p128(vmull_p64(a, b));
739 poly8x8_t a = vreinterpret_p8_u64(_a);
740 poly8x8_t b = vreinterpret_p8_u64(_b);
743 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
744 vcreate_u8(0x00000000ffffffff));
745 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
746 vcreate_u8(0x0000000000000000));
749 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));
751 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));
753 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));
755 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));
757 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));
759 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));
761 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));
763 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));
766 uint8x16_t l = veorq_u8(e, f);
767 uint8x16_t m = veorq_u8(g, h);
768 uint8x16_t n = veorq_u8(i, j);
772#if defined(__aarch64__)
773 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
774 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
775 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
776 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
777 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
778 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
779 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
780 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
782 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
783 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
784 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
785 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
789 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
790 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
791 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
795 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
796 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
797 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
800#if defined(__aarch64__)
801 uint8x16_t t0 = vreinterpretq_u8_u64(
802 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
803 uint8x16_t t1 = vreinterpretq_u8_u64(
804 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
805 uint8x16_t t2 = vreinterpretq_u8_u64(
806 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
807 uint8x16_t t3 = vreinterpretq_u8_u64(
808 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
810 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
811 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
812 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
813 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
816 uint8x16_t t0_shift = vextq_u8(t0, t0, 15);
817 uint8x16_t t1_shift = vextq_u8(t1, t1, 14);
818 uint8x16_t t2_shift = vextq_u8(t2, t2, 13);
819 uint8x16_t t3_shift = vextq_u8(t3, t3, 12);
822 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
823 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
824 uint8x16_t mix = veorq_u8(d, cross1);
825 uint8x16_t r = veorq_u8(mix, cross2);
826 return vreinterpretq_u64_u8(r);
838#define _mm_shuffle_epi32_default(a, imm) \
842 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
843 ret = vsetq_lane_s32( \
844 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
846 ret = vsetq_lane_s32( \
847 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
849 ret = vsetq_lane_s32( \
850 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
852 vreinterpretq_m128i_s32(ret); \
939#if defined(__aarch64__)
940#define _mm_shuffle_epi32_splat(a, imm) \
942 vreinterpretq_m128i_s32( \
943 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
946#define _mm_shuffle_epi32_splat(a, imm) \
948 vreinterpretq_m128i_s32( \
949 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
967#define _mm_shuffle_ps_default(a, b, imm) \
971 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
972 ret = vsetq_lane_f32( \
973 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
975 ret = vsetq_lane_f32( \
976 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
978 ret = vsetq_lane_f32( \
979 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
981 vreinterpretq_m128_f32(ret); \
990#define _mm_shufflelo_epi16_function(a, imm) \
992 int16x8_t ret = vreinterpretq_s16_m128i(a); \
993 int16x4_t lowBits = vget_low_s16(ret); \
994 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
995 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
997 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
999 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1001 vreinterpretq_m128i_s16(ret); \
1010#define _mm_shufflehi_epi16_function(a, imm) \
1012 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1013 int16x4_t highBits = vget_high_s16(ret); \
1014 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1015 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1017 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1019 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1021 vreinterpretq_m128i_s16(ret); \
1050 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1329 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1339 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1349 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1359 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1371 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1411#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1439#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1443 float32_t data = vgetq_lane_f32(
1445 return (int32_t) data;
1559#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1593 int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1594 static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
1595 int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1641#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1673#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1683#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1686 float32_t data = vgetq_lane_f32(
1688 return (int64_t) data;
1727#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1735#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1758#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1764#if SSE2NEON_PRECISE_DIV
1785#define _mm_extract_pi16(a, imm) \
1786 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1803#if defined(__aarch64__)
1810#if defined(__aarch64__)
1811 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
1813 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
1827#if defined(__aarch64__)
1834#if defined(__aarch64__)
1835 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
1837 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
1840 if (r.field.bit22) {
1850#define _mm_insert_pi16(a, b, imm) \
1852 vreinterpret_m64_s16( \
1853 vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1872#define _mm_load_ps1 _mm_load1_ps
1903 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1920 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1935 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1957 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1969 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1979 return malloc(size);
1980 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1981 align = sizeof(void *);
1982 if (!posix_memalign(&ptr, align, size))
1998 vst1_s8((int8_t *) mem_addr, masked);
2005#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
2027#if SSE2NEON_PRECISE_MINMAX
2057 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2082#if SSE2NEON_PRECISE_MINMAX
2112 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2158#if defined(__aarch64__)
2159 static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2160 uint8x8_t tmp = vshr_n_u8(input, 7);
2161 return vaddv_u8(vshl_u8(tmp, shift));
2164 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2165 uint32x2_t paired16 =
2166 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2167 uint8x8_t paired32 =
2168 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2169 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2180#if defined(__aarch64__)
2181 static const int32x4_t shift = {0, 1, 2, 3};
2182 uint32x4_t tmp = vshrq_n_u32(input, 31);
2183 return vaddvq_u32(vshlq_u32(tmp, shift));
2188 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2191 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2193 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
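/* Illustrative note (not part of the original header): on either path,
 * _mm_movemask_ps gathers the sign bit of each packed float into bits 0..3:
 *     int m = _mm_movemask_ps(_mm_set_ps(-1.0f, 2.0f, -3.0f, 4.0f));  // 0xA
 */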
2252#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2263#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2268#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2273#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2278#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2283#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2288#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2293#define _m_pminub(a, b) _mm_min_pu8(a, b)
2298#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2304#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2311 __builtin_prefetch(p);
2319#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2324#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2334#if SSE2NEON_PRECISE_DIV
2362#if SSE2NEON_PRECISE_SQRT
2379 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2389 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2392 vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2405#if defined(__aarch64__)
2412#if defined(__aarch64__)
2413 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
2415 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
2420#if defined(__aarch64__)
2421 __asm__ __volatile__("msr FPCR, %0" :: "r"(r));
2423 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
2451#if defined(__aarch64__)
2458#if defined(__aarch64__)
2459 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
2461 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
2482#if defined(__aarch64__)
2483 __asm__ __volatile__("msr FPCR, %0" :: "r"(r));
2485 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
2539#if __has_builtin(__builtin_shufflevector)
2540#define _mm_shuffle_pi16(a, imm) \
2542 vreinterpret_m64_s16(__builtin_shufflevector( \
2543 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2544 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2547#define _mm_shuffle_pi16(a, imm) \
2551 vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2552 ret = vset_lane_s16( \
2553 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2555 ret = vset_lane_s16( \
2556 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2558 ret = vset_lane_s16( \
2559 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2561 vreinterpret_m64_s16(ret); \
2570 __sync_synchronize();
2575#if __has_builtin(__builtin_shufflevector)
2576#define _mm_shuffle_ps(a, b, imm) \
2578 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2579 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2580 float32x4_t _shuf = __builtin_shufflevector( \
2581 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2582 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2583 vreinterpretq_m128_f32(_shuf); \
2586#define _mm_shuffle_ps(a, b, imm) \
2590 case _MM_SHUFFLE(1, 0, 3, 2): \
2591 ret = _mm_shuffle_ps_1032((a), (b)); \
2593 case _MM_SHUFFLE(2, 3, 0, 1): \
2594 ret = _mm_shuffle_ps_2301((a), (b)); \
2596 case _MM_SHUFFLE(0, 3, 2, 1): \
2597 ret = _mm_shuffle_ps_0321((a), (b)); \
2599 case _MM_SHUFFLE(2, 1, 0, 3): \
2600 ret = _mm_shuffle_ps_2103((a), (b)); \
2602 case _MM_SHUFFLE(1, 0, 1, 0): \
2603 ret = _mm_movelh_ps((a), (b)); \
2605 case _MM_SHUFFLE(1, 0, 0, 1): \
2606 ret = _mm_shuffle_ps_1001((a), (b)); \
2608 case _MM_SHUFFLE(0, 1, 0, 1): \
2609 ret = _mm_shuffle_ps_0101((a), (b)); \
2611 case _MM_SHUFFLE(3, 2, 1, 0): \
2612 ret = _mm_shuffle_ps_3210((a), (b)); \
2614 case _MM_SHUFFLE(0, 0, 1, 1): \
2615 ret = _mm_shuffle_ps_0011((a), (b)); \
2617 case _MM_SHUFFLE(0, 0, 2, 2): \
2618 ret = _mm_shuffle_ps_0022((a), (b)); \
2620 case _MM_SHUFFLE(2, 2, 0, 0): \
2621 ret = _mm_shuffle_ps_2200((a), (b)); \
2623 case _MM_SHUFFLE(3, 2, 0, 2): \
2624 ret = _mm_shuffle_ps_3202((a), (b)); \
2626 case _MM_SHUFFLE(3, 2, 3, 2): \
2627 ret = _mm_movehl_ps((b), (a)); \
2629 case _MM_SHUFFLE(1, 1, 3, 3): \
2630 ret = _mm_shuffle_ps_1133((a), (b)); \
2632 case _MM_SHUFFLE(2, 0, 1, 0): \
2633 ret = _mm_shuffle_ps_2010((a), (b)); \
2635 case _MM_SHUFFLE(2, 0, 0, 1): \
2636 ret = _mm_shuffle_ps_2001((a), (b)); \
2638 case _MM_SHUFFLE(2, 0, 3, 2): \
2639 ret = _mm_shuffle_ps_2032((a), (b)); \
2642 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2661#if SSE2NEON_PRECISE_SQRT
2666 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2667 const uint32x4_t div_by_zero =
2668 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2669 recip = vreinterpretq_f32_u32(
2670 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2682#elif defined(__aarch64__)
2686 float32x4_t sq = vrecpeq_f32(recipsq);
2722 vst1q_f32(p, vdupq_n_f32(a0));
2742#define _mm_store1_ps _mm_store_ps1
2781 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2819#if __has_builtin(__builtin_nontemporal_store)
2820 __builtin_nontemporal_store(a, (float32x4_t *) p);
2858#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2860 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2861 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2862 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2863 vget_low_f32(ROW23.val[0])); \
2864 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2865 vget_low_f32(ROW23.val[1])); \
2866 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2867 vget_high_f32(ROW23.val[0])); \
2868 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2869 vget_high_f32(ROW23.val[1])); \
2874#define _mm_ucomieq_ss _mm_comieq_ss
2875#define _mm_ucomige_ss _mm_comige_ss
2876#define _mm_ucomigt_ss _mm_comigt_ss
2877#define _mm_ucomile_ss _mm_comile_ss
2878#define _mm_ucomilt_ss _mm_comilt_ss
2879#define _mm_ucomineq_ss _mm_comineq_ss
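/* Usage sketch for the _MM_TRANSPOSE4_PS macro above (illustrative only):
 *     __m128 r0 = _mm_set_ps(3, 2, 1, 0);
 *     __m128 r1 = _mm_set_ps(7, 6, 5, 4);
 *     __m128 r2 = _mm_set_ps(11, 10, 9, 8);
 *     __m128 r3 = _mm_set_ps(15, 14, 13, 12);
 *     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0 lanes 0..3 are now 0, 4, 8, 12
 */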
2885#if defined(__GNUC__) || defined(__clang__)
2886#pragma GCC diagnostic push
2887#pragma GCC diagnostic ignored "-Wuninitialized"
2891#if defined(__GNUC__) || defined(__clang__)
2892#pragma GCC diagnostic pop
2900#if defined(__GNUC__) || defined(__clang__)
2901#pragma GCC diagnostic push
2902#pragma GCC diagnostic ignored "-Wuninitialized"
2906#if defined(__GNUC__) || defined(__clang__)
2907#pragma GCC diagnostic pop
2922#if defined(__aarch64__)
2928 float32x2x2_t result = vzip_f32(a1, b1);
2944#if defined(__aarch64__)
2950 float32x2x2_t result = vzip_f32(a1, b1);
3013#if defined(__aarch64__)
3014 return vreinterpretq_m128d_f64(
3015 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3017 double *da = (double *) &a;
3018 double *db = (double *) &b;
3020 c[0] = da[0] + db[0];
3021 c[1] = da[1] + db[1];
3022 return vld1q_f32((float32_t *) c);
3036#if defined(__aarch64__)
3039 double *da = (double *) &a;
3040 double *db = (double *) &b;
3042 c[0] = da[0] + db[0];
3044 return vld1q_f32((float32_t *) c);
3196#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3201#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
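/* Illustrative note (not from the original header): the _mm_bslli_si128 and
 * _mm_bsrli_si128 aliases shift the whole 128-bit value by bytes, e.g.
 *     __m128i r = _mm_bsrli_si128(v, 4);  // shift right 4 bytes, zero fill
 * The byte count must be a compile-time constant. */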
3240#if defined(__aarch64__)
3295#if defined(__aarch64__)
3297 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3302 uint32x4_t swapped = vrev64q_u32(cmp);
3321#if defined(__aarch64__)
3323 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3330 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3331 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3343#if defined(__aarch64__)
3351 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3402#if defined(__aarch64__)
3404 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3411 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3412 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3424#if defined(__aarch64__)
3432 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3444#if defined(__aarch64__)
3446 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3453 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3454 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3466#if defined(__aarch64__)
3474 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3520#if defined(__aarch64__)
3522 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3529 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3530 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3542#if defined(__aarch64__)
3549 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3561#if defined(__aarch64__)
3563 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3568 uint32x4_t swapped = vrev64q_u32(cmp);
3587#if defined(__aarch64__)
3589 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3590 vdupq_n_u64(UINT64_MAX)));
3598 !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3600 !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3620#if defined(__aarch64__)
3622 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3623 vdupq_n_u64(UINT64_MAX)));
3631 !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3633 !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3653#if defined(__aarch64__)
3655 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3656 vdupq_n_u64(UINT64_MAX)));
3664 !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3666 !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3686#if defined(__aarch64__)
3688 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3689 vdupq_n_u64(UINT64_MAX)));
3697 !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3699 !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3719#if defined(__aarch64__)
3721 uint64x2_t not_nan_a =
3722 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3723 uint64x2_t not_nan_b =
3724 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3732 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3733 (*(double *) &b0) == (*(double *) &b0))
3736 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3737 (*(double *) &b1) == (*(double *) &b1))
3751#if defined(__aarch64__)
3758 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3759 (*(double *) &b0) == (*(double *) &b0))
3773#if defined(__aarch64__)
3775 uint64x2_t not_nan_a =
3776 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3777 uint64x2_t not_nan_b =
3778 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3780 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3787 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3788 (*(double *) &b0) == (*(double *) &b0))
3791 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3792 (*(double *) &b1) == (*(double *) &b1))
3806#if defined(__aarch64__)
3813 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3814 (*(double *) &b0) == (*(double *) &b0))
3828#if defined(__aarch64__)
3829 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3834 return (*(double *) &a0 >= *(double *) &b0);
3843#if defined(__aarch64__)
3844 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3849 return (*(double *) &a0 > *(double *) &b0);
3858#if defined(__aarch64__)
3859 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3864 return (*(double *) &a0 <= *(double *) &b0);
3873#if defined(__aarch64__)
3874 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3879 return (*(double *) &a0 < *(double *) &b0);
3888#if defined(__aarch64__)
3889 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3891 uint32x4_t a_not_nan =
3893 uint32x4_t b_not_nan =
3895 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3898 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3899 vreinterpretq_u64_u32(a_eq_b));
3900 return vgetq_lane_u64(and_results, 0) & 0x1;
3924#if defined(__aarch64__)
3925 return vreinterpretq_m128d_f64(
3955 double d0 = ((double *) &rnd)[0];
3956 double d1 = ((double *) &rnd)[1];
3973 double d0 = ((double *) &rnd)[0];
3974 double d1 = ((double *) &rnd)[1];
3975 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3993#if defined(__aarch64__)
3994 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3997 float a0 = (float) ((double *) &a)[0];
3998 float a1 = (float) ((double *) &a)[1];
4015#if defined(__aarch64__)
4016 return vreinterpretq_m128d_f64(
4038#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
4050 float *f = (float *) &a;
4053 uint32x4_t signmask = vdupq_n_u32(0x80000000);
4056 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4058 int32x4_t r_trunc = vcvtq_s32_f32(
4060 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4061 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
4062 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4064 float32x4_t delta = vsubq_f32(
4066 vcvtq_f32_s32(r_trunc));
4067 uint32x4_t is_delta_half =
4068 vceqq_f32(delta, half);
4070 vbslq_s32(is_delta_half, r_even, r_normal));
4073 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4079 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4098#if defined(__aarch64__)
4099 return vreinterpretq_m128d_f64(
4115#if defined(__aarch64__)
4116 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4118 return ((double *) &a)[0];
4130#if defined(__aarch64__)
4131 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4134 double ret = ((double *) &rnd)[0];
4135 return (int32_t) ret;
4147#if defined(__aarch64__)
4148 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4151 double ret = ((double *) &rnd)[0];
4152 return (int64_t) ret;
4162#define _mm_cvtsd_si64x _mm_cvtsd_si64
4171#if defined(__aarch64__)
4173 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4203#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4211#if defined(__aarch64__)
4212 return vreinterpretq_m128d_f64(
4213 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4215 double bf = (double) b;
4226#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4248#if defined(__aarch64__)
4249 return vreinterpretq_m128d_f64(
4250 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4252 double bf = (double) b;
4271#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4277#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4291#if defined(__aarch64__)
4292 return vreinterpretq_m128d_f64(
4293 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4305 double a0 = ((double *) &a)[0];
4306 double a1 = ((double *) &a)[1];
4315 double a0 = ((double *) &a)[0];
4316 double a1 = ((double *) &a)[1];
4317 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4337 double ret = *((double *) &a);
4338 return (int32_t) ret;
4349#if defined(__aarch64__)
4350 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4352 double ret = *((double *) &a);
4353 return (int64_t) ret;
4363#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4376#if defined(__aarch64__)
4377 return vreinterpretq_m128d_f64(
4378 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4380 double *da = (double *) &a;
4381 double *db = (double *) &b;
4383 c[0] = da[0] / db[0];
4384 c[1] = da[1] / db[1];
4385 return vld1q_f32((float32_t *) c);
4396#if defined(__aarch64__)
4398 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4399 return vreinterpretq_m128d_f64(
4400 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4410#define _mm_extract_epi16(a, imm) \
4411 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
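/* Usage sketch (illustrative): imm must be a compile-time constant in 0..7,
 * and the selected 16-bit lane is zero-extended into the int result:
 *     int x = _mm_extract_epi16(_mm_set1_epi16(-1), 3);  // x == 0xFFFF
 */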
4418#define _mm_insert_epi16(a, b, imm) \
4420 vreinterpretq_m128i_s16( \
4421 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4432#if defined(__aarch64__)
4433 return vreinterpretq_m128d_f64(vld1q_f64(p));
4435 const float *fp = (const float *) p;
4436 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4448#define _mm_load_pd1 _mm_load1_pd
4460#if defined(__aarch64__)
4461 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4463 const float *fp = (const float *) p;
4485#if defined(__aarch64__)
4486 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4502#if defined(__aarch64__)
4503 return vreinterpretq_m128d_f64(
4504 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4519 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4532#if defined(__aarch64__)
4533 return vreinterpretq_m128d_f64(
4534 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4537 vcombine_f32(vld1_f32((const float *) p),
4552#if defined(__aarch64__)
4553 float64x2_t v = vld1q_f64(p);
4554 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4556 int64x2_t v = vld1q_s64((const int64_t *) p);
4584 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4602 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4603 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4620 vst1q_s8((int8_t *) mem_addr, masked);
4646#if defined(__aarch64__)
4647#if SSE2NEON_PRECISE_MINMAX
4648 float64x2_t _a = vreinterpretq_f64_m128d(a);
4649 float64x2_t _b = vreinterpretq_f64_m128d(b);
4650 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4652 return vreinterpretq_m128d_f64(
4653 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4661 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4662 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4674#if defined(__aarch64__)
4677 double *da = (double *) &a;
4678 double *db = (double *) &b;
4679 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4707#if defined(__aarch64__)
4708#if SSE2NEON_PRECISE_MINMAX
4709 float64x2_t _a = vreinterpretq_f64_m128d(a);
4710 float64x2_t _b = vreinterpretq_f64_m128d(b);
4711 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4713 return vreinterpretq_m128d_f64(
4714 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4722 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4723 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4734#if defined(__aarch64__)
4737 double *da = (double *) &a;
4738 double *db = (double *) &b;
4739 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4802 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4817 uint32x4_t paired16 =
4818 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4831 uint64x2_t paired32 =
4832 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4845 uint8x16_t paired64 =
4846 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4853 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4862 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4863 return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
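/* Illustrative note (not part of the original header): _mm_movemask_pd packs
 * the sign bits of the two doubles into bits 0..1 of the result:
 *     int m = _mm_movemask_pd(_mm_set_pd(-0.0, 1.0));  // m == 0x2
 */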
4907#if defined(__aarch64__)
4908 return vreinterpretq_m128d_f64(
4909 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4911 double *da = (double *) &a;
4912 double *db = (double *) &b;
4914 c[0] = da[0] * db[0];
4915 c[1] = da[1] * db[1];
4916 return vld1q_f32((float32_t *) c);
4958 int32x4_t ab3210 = vmull_s16(a3210, b3210);
4961 int32x4_t ab7654 = vmull_s16(a7654, b7654);
4963 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4975 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4976#if defined(__aarch64__)
4979 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4980 vreinterpretq_u16_u32(ab7654));
4985 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4987 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
5084 __asm__ __volatile__("isb\n");
5094 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
5109 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
5135 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5158 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5159 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5160 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5161 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5162 return (__m128i) vld1q_s8(data);
5171#if defined(__aarch64__)
5172 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5181#define _mm_set_pd1 _mm_set1_pd
5249#if defined(__aarch64__)
5250 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5272 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5311 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5312 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5313 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5314 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5315 return (__m128i) vld1q_s8(data);
5330#if defined(__aarch64__)
5331 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5348#if __has_builtin(__builtin_shufflevector)
5349#define _mm_shuffle_epi32(a, imm) \
5351 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5352 int32x4_t _shuf = __builtin_shufflevector( \
5353 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5354 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5355 vreinterpretq_m128i_s32(_shuf); \
5358#define _mm_shuffle_epi32(a, imm) \
5362 case _MM_SHUFFLE(1, 0, 3, 2): \
5363 ret = _mm_shuffle_epi_1032((a)); \
5365 case _MM_SHUFFLE(2, 3, 0, 1): \
5366 ret = _mm_shuffle_epi_2301((a)); \
5368 case _MM_SHUFFLE(0, 3, 2, 1): \
5369 ret = _mm_shuffle_epi_0321((a)); \
5371 case _MM_SHUFFLE(2, 1, 0, 3): \
5372 ret = _mm_shuffle_epi_2103((a)); \
5374 case _MM_SHUFFLE(1, 0, 1, 0): \
5375 ret = _mm_shuffle_epi_1010((a)); \
5377 case _MM_SHUFFLE(1, 0, 0, 1): \
5378 ret = _mm_shuffle_epi_1001((a)); \
5380 case _MM_SHUFFLE(0, 1, 0, 1): \
5381 ret = _mm_shuffle_epi_0101((a)); \
5383 case _MM_SHUFFLE(2, 2, 1, 1): \
5384 ret = _mm_shuffle_epi_2211((a)); \
5386 case _MM_SHUFFLE(0, 1, 2, 2): \
5387 ret = _mm_shuffle_epi_0122((a)); \
5389 case _MM_SHUFFLE(3, 3, 3, 2): \
5390 ret = _mm_shuffle_epi_3332((a)); \
5392 case _MM_SHUFFLE(0, 0, 0, 0): \
5393 ret = _mm_shuffle_epi32_splat((a), 0); \
5395 case _MM_SHUFFLE(1, 1, 1, 1): \
5396 ret = _mm_shuffle_epi32_splat((a), 1); \
5398 case _MM_SHUFFLE(2, 2, 2, 2): \
5399 ret = _mm_shuffle_epi32_splat((a), 2); \
5401 case _MM_SHUFFLE(3, 3, 3, 3): \
5402 ret = _mm_shuffle_epi32_splat((a), 3); \
5405 ret = _mm_shuffle_epi32_default((a), (imm)); \
5419#if __has_builtin(__builtin_shufflevector)
5420#define _mm_shuffle_pd(a, b, imm8) \
5421 vreinterpretq_m128d_s64(__builtin_shufflevector( \
5422 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5423 ((imm8 & 0x2) >> 1) + 2))
5425#define _mm_shuffle_pd(a, b, imm8) \
5426 _mm_castsi128_pd(_mm_set_epi64x( \
5427 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5428 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
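/* Usage sketch (illustrative): for either definition above, bit 0 of imm8
 * selects which double of a becomes lane 0 and bit 1 selects which double of
 * b becomes lane 1, e.g. _mm_shuffle_pd(a, b, 0x1) yields { a[1], b[0] }. */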
5433#if __has_builtin(__builtin_shufflevector)
5434#define _mm_shufflehi_epi16(a, imm) \
5436 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5437 int16x8_t _shuf = __builtin_shufflevector( \
5438 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5439 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5440 (((imm) >> 6) & 0x3) + 4); \
5441 vreinterpretq_m128i_s16(_shuf); \
5444#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5449#if __has_builtin(__builtin_shufflevector)
5450#define _mm_shufflelo_epi16(a, imm) \
5452 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5453 int16x8_t _shuf = __builtin_shufflevector( \
5454 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5455 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5456 vreinterpretq_m128i_s16(_shuf); \
5459#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5481 int16x8_t vc = vdupq_n_s16((int16_t) c);
5504 int32x4_t vc = vdupq_n_s32((int32_t) c);
5527 int64x2_t vc = vdupq_n_s64((int64_t) c);
5610 vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
5618#if defined(__aarch64__)
5619 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5621 double a0 = sqrt(((double *) &a)[0]);
5622 double a1 = sqrt(((double *) &a)[1]);
5633#if defined(__aarch64__)
5636 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5655 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5676 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5697 const int count = (imm & ~15) ? 15 : imm;
5698 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5715#define _mm_srai_epi32(a, imm) \
5718 if (_sse2neon_unlikely((imm) == 0)) { \
5720 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5721 ret = vreinterpretq_m128i_s32( \
5722 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
5724 ret = vreinterpretq_m128i_s32( \
5725 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5749 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5772 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5795 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5812#define _mm_srli_epi16(a, imm) \
5815 if (_sse2neon_unlikely((imm) & ~15)) { \
5816 ret = _mm_setzero_si128(); \
5818 ret = vreinterpretq_m128i_u16( \
5819 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
5838#define _mm_srli_epi32(a, imm) \
5841 if (_sse2neon_unlikely((imm) & ~31)) { \
5842 ret = _mm_setzero_si128(); \
5844 ret = vreinterpretq_m128i_u32( \
5845 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
5863#define _mm_srli_epi64(a, imm) \
5866 if (_sse2neon_unlikely((imm) & ~63)) { \
5867 ret = _mm_setzero_si128(); \
5869 ret = vreinterpretq_m128i_u64( \
5870 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
5899#if defined(__aarch64__)
5900 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5912#if defined(__aarch64__)
5913 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5914 vst1q_f64((float64_t *) mem_addr,
5915 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5918 vst1q_f32((float32_t *) mem_addr,
5928#if defined(__aarch64__)
5929 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5946#define _mm_store1_pd _mm_store_pd1
5956#if defined(__aarch64__)
5957 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5978#if defined(__aarch64__)
5979 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
6029#if __has_builtin(__builtin_nontemporal_store)
6030 __builtin_nontemporal_store(a, (float32x4_t *) p);
6031#elif defined(__aarch64__)
6032 vst1q_f64(p, vreinterpretq_f64_m128d(a));
6044#if __has_builtin(__builtin_nontemporal_store)
6045 __builtin_nontemporal_store(a, p);
6057 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
6066 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
6124#if defined(__aarch64__)
6125 return vreinterpretq_m128d_f64(
6126 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6128 double *da = (double *) &a;
6129 double *db = (double *) &b;
6131 c[0] = da[0] - db[0];
6132 c[1] = da[1] - db[1];
6133 return vld1q_f32((float32_t *) c);
6212#define _mm_ucomieq_sd _mm_comieq_sd
6213#define _mm_ucomige_sd _mm_comige_sd
6214#define _mm_ucomigt_sd _mm_comigt_sd
6215#define _mm_ucomile_sd _mm_comile_sd
6216#define _mm_ucomilt_sd _mm_comilt_sd
6217#define _mm_ucomineq_sd _mm_comineq_sd
6223#if defined(__GNUC__) || defined(__clang__)
6224#pragma GCC diagnostic push
6225#pragma GCC diagnostic ignored "-Wuninitialized"
6229#if defined(__GNUC__) || defined(__clang__)
6230#pragma GCC diagnostic pop
6249#if defined(__aarch64__)
6255 int16x4x2_t result = vzip_s16(a1, b1);
6265#if defined(__aarch64__)
6271 int32x2x2_t result = vzip_s32(a1, b1);
6302#if defined(__aarch64__)
6310 int8x8x2_t result = vzip_s8(a1, b1);
6328#if defined(__aarch64__)
6329 return vreinterpretq_m128d_f64(
6330 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6353#if defined(__aarch64__)
6359 int16x4x2_t result = vzip_s16(a1, b1);
6375#if defined(__aarch64__)
6381 int32x2x2_t result = vzip_s32(a1, b1);
6407#if defined(__aarch64__)
6413 int8x8x2_t result = vzip_s8(a1, b1);
6431#if defined(__aarch64__)
6432 return vreinterpretq_m128d_f64(
6433 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6483#if defined(__aarch64__)
6484 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6485 vreinterpretq_f64_m128d(b),
6486 vreinterpretq_f64_m128d(mask)));
6499#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA)
6513#if defined(__aarch64__)
6514 return vreinterpretq_m128d_f64(
6515 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6517 double *da = (double *) &a;
6518 double *db = (double *) &b;
6519 double c[] = {da[0] + da[1], db[0] + db[1]};
6529#if defined(__aarch64__)
6538 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6547#if defined(__aarch64__)
6548 float64x2_t a = vreinterpretq_f64_m128d(_a);
6549 float64x2_t b = vreinterpretq_f64_m128d(_b);
6550 return vreinterpretq_m128d_f64(
6551 vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
6553 double *da = (double *) &_a;
6554 double *db = (double *) &_b;
6555 double c[] = {da[0] - da[1], db[0] - db[1]};
6567#if defined(__aarch64__)
6569 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6571 float32x4x2_t c = vuzpq_f32(a, b);
6583#define _mm_lddqu_si128 _mm_loadu_si128
6592#define _mm_loaddup_pd _mm_load1_pd
6599#if defined(__aarch64__)
6600 return vreinterpretq_m128d_f64(
6601 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6613#if __has_builtin(__builtin_shufflevector)
6629#if __has_builtin(__builtin_shufflevector)
6742 tmp[1] = vdupq_n_u8(0);
6758#define _mm_alignr_pi8(a, b, imm) \
6761 if (_sse2neon_unlikely((imm) >= 16)) { \
6762 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6764 uint8x8_t tmp_low, tmp_high; \
6766 const int idx = (imm) -8; \
6767 tmp_low = vreinterpret_u8_m64(a); \
6768 tmp_high = vdup_n_u8(0); \
6769 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6771 const int idx = (imm); \
6772 tmp_low = vreinterpret_u8_m64(b); \
6773 tmp_high = vreinterpret_u8_m64(a); \
6774 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6786#if defined(__aarch64__)
6790 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6791 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6802 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6803 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6828#if defined(__aarch64__)
6831 return vreinterpretq_s64_s16(
6832 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6839 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6840 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6853#if defined(__aarch64__)
6854 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6856 int16x4x2_t res = vuzp_s16(a, b);
6857 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6868#if defined(__aarch64__)
6870 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6872 int16x8x2_t c = vuzpq_s16(a, b);
6884#if defined(__aarch64__)
6886 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6888 int32x4x2_t c = vuzpq_s32(a, b);
6900#if defined(__aarch64__)
6903 int16x4x2_t c = vuzp_s16(a, b);
6915#if defined(__aarch64__)
6918 int32x2x2_t c = vuzp_s32(a, b);
6930#if defined(__aarch64__)
6932 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6934 int16x8x2_t c = vuzpq_s16(a, b);
6946#if defined(__aarch64__)
6949 int16x4x2_t c = vuzp_s16(a, b);
6966#if defined(__aarch64__)
6969 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6970 vmovl_s8(vget_low_s8(b)));
6971 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6972 vmovl_s8(vget_high_s8(b)));
6974 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6982 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6983 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6986 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6987 int16x8_t b_odd = vshrq_n_s16(b, 8);
6990 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6991 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
7009 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
7010 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
7013 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
7014 int16x4_t b_odd = vshr_n_s16(b, 8);
7017 int16x4_t prod1 = vmul_s16(a_even, b_even);
7018 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
7046 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
7047 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
7059 int32x4_t mul_extend =
7073 uint8x16_t idx_masked =
7074 vandq_u8(idx, vdupq_n_u8(0x8F));
7075#if defined(__aarch64__)
7077#elif defined(__GNUC__)
7081 __asm__ __volatile__(
7082 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
7083 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
7085 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
7089 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7091 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7092 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
7112 const int8x8_t controlMask =
7139 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
7141#if defined(__aarch64__)
7142 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
7144 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
7149 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
7151 int16x8_t res = vbicq_s16(masked, zeroMask);
7176 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
7179#if defined(__aarch64__)
7180 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
7182 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
7187 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
7189 int32x4_t res = vbicq_s32(masked, zeroMask);
7214 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
7217#if defined(__aarch64__)
7218 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
7220 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
7225 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
7227 int8x16_t res = vbicq_s8(masked, zeroMask);
7255 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7258#if defined(__aarch64__)
7259 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7261 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7266 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7268 int16x4_t res = vbic_s16(masked, zeroMask);
7296 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7299#if defined(__aarch64__)
7300 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7302 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7307 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7309 int32x2_t res = vbic_s32(masked, zeroMask);
7337 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7340#if defined(__aarch64__)
7341 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7343 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7348 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7350 int8x8_t res = vbic_s8(masked, zeroMask);
7370#define _mm_blend_epi16(a, b, imm) \
7372 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
7373 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
7374 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
7375 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
7376 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
7377 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
7378 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
7379 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7380 uint16x8_t _mask_vec = vld1q_u16(_mask); \
7381 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7382 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7383 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7389#define _mm_blend_pd(a, b, imm) \
7391 const uint64_t _mask[2] = { \
7392 ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7393 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7394 uint64x2_t _mask_vec = vld1q_u64(_mask); \
7395 uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7396 uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7397 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7406 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7407 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7408 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7409 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7410 uint32x4_t mask = vld1q_u32(data);
7444#if defined(__aarch64__)
7445 float64x2_t a = vreinterpretq_f64_m128d(_a);
7446 float64x2_t b = vreinterpretq_f64_m128d(_b);
7447 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7474#if defined(__aarch64__)
7475 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7477 double *f = (double *) &a;
7488#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7491 float *f = (float *) &a;
7492 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7524#if defined(__aarch64__)
7532 uint32x4_t swapped = vrev64q_u32(cmp);
7550 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7551 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7568 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7577 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7578 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7587 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7588 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7589 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7606 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7607 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7625 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7635 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7636 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7645 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7646 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7647 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7658 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7659 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7660#if !SSE2NEON_PRECISE_DP
7661 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7662 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7665#if !SSE2NEON_PRECISE_DP
7671#if defined(__aarch64__)
7672 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7673 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7675 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7676 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7679 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
7680 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
7685#if defined(__aarch64__)
7686 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7688 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
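/* _mm_dp_pd (DPPD): the high nibble of imm gates which element-wise products
 * enter the sum (bits 4/5 for lanes 0/1), and the low nibble (bit0Mask /
 * bit1Mask above) selects which output lanes receive that sum; the rest are
 * zeroed. Illustrative usage (not part of the original header):
 *   _mm_dp_pd(a, b, 0x31); // lane0 = a0*b0 + a1*b1, lane1 = 0
 */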
7703#if defined(__aarch64__)
7733 (imm & 0x1) ? s : 0,
7734 (imm & 0x2) ? s : 0,
7735 (imm & 0x4) ? s : 0,
7736 (imm & 0x8) ? s : 0,
7744#define _mm_extract_epi32(a, imm) \
7745 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7750#define _mm_extract_epi64(a, imm) \
7751 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7757#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7761#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
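/* The extract macros map directly onto vgetq_lane_* with a compile-time lane
 * index. As in SSE4.1, _mm_extract_ps returns the raw 32-bit pattern of the
 * selected float reinterpreted as an int, not a converted integer value. */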
7769#if defined(__aarch64__)
7770 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7772 double *f = (double *) &a;
7783#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7786 float *f = (float *) &a;
7787 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7819#define _mm_insert_epi32(a, b, imm) \
7821 vreinterpretq_m128i_s32( \
7822 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7829#define _mm_insert_epi64(a, b, imm) \
7831 vreinterpretq_m128i_s64( \
7832 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7839#define _mm_insert_epi8(a, b, imm) \
7841 vreinterpretq_m128i_s8( \
7842 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7849#define _mm_insert_ps(a, b, imm8) \
7851 float32x4_t tmp1 = \
7852 vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
7853 vreinterpretq_f32_m128(a), 0); \
7854 float32x4_t tmp2 = \
7855 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
7856 ((imm8 >> 4) & 0x3)); \
7857 const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7858 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7859 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7860 ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
7861 uint32x4_t mask = vld1q_u32(data); \
7862 float32x4_t all_zeros = vdupq_n_f32(0); \
7864 vreinterpretq_m128_f32( \
7865 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
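/* _mm_insert_ps follows the SSE4.1 INSERTPS encoding: imm8[7:6] selects the
 * source lane of b, imm8[5:4] the destination lane in a (handled above with
 * two vsetq_lane_f32 steps), and imm8[3:0] is a zero mask applied afterwards
 * via vbslq_f32 against an all-zero vector. */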
7975 uint16_t min, idx = 0;
7977#if defined(__aarch64__)
7992 for (i = 0; i < 8; i++) {
8019 switch (imm & 0x4) {
8029#if defined(__GNUC__) || defined(__clang__)
8030 __builtin_unreachable();
8035 switch (imm & 0x3) {
8037 _b = vreinterpretq_u8_u32(
8041 _b = vreinterpretq_u8_u32(
8045 _b = vreinterpretq_u8_u32(
8049 _b = vreinterpretq_u8_u32(
8053#if defined(__GNUC__) || defined(__clang__)
8054 __builtin_unreachable();
8059 int16x8_t c04, c15, c26, c37;
8060 uint8x8_t low_b = vget_low_u8(_b);
8061 c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8062 _a = vextq_u8(_a, _a, 1);
8063 c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8064 _a = vextq_u8(_a, _a, 1);
8065 c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8066 _a = vextq_u8(_a, _a, 1);
8067 c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8068#if defined(__aarch64__)
8070 c04 = vpaddq_s16(c04, c26);
8072 c15 = vpaddq_s16(c15, c37);
8075 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8077 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8079 vreinterpretq_s16_s32(trn2_c)));
8081 int16x4_t c01, c23, c45, c67;
8082 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
8083 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
8084 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
8085 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
8088 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
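/* _mm_mpsadbw_epu8: imm[2] picks the starting 32-bit offset within a and
 * imm[1:0] picks the 4-byte block of b. The eight overlapping sums of
 * absolute differences are produced by sliding a one byte at a time with
 * vextq_u8 and pairwise-adding the widened absolute differences (vpaddq_s16
 * on AArch64, vpadd_s16 chains on ARMv7). */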
8138#if defined(__aarch64__)
8141 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
8147 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
8149 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
8152 double *v_double = (double *) &a;
8158 for (int i = 0; i < 2; i++) {
8159 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
8160 double roundDown = floor(tmp);
8161 double roundUp = ceil(tmp);
8162 double diffDown = tmp - roundDown;
8163 double diffUp = roundUp - tmp;
8164 if (diffDown < diffUp) {
8167 } else if (diffDown > diffUp) {
8173 double half = roundDown / 2;
8174 if (half != floor(half)) {
8184 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
8196 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
8197 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
8207#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
8221 float *v_float = (float *) &a;
8226 uint32x4_t signmask = vdupq_n_u32(0x80000000);
8229 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
8231 int32x4_t r_trunc = vcvtq_s32_f32(
8233 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
8234 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
8235 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
8237 float32x4_t delta = vsubq_f32(
8239 vcvtq_f32_s32(r_trunc));
8240 uint32x4_t is_delta_half =
8241 vceqq_f32(delta, half);
8243 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
8253 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
8254 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
8255 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
8256 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
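/* Without hardware directed rounding, _mm_round_ps emulates
 * round-to-nearest-even: r_normal is the naively rounded value, r_trunc the
 * truncated one, and when the fractional part is exactly 0.5 (is_delta_half)
 * the even candidate r_even is selected instead. The final branch above
 * handles truncation toward zero with floorf() for positive lanes and
 * ceilf() for negative ones. */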
8299#if __has_builtin(__builtin_nontemporal_store)
8300 return __builtin_nontemporal_load(p);
8311 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
8320 int64x2_t a_and_mask =
8322 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
8337 uint64x2_t result = vandq_u64(zf, cf);
8338 return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
8351 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8360#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
8371 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
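/* PTEST-style predicates: _mm_testz_si128 reports whether (a & b) is all
 * zeros, _mm_testc_si128 whether (~a & b) is all zeros, and
 * _mm_test_mix_ones_zeros (aliased to _mm_testnzc_si128 above) whether both
 * a & b and ~a & b contain at least one set bit. The reductions extract both
 * 64-bit halves with vgetq_lane and combine them. */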
8380#if defined(__aarch64__)
8395#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8396 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8399#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8400 crc = __crc32ch(crc, v);
8413#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8414 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8417#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8418 crc = __crc32cw(crc, v);
8431#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8432 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8437 crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8447#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8448 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8451#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8452 crc = __crc32cb(crc, v);
8455 for (int bit = 0; bit < 8; bit++) {
8457 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
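/* SSE4.2 CRC32 helpers: with the ARMv8 CRC32 extension the crc32cb/ch/cw/cx
 * instructions (inline asm here, ACLE __crc32c* otherwise) compute the
 * CRC-32C update directly, and the 64-bit variant without them is split into
 * two 32-bit updates. The pure-software byte fallback is the classic bitwise
 * loop over the reflected CRC-32C (Castagnoli) polynomial 0x82F63B78. */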
8467#if !defined(__ARM_FEATURE_CRYPTO)
8469#define SSE2NEON_AES_DATA(w) \
8471 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8472 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8473 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8474 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8475 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8476 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8477 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8478 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8479 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8480 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8481 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8482 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8483 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8484 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8485 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8486 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8487 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8488 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8489 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8490 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8491 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8492 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8493 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8494 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8495 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8496 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8497 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8498 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8499 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8500 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8501 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8502 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8503 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8504 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8505 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8506 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8507 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8512#define SSE2NEON_AES_H0(x) (x)
8514#undef SSE2NEON_AES_H0
8524#if defined(__aarch64__)
8525 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
8526 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
8527 0xc, 0x1, 0x6, 0xb};
8528 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8529 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
8535 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8544 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8545 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8546 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
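/* AArch64 without the crypto extension: ShiftRows is emulated with a single
 * vqtbl1q_u8 table lookup (shift_rows), and MixColumns is built from xtime
 * (left shift with conditional 0x1b reduction, the (v << 1) ^ ... line above)
 * combined with byte rotations via vrev32q_u16 and the ror32by8 lookup. */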
8552#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8553 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8554 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8555#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b ))
8556#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8557#define SSE2NEON_AES_U0(p) \
8558 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8559#define SSE2NEON_AES_U1(p) \
8560 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8561#define SSE2NEON_AES_U2(p) \
8562 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8563#define SSE2NEON_AES_U3(p) \
8564 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8565 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8571#undef SSE2NEON_AES_B2W
8572#undef SSE2NEON_AES_F2
8573#undef SSE2NEON_AES_F3
8574#undef SSE2NEON_AES_U0
8575#undef SSE2NEON_AES_U1
8576#undef SSE2NEON_AES_U2
8577#undef SSE2NEON_AES_U3
8585 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8586 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8587 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8588 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8589 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8590 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8591 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8592 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
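/* The ARMv7 fallback uses the classic T-table formulation of AES: the
 * SSE2NEON_AES_U0..U3 macros bake SubBytes, ShiftRows and MixColumns into
 * four 256-entry 32-bit tables, so each output word is the XOR of four table
 * lookups indexed by bytes taken along the ShiftRows diagonals (the
 * x3/x0/x1/x2 indexing above). */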
8622 for (int i = 0; i < 16; i++)
8638 for (int i = 0; i < 4; ++i) {
8643 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8645#undef SSE2NEON_AES_DATA
8676 u8[0x4], u8[0x1], u8[0xE], u8[0xB],
8677 u8[0x1], u8[0xE], u8[0xB], u8[0x4],
8678 u8[0xC], u8[0x9], u8[0x6], u8[0x3],
8679 u8[0x9], u8[0x6], u8[0x3], u8[0xC],
8681 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
8695 switch (imm & 0x11) {
8717#if defined(__aarch64__)
8724#if defined(__aarch64__)
8725 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
8727 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
8738#if defined(__aarch64__)
8739#if __has_builtin(__builtin_popcount)
8740 return __builtin_popcount(a);
8742 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
8746 uint8x8_t input_val, count8x8_val;
8747 uint16x4_t count16x4_val;
8748 uint32x2_t count32x2_val;
8750 input_val = vld1_u8((uint8_t *) &a);
8751 count8x8_val = vcnt_u8(input_val);
8752 count16x4_val = vpaddl_u8(count8x8_val);
8753 count32x2_val = vpaddl_u16(count16x4_val);
8755 vst1_u32(&count, count32x2_val);
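/* popcnt emulation: vcnt_u8 counts set bits per byte; the byte counts are
 * then summed either with a single vaddlv_u8 (AArch64) or with the
 * vpaddl_u8 / vpaddl_u16 pairwise-widening chain shown above (ARMv7). */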
8765#if defined(__aarch64__)
8766#if __has_builtin(__builtin_popcountll)
8767 return __builtin_popcountll(a);
8769 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
8773 uint8x8_t input_val, count8x8_val;
8774 uint16x4_t count16x4_val;
8775 uint32x2_t count32x2_val;
8776 uint64x1_t count64x1_val;
8778 input_val = vld1_u8((uint8_t *) &a);
8779 count8x8_val = vcnt_u8(input_val);
8780 count16x4_val = vpaddl_u8(count8x8_val);
8781 count32x2_val = vpaddl_u16(count16x4_val);
8782 count64x1_val = vpaddl_u32(count32x2_val);
8783 vst1_u64(&count, count64x1_val);
8794#if defined(__aarch64__)
8801#if defined(__aarch64__)
8802 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
8804 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
8809#if defined(__aarch64__)
8810 __asm__ __volatile__("msr FPCR, %0" :: "r"(r));
8812 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
8821#if defined(__aarch64__)
8830 asm volatile("mrs %0, cntvct_el0" : "=r"(val));
8834 uint32_t pmccntr, pmuseren, pmcntenset;
8837 asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
8839 asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
8840 if (pmcntenset & 0x80000000UL) {
8841 asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
8843 return (uint64_t) (pmccntr) << 6;
8849 gettimeofday(&tv, NULL);
8850 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
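/* Timestamp fallback for rdtsc-style counters: AArch64 reads the virtual
 * counter CNTVCT_EL0; ARMv7 reads PMCCNTR via cp15 when user-mode access is
 * enabled (the << 6 compensates for the counter ticking once every 64
 * cycles); otherwise the code falls back to gettimeofday() in microseconds. */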
8854#if defined(__GNUC__) || defined(__clang__)
8855#pragma pop_macro("ALIGN_STRUCT")
8856#pragma pop_macro("FORCE_INLINE")
8859#if defined(__GNUC__) && !defined(__clang__)
8860#pragma GCC pop_options