Vector Optimized Library of Kernels 3.1.0
Architecture-tuned implementations of math kernels
sse2neon.h
1/*
2 * Copyright 2022 Free Software Foundation, Inc.
3 *
4 * This file is part of VOLK
5 *
6 * SPDX-License-Identifier: MIT
7 *
8 * This file is from:
9 * https://github.com/DLTcollab/sse2neon
10 */
11
12// Turn off Clang formatting, as
13// this would make diffs a lot more
14// tricky.
15// clang-format off
16#ifndef SSE2NEON_H
17#define SSE2NEON_H
18
19// This header file provides a simple API translation layer
20// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
21//
22// This header file does not yet translate all of the SSE intrinsics.
23//
24// Contributors to this work are:
25// John W. Ratcliff <jratcliffscarab@gmail.com>
26// Brandon Rowlett <browlett@nvidia.com>
27// Ken Fast <kfast@gdeb.com>
28// Eric van Beurden <evanbeurden@nvidia.com>
29// Alexander Potylitsin <apotylitsin@nvidia.com>
30// Hasindu Gamaarachchi <hasindu2008@gmail.com>
31// Jim Huang <jserv@biilabs.io>
32// Mark Cheng <marktwtn@biilabs.io>
33// Malcolm James MacLeod <malcolm@gulden.com>
34// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
35// Sebastian Pop <spop@amazon.com>
36// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
37// Danila Kutenin <danilak@google.com>
38// François Turban (JishinMaster) <francois.turban@gmail.com>
39// Pei-Hsuan Hung <afcidk@gmail.com>
40// Yang-Hao Yuan <yanghau@biilabs.io>
41// Syoyo Fujita <syoyo@lighttransport.com>
42// Brecht Van Lommel <brecht@blender.org>
43
44/*
45 * sse2neon is freely redistributable under the MIT License.
46 *
47 * Permission is hereby granted, free of charge, to any person obtaining a copy
48 * of this software and associated documentation files (the "Software"), to deal
49 * in the Software without restriction, including without limitation the rights
50 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
51 * copies of the Software, and to permit persons to whom the Software is
52 * furnished to do so, subject to the following conditions:
53 *
54 * The above copyright notice and this permission notice shall be included in
55 * all copies or substantial portions of the Software.
56 *
57 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
62 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
63 * SOFTWARE.
64 */
65
66/* Tunable configurations */
67
68/* Enable precise implementations of math operations.
69 * This slows down the computation a bit, but gives results consistent with
70 * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
71 */
72/* _mm_min|max_ps|ss|pd|sd */
73#ifndef SSE2NEON_PRECISE_MINMAX
74#define SSE2NEON_PRECISE_MINMAX (0)
75#endif
76/* _mm_rcp_ps and _mm_div_ps */
77#ifndef SSE2NEON_PRECISE_DIV
78#define SSE2NEON_PRECISE_DIV (0)
79#endif
80/* _mm_sqrt_ps and _mm_rsqrt_ps */
81#ifndef SSE2NEON_PRECISE_SQRT
82#define SSE2NEON_PRECISE_SQRT (0)
83#endif
84/* _mm_dp_pd */
85#ifndef SSE2NEON_PRECISE_DP
86#define SSE2NEON_PRECISE_DP (0)
87#endif
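/* Example (illustrative sketch, not part of the upstream header): because the
 * macros above are guarded with #ifndef, a precise mode can be requested by
 * defining it before this header is included, e.g.
 *
 *     #define SSE2NEON_PRECISE_MINMAX 1
 *     #include "sse2neon.h"
 *
 * Leaving a macro at 0 keeps the faster, less strict NEON behaviour for that
 * group of intrinsics.
 */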
88
89/* compiler specific definitions */
90#if defined(__GNUC__) || defined(__clang__)
91#pragma push_macro("FORCE_INLINE")
92#pragma push_macro("ALIGN_STRUCT")
93#define FORCE_INLINE static inline __attribute__((always_inline))
94#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
95#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
96#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
97#else /* non-GNU / non-clang compilers */
98#warning "Macro name collisions may happen with unsupported compiler."
99#ifndef FORCE_INLINE
100#define FORCE_INLINE static inline
101#endif
102#ifndef ALIGN_STRUCT
103#define ALIGN_STRUCT(x) __declspec(align(x))
104#endif
105#define _sse2neon_likely(x) (x)
106#define _sse2neon_unlikely(x) (x)
107#endif
108
109/* C language does not allow initializing a variable with a function call. */
110#ifdef __cplusplus
111#define _sse2neon_const static const
112#else
113#define _sse2neon_const const
114#endif
115
116#include <stdint.h>
117#include <stdlib.h>
118
119/* Architecture-specific build options */
120/* FIXME: #pragma GCC push_options is only available on GCC */
121#if defined(__GNUC__)
122#if defined(__arm__) && __ARM_ARCH == 7
123/* According to the ARM C Language Extensions (ACLE) specification,
124 * __ARM_NEON is defined to a value indicating that the Advanced SIMD (NEON)
125 * architecture is supported.
126 */
127#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
128#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
129#endif
130#if !defined(__clang__)
131#pragma GCC push_options
132#pragma GCC target("fpu=neon")
133#endif
134#elif defined(__aarch64__)
135#if !defined(__clang__)
136#pragma GCC push_options
137#pragma GCC target("+simd")
138#endif
139#elif __ARM_ARCH == 8
140#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
141#error \
142 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
143#endif
144#if !defined(__clang__)
145#pragma GCC push_options
146#endif
147#else
148#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
149#endif
150#endif
151
152#include <arm_neon.h>
153#if !defined(__aarch64__) && (__ARM_ARCH == 8)
154#if defined __has_include && __has_include(<arm_acle.h>)
155#include <arm_acle.h>
156#endif
157#endif
158
159/* Rounding functions require either AArch64 instructions or a libm fallback */
160#if !defined(__aarch64__)
161#include <math.h>
162#endif
163
164/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
165 * or not accessible at all in user mode.
166 * To write to or read these registers from user mode,
167 * we have to perform a syscall instead.
168 */
169#if !defined(__aarch64__)
170#include <sys/time.h>
171#endif
172
173/* "__has_builtin" can be used to query support for built-in functions
174 * provided by gcc/clang and other compilers that support it.
175 */
176#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
177/* Compatibility with gcc <= 9 */
178#if defined(__GNUC__) && (__GNUC__ <= 9)
179#define __has_builtin(x) HAS##x
180#define HAS__builtin_popcount 1
181#define HAS__builtin_popcountll 1
182#else
183#define __has_builtin(x) 0
184#endif
185#endif
186
195#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
196 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
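// Worked example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) evaluates to
// (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the selector that leaves the
// four 32-bit lanes of _mm_shuffle_epi32(a, imm) in their original order.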
197
198/* Rounding mode macros. */
199#define _MM_FROUND_TO_NEAREST_INT 0x00
200#define _MM_FROUND_TO_NEG_INF 0x01
201#define _MM_FROUND_TO_POS_INF 0x02
202#define _MM_FROUND_TO_ZERO 0x03
203#define _MM_FROUND_CUR_DIRECTION 0x04
204#define _MM_FROUND_NO_EXC 0x08
205#define _MM_FROUND_RAISE_EXC 0x00
206#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
207#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
208#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
209#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
210#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
211#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
212#define _MM_ROUND_NEAREST 0x0000
213#define _MM_ROUND_DOWN 0x2000
214#define _MM_ROUND_UP 0x4000
215#define _MM_ROUND_TOWARD_ZERO 0x6000
216/* Flush zero mode macros. */
217#define _MM_FLUSH_ZERO_MASK 0x8000
218#define _MM_FLUSH_ZERO_ON 0x8000
219#define _MM_FLUSH_ZERO_OFF 0x0000
220/* Denormals are zeros mode macros. */
221#define _MM_DENORMALS_ZERO_MASK 0x0040
222#define _MM_DENORMALS_ZERO_ON 0x0040
223#define _MM_DENORMALS_ZERO_OFF 0x0000
224
225/* indicate immediate constant argument in a given range */
226#define __constrange(a, b) const
227
228/* A few intrinsics accept traditional data types like ints or floats, but
229 * most operate on data types that are specific to SSE.
230 * If a vector type ends in d, it contains doubles, and if it does not have
231 * a suffix, it contains floats. An integer vector type can contain any type
232 * of integer, from chars to shorts to unsigned long longs.
233 */
234typedef int64x1_t __m64;
235typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
236// On the 32-bit ARM architecture, float64x2_t is not supported.
237// The data type __m128d therefore has to be represented differently for the
238// related intrinsic conversions.
239#if defined(__aarch64__)
240typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
241#else
242typedef float32x4_t __m128d;
243#endif
244typedef int64x2_t __m128i; /* 128-bit vector containing integers */
245
246// __int64 is defined in the Intrinsics Guide and maps to a different data type
247// in different data models
248#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
249#if (defined(__x86_64__) || defined(__i386__))
250#define __int64 long long
251#else
252#define __int64 int64_t
253#endif
254#endif
255
256/* type-safe casting between types */
257
258#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
259#define vreinterpretq_m128_f32(x) (x)
260#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
261
262#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
263#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
264#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
265#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
266
267#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
268#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
269#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
270#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
271
272#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
273#define vreinterpretq_f32_m128(x) (x)
274#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
275
276#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
277#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
278#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
279#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
280
281#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
282#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
283#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
284#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
285
286#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
287#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
288#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
289#define vreinterpretq_m128i_s64(x) (x)
290
291#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
292#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
293#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
294#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
295
296#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
297#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
298
299#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
300#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
301#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
302#define vreinterpretq_s64_m128i(x) (x)
303
304#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
305#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
306#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
307#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
308
309#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
310#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
311#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
312#define vreinterpret_m64_s64(x) (x)
313
314#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
315#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
316#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
317#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
318
319#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
320#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
321#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
322
323#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
324#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
325#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
326#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
327
328#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
329#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
330#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
331#define vreinterpret_s64_m64(x) (x)
332
333#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
334
335#if defined(__aarch64__)
336#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
337#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
338
339#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
340
341#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
342#define vreinterpretq_m128d_f64(x) (x)
343
344#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
345
346#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
347#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
348
349#define vreinterpretq_f64_m128d(x) (x)
350#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
351#else
352#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
353#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
354
355#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
356#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
357
358#define vreinterpretq_m128d_f32(x) (x)
359
360#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
361
362#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
363#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
364
365#define vreinterpretq_f32_m128d(x) (x)
366#endif
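// Illustrative sketch (not part of the upstream header): the reinterpret
// helpers above switch between the NEON view of a vector and the SSE-style
// __m128/__m128i/__m128d types without moving any data. For example, two
// __m128i values a and b (assumed to exist) could have their unsigned bytes
// added with:
//
//   __m128i byte_sum = vreinterpretq_m128i_u8(
//       vaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));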
367
368// This header file defines a union called 'SIMDVec' which can be used by
369// applications that attempt to access the contents of an __m128 struct
370// directly. Note that accessing the __m128 struct directly is considered
371// bad coding practice by Microsoft; see:
372// https://docs.microsoft.com/en-us/cpp/cpp/m128
373//
374// However, some legacy source code may try to access the contents of an __m128
375// struct directly so the developer can use the SIMDVec as an alias for it. Any
376// casting must be done manually by the developer, as you cannot cast or
377// otherwise alias the base NEON data type for intrinsic operations.
378//
379// The union is intended to allow direct access to an __m128 variable using the names
380// that the MSVC compiler provides. This union should really only be used when
381// trying to access the members of the vector as integer values. GCC/clang
382// allow native access to the float members through a simple array access
383// operator (in C since 4.6, in C++ since 4.8).
384//
385// Ideally, direct accesses to SIMD vectors should be avoided since they can
386// cause a performance hit. If it really is needed, however, the original __m128
387// variable can be aliased with a pointer to this union and used to access
388// individual components. The use of this union should be hidden behind a macro
389// that is used throughout the codebase to access the members instead of always
390// declaring this type of variable.
391typedef union ALIGN_STRUCT(16) SIMDVec {
392 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
393 int8_t m128_i8[16]; // as signed 8-bit integers.
394 int16_t m128_i16[8]; // as signed 16-bit integers.
395 int32_t m128_i32[4]; // as signed 32-bit integers.
396 int64_t m128_i64[2]; // as signed 64-bit integers.
397 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
398 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
399 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
400 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
401} SIMDVec;
402
403// casting using SIMDVec
404#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
405#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
406#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
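// Illustrative sketch (not part of the upstream header): reading one lane of
// an __m128i through the SIMDVec aliasing macros defined above.
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); // lane0 == 1
//
// On GCC/Clang the same lane could also be read with
// vgetq_lane_s32(vreinterpretq_s32_m128i(v), 0), avoiding the pointer cast.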
407
408/* SSE macros */
409#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
410#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
411#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
412#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
413
414// Function declaration
415// SSE
421// SSE2
428FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
429FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
430FORCE_INLINE __m128d _mm_set_pd(double, double);
433// SSE4.1
440// SSE4.2
441FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
442
443/* Backwards compatibility for compilers that lack specific type support */
444
445// Older GCC does not define the vld1q_u8_x4 intrinsic
446#if defined(__GNUC__) && !defined(__clang__) && \
447 ((__GNUC__ <= 12 && defined(__arm__)) || \
448 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
449 (__GNUC__ <= 9 && defined(__aarch64__)))
450FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
451{
452 uint8x16x4_t ret;
453 ret.val[0] = vld1q_u8(p + 0);
454 ret.val[1] = vld1q_u8(p + 16);
455 ret.val[2] = vld1q_u8(p + 32);
456 ret.val[3] = vld1q_u8(p + 48);
457 return ret;
458}
459#else
460// Wraps vld1q_u8_x4
461FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
462{
463 return vld1q_u8_x4(p);
464}
465#endif
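// Usage sketch (illustrative): either variant of _sse2neon_vld1q_u8_x4 loads
// 64 contiguous bytes into four q registers.
//
//   uint8_t buf[64] = {0};
//   uint8x16x4_t blocks = _sse2neon_vld1q_u8_x4(buf);
//   // blocks.val[0] holds bytes 0..15, blocks.val[3] holds bytes 48..63.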
466
467/* Function Naming Conventions
468 * The naming convention of SSE intrinsics is straightforward. A generic SSE
469 * intrinsic function is given as follows:
470 * _mm_<name>_<data_type>
471 *
472 * The parts of this format are given as follows:
473 * 1. <name> describes the operation performed by the intrinsic
474 * 2. <data_type> identifies the data type of the function's primary arguments
475 *
476 * This last part, <data_type>, is a little complicated. It identifies the
477 * content of the input values, and can be set to any of the following values:
478 * + ps - vectors contain floats (ps stands for packed single-precision)
479 * + pd - vectors contain doubles (pd stands for packed double-precision)
480 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
481 * signed integers
482 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
483 * unsigned integers
484 * + si128 - unspecified 128-bit vector or 256-bit vector
485 * + m128/m128i/m128d - identifies input vector types when they are different
486 * than the type of the returned vector
487 *
488 * For example, _mm_setzero_ps. The _mm implies that the function returns
489 * a 128-bit vector. The _ps at the end implies that the argument vectors
490 * contain floats.
491 *
492 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
493 * // Set packed 16-bit integers. 128 bits, 8 shorts, 16 bits each
494 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
495 * // Set packed 8-bit integers
496 * // 128 bits, 16 chars, 8 bits each
497 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
498 * 4, 5, 12, 13, 6, 7, 14, 15);
499 * // Shuffle packed 8-bit integers
500 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
501 *
502 * Data (Number, Binary, Byte Index):
503 +------+------+-------------+------+------+-------------+
504 | 1 | 2 | 3 | 4 | Number
505 +------+------+------+------+------+------+------+------+
506 | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
507 +------+------+------+------+------+------+------+------+
508 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
509 +------+------+------+------+------+------+------+------+
510
511 +------+------+------+------+------+------+------+------+
512 | 5 | 6 | 7 | 8 | Number
513 +------+------+------+------+------+------+------+------+
514 | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
515 +------+------+------+------+------+------+------+------+
516 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
517 +------+------+------+------+------+------+------+------+
518 * Index (Byte Index):
519 +------+------+------+------+------+------+------+------+
520 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
521 +------+------+------+------+------+------+------+------+
522
523 +------+------+------+------+------+------+------+------+
524 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
525 +------+------+------+------+------+------+------+------+
526 * Result:
527 +------+------+------+------+------+------+------+------+
528 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
529 +------+------+------+------+------+------+------+------+
530 | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
531 +------+------+------+------+------+------+------+------+
532 | 256 | 2 | 5 | 6 | Number
533 +------+------+------+------+------+------+------+------+
534
535 +------+------+------+------+------+------+------+------+
536 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
537 +------+------+------+------+------+------+------+------+
538 | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
539 +------+------+------+------+------+------+------+------+
540 | 3 | 7 | 4 | 8 | Number
541 +------+------+------+------+------+------+-------------+
542 */
543
544/* Constants for use with _mm_prefetch. */
545enum _mm_hint {
546 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
547 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
548 _MM_HINT_T1 = 2, /* load data to L2 cache only */
549 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
550 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
551 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
552 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
553 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
554};
555
556// The bit field mapping to the FPCR (floating-point control register)
557typedef struct {
558 uint16_t res0;
559 uint8_t res1 : 6;
560 uint8_t bit22 : 1;
561 uint8_t bit23 : 1;
562 uint8_t bit24 : 1;
563 uint8_t res2 : 7;
564#if defined(__aarch64__)
565 uint32_t res3;
566#endif
567} fpcr_bitfield;
568
569// Takes the upper 64 bits of a and places it in the low end of the result
570// Takes the lower 64 bits of b and places it into the high end of the result.
572{
573 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
574 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
575 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
576}
577
578// takes the lower two 32-bit values from a and swaps them and places in high
579// end of result takes the higher two 32 bit values from b and swaps them and
580// places in low end of result.
582{
583 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
584 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
585 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
586}
587
589{
590 float32x2_t a21 = vget_high_f32(
592 float32x2_t b03 = vget_low_f32(
594 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
595}
596
598{
599 float32x2_t a03 = vget_low_f32(
601 float32x2_t b21 = vget_high_f32(
603 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
604}
605
607{
608 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
609 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
610 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
611}
612
614{
615 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
616 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
617 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
618}
619
621{
622 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
623 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
624 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
625}
626
627// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
628// high
630{
631 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
632 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
633 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
634}
635
637{
638 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
639 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
640 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
641}
642
644{
645 float32x2_t a22 =
646 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
647 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
648 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
649}
650
652{
653 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
654 float32x2_t b22 =
655 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
656 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
657}
658
660{
661 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
662 float32x2_t a22 =
663 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
664 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
665 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
666 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
667}
668
670{
671 float32x2_t a33 =
672 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
673 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
674 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
675}
676
678{
679 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
680 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
681 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
682 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
683 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
684}
685
687{
688 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
689 float32_t b2 = vgetq_lane_f32(b, 2);
690 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
691 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
692 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
693}
694
696{
697 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
698 float32_t b2 = vgetq_lane_f32(b, 2);
699 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
700 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
701 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
702}
703
704// Kahan summation for accurate summation of floating-point numbers.
705// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
706FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
707{
708 y -= *c;
709 float t = *sum + y;
710 *c = (t - *sum) - y;
711 *sum = t;
712}
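// Usage sketch (illustrative; 'data' and 'n' are assumed to exist): summing an
// array with a running compensation term.
//
//   float sum = 0.0f, comp = 0.0f;
//   for (int i = 0; i < n; i++)
//       _sse2neon_kadd_f32(&sum, &comp, data[i]);
//
// The compensation term captures the low-order bits lost when each addend is
// folded into 'sum', keeping the running total accurate.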
713
714#if defined(__ARM_FEATURE_CRYPTO) && \
715 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
716// Wraps vmull_p64
717FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
718{
719 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
720 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
721 return vreinterpretq_u64_p128(vmull_p64(a, b));
722}
723#else // ARMv7 polyfill
724// ARMv7 and some AArch64 cores lack vmull_p64, but they do have vmull_p8.
725//
726// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
727// 64-bit->128-bit polynomial multiply.
728//
729// It needs some work and is somewhat slow, but it is still faster than all
730// known scalar methods.
731//
732// Algorithm adapted to C from
733// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
734// from "Fast Software Polynomial Multiplication on ARM Processors Using the
735// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
736// (https://hal.inria.fr/hal-01506572)
737static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
738{
739 poly8x8_t a = vreinterpret_p8_u64(_a);
740 poly8x8_t b = vreinterpret_p8_u64(_b);
741
742 // Masks
743 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
744 vcreate_u8(0x00000000ffffffff));
745 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
746 vcreate_u8(0x0000000000000000));
747
748 // Do the multiplies, rotating with vext to get all combinations
749 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
750 uint8x16_t e =
751 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
752 uint8x16_t f =
753 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
754 uint8x16_t g =
755 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
756 uint8x16_t h =
757 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
758 uint8x16_t i =
759 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
760 uint8x16_t j =
761 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
762 uint8x16_t k =
763 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
764
765 // Add cross products
766 uint8x16_t l = veorq_u8(e, f); // L = E + F
767 uint8x16_t m = veorq_u8(g, h); // M = G + H
768 uint8x16_t n = veorq_u8(i, j); // N = I + J
769
770 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
771 // instructions.
772#if defined(__aarch64__)
773 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
774 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
775 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
776 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
777 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
778 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
779 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
780 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
781#else
782 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
783 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
784 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
785 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
786#endif
787 // t0 = (L) (P0 + P1) << 8
788 // t1 = (M) (P2 + P3) << 16
789 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
790 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
791 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
792
793 // t2 = (N) (P4 + P5) << 24
794 // t3 = (K) (P6 + P7) << 32
795 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
796 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
797 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
798
799 // De-interleave
800#if defined(__aarch64__)
801 uint8x16_t t0 = vreinterpretq_u8_u64(
802 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
803 uint8x16_t t1 = vreinterpretq_u8_u64(
804 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
805 uint8x16_t t2 = vreinterpretq_u8_u64(
806 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
807 uint8x16_t t3 = vreinterpretq_u8_u64(
808 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
809#else
810 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
811 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
812 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
813 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
814#endif
815 // Shift the cross products
816 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
817 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
818 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
819 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
820
821 // Accumulate the products
822 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
823 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
824 uint8x16_t mix = veorq_u8(d, cross1);
825 uint8x16_t r = veorq_u8(mix, cross2);
826 return vreinterpretq_u64_u8(r);
827}
828#endif // ARMv7 polyfill
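// Illustrative check (assumed values, not from the upstream header): carry-less
// multiplication treats its operands as polynomials over GF(2), so
//
//   uint64x1_t pa = vdup_n_u64(3); // x + 1
//   uint64x1_t pb = vdup_n_u64(3); // x + 1
//   uint64x2_t pr = _sse2neon_vmull_p64(pa, pb); // x^2 + 1 -> low lane == 5
//
// whereas an ordinary integer multiply of 3 * 3 would give 9.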
829
830// C equivalent:
831// __m128i _mm_shuffle_epi32_default(__m128i a,
832// __constrange(0, 255) int imm) {
833// __m128i ret;
834// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
835// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
836// return ret;
837// }
838#define _mm_shuffle_epi32_default(a, imm) \
839 __extension__({ \
840 int32x4_t ret; \
841 ret = vmovq_n_s32( \
842 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
843 ret = vsetq_lane_s32( \
844 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
845 ret, 1); \
846 ret = vsetq_lane_s32( \
847 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
848 ret, 2); \
849 ret = vsetq_lane_s32( \
850 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
851 ret, 3); \
852 vreinterpretq_m128i_s32(ret); \
853 })
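// Worked example (illustrative): with imm = _MM_SHUFFLE(0, 1, 2, 3) the macro
// above reverses the four 32-bit lanes:
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1); // lanes {1, 2, 3, 4}
//   __m128i r = _mm_shuffle_epi32_default(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // r holds lanes {4, 3, 2, 1}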
854
855// Takes the upper 64 bits of a and places it in the low end of the result
856// Takes the lower 64 bits of a and places it into the high end of the result.
858{
859 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
860 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
861 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
862}
863
864// takes the lower two 32-bit values from a and swaps them and places in low end
865// of result takes the higher two 32 bit values from a and swaps them and places
866// in high end of result.
868{
869 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
870 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
871 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
872}
873
874// rotates the least significant 32 bits into the most significant 32 bits, and
875// shifts the rest down
877{
880}
881
882// rotates the most significant 32 bits into the least significant 32 bits, and
883// shifts the rest up
885{
888}
889
890// gets the lower 64 bits of a, and places it in the upper 64 bits
891// gets the lower 64 bits of a and places it in the lower 64 bits
893{
894 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
895 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
896}
897
898// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
899// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
901{
902 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
903 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
904 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
905}
906
907// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
908// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
909// places it in the lower 64 bits
911{
912 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
913 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
914}
915
917{
918 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
919 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
920 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
921}
922
924{
925 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
926 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
927 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
928}
929
931{
932 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
933 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
934 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
935}
936
937// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
938// int imm)
939#if defined(__aarch64__)
940#define _mm_shuffle_epi32_splat(a, imm) \
941 __extension__({ \
942 vreinterpretq_m128i_s32( \
943 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
944 })
945#else
946#define _mm_shuffle_epi32_splat(a, imm) \
947 __extension__({ \
948 vreinterpretq_m128i_s32( \
949 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
950 })
951#endif
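// Worked example (illustrative): _mm_shuffle_epi32_splat(a, 2) broadcasts the
// 32-bit lane at index 2 of a into all four lanes of the result, on both the
// AArch64 and the ARMv7 code paths above.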
952
953// NEON does not support a general purpose permute intrinsic
954// Selects four specific single-precision, floating-point values from a and b,
955// based on the mask i.
956//
957// C equivalent:
958// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
959// __constrange(0, 255) int imm) {
960// __m128 ret;
961// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
962// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
963// return ret;
964// }
965//
966// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
967#define _mm_shuffle_ps_default(a, b, imm) \
968 __extension__({ \
969 float32x4_t ret; \
970 ret = vmovq_n_f32( \
971 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
972 ret = vsetq_lane_f32( \
973 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
974 ret, 1); \
975 ret = vsetq_lane_f32( \
976 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
977 ret, 2); \
978 ret = vsetq_lane_f32( \
979 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
980 ret, 3); \
981 vreinterpretq_m128_f32(ret); \
982 })
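// Worked example (illustrative): with imm = _MM_SHUFFLE(1, 0, 3, 2) the macro
// above yields {a[2], a[3], b[0], b[1]}, i.e. the upper half of a followed by
// the lower half of b.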
983
984// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
985// by imm.
986// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
987// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
988// __constrange(0,255) int
989// imm)
990#define _mm_shufflelo_epi16_function(a, imm) \
991 __extension__({ \
992 int16x8_t ret = vreinterpretq_s16_m128i(a); \
993 int16x4_t lowBits = vget_low_s16(ret); \
994 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
995 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
996 1); \
997 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
998 2); \
999 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1000 3); \
1001 vreinterpretq_m128i_s16(ret); \
1002 })
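// Worked example (illustrative): _mm_shufflelo_epi16_function(a,
// _MM_SHUFFLE(0, 1, 2, 3)) reverses the four low 16-bit lanes of a while
// leaving the four high lanes untouched.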
1003
1004// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1005// by imm.
1006// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1007// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1008// __constrange(0,255) int
1009// imm)
1010#define _mm_shufflehi_epi16_function(a, imm) \
1011 __extension__({ \
1012 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1013 int16x4_t highBits = vget_high_s16(ret); \
1014 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1015 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1016 5); \
1017 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1018 6); \
1019 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1020 7); \
1021 vreinterpretq_m128i_s16(ret); \
1022 })
1023
1024/* MMX */
1025
1026// _mm_empty is a no-op on ARM
1027FORCE_INLINE void _mm_empty(void) {}
1028
1029/* SSE */
1030
1031// Adds the four single-precision, floating-point values of a and b.
1032//
1033// r0 := a0 + b0
1034// r1 := a1 + b1
1035// r2 := a2 + b2
1036// r3 := a3 + b3
1037//
1038// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
1040{
1043}
1044
1045// adds the scalar single-precision floating point values of a and b.
1046// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
1048{
1049 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1050 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1051 // the upper values in the result must be the remnants of <a>.
1052 return vreinterpretq_m128_f32(vaddq_f32(a, value));
1053}
1054
1055// Computes the bitwise AND of the four single-precision, floating-point values
1056// of a and b.
1057//
1058// r0 := a0 & b0
1059// r1 := a1 & b1
1060// r2 := a2 & b2
1061// r3 := a3 & b3
1062//
1063// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1065{
1068}
1069
1070// Computes the bitwise AND-NOT of the four single-precision, floating-point
1071// values of a and b.
1072//
1073// r0 := ~a0 & b0
1074// r1 := ~a1 & b1
1075// r2 := ~a2 & b2
1076// r3 := ~a3 & b3
1077//
1078// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1080{
1082 vbicq_s32(vreinterpretq_s32_m128(b),
1083 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1084}
1085
1086// Average packed unsigned 16-bit integers in a and b, and store the results in
1087// dst.
1088//
1089// FOR j := 0 to 3
1090// i := j*16
1091// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1092// ENDFOR
1093//
1094// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
1096{
1097 return vreinterpret_m64_u16(
1098 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1099}
1100
1101// Average packed unsigned 8-bit integers in a and b, and store the results in
1102// dst.
1103//
1104// FOR j := 0 to 7
1105// i := j*8
1106// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1107// ENDFOR
1108//
1109// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
1111{
1112 return vreinterpret_m64_u8(
1113 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1114}
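// Worked example (illustrative): vrhadd computes the rounding average
// (a + b + 1) >> 1, so averaging byte lanes holding 1 and 2 gives 2 rather
// than the truncated 1.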
1115
1116// Compares for equality.
1117// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
1119{
1122}
1123
1124// Compares for equality.
1125// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
1127{
1128 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1129}
1130
1131// Compares for greater than or equal.
1132// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
1134{
1137}
1138
1139// Compares for greater than or equal.
1140// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
1142{
1143 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1144}
1145
1146// Compares for greater than.
1147//
1148// r0 := (a0 > b0) ? 0xffffffff : 0x0
1149// r1 := (a1 > b1) ? 0xffffffff : 0x0
1150// r2 := (a2 > b2) ? 0xffffffff : 0x0
1151// r3 := (a3 > b3) ? 0xffffffff : 0x0
1152//
1153// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
1155{
1158}
1159
1160// Compares for greater than.
1161// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
1163{
1164 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1165}
1166
1167// Compares for less than or equal.
1168//
1169// r0 := (a0 <= b0) ? 0xffffffff : 0x0
1170// r1 := (a1 <= b1) ? 0xffffffff : 0x0
1171// r2 := (a2 <= b2) ? 0xffffffff : 0x0
1172// r3 := (a3 <= b3) ? 0xffffffff : 0x0
1173//
1174// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
1176{
1179}
1180
1181// Compares for less than or equal.
1182// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
1184{
1185 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1186}
1187
1188// Compares for less than
1189// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
1191{
1194}
1195
1196// Compares for less than
1197// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
1199{
1200 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1201}
1202
1203// Compares for inequality.
1204// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
1206{
1207 return vreinterpretq_m128_u32(vmvnq_u32(
1209}
1210
1211// Compares for inequality.
1212// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
1214{
1215 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1216}
1217
1218// Compares for not greater than or equal.
1219// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
1221{
1222 return vreinterpretq_m128_u32(vmvnq_u32(
1224}
1225
1226// Compares for not greater than or equal.
1227// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
1229{
1230 return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1231}
1232
1233// Compares for not greater than.
1234// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
1236{
1237 return vreinterpretq_m128_u32(vmvnq_u32(
1239}
1240
1241// Compares for not greater than.
1242// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1244{
1245 return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1246}
1247
1248// Compares for not less than or equal.
1249// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
1251{
1252 return vreinterpretq_m128_u32(vmvnq_u32(
1254}
1255
1256// Compares for not less than or equal.
1257// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1259{
1260 return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1261}
1262
1263// Compares for not less than.
1264// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
1266{
1267 return vreinterpretq_m128_u32(vmvnq_u32(
1269}
1270
1271// Compares for not less than.
1272// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
1274{
1275 return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1276}
1277
1278// Compares the four 32-bit floats in a and b to check if any values are NaN.
1279// Ordered compare between each value returns true for "orderable" and false for
1280// "not orderable" (NaN).
1281// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1282// also:
1283// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1284// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1286{
1287 // Note: NEON does not have an ordered-compare builtin.
1288 // We need to compare a == a and b == b to check for NaN,
1289 // then AND the results to get the final mask.
1290 uint32x4_t ceqaa =
1292 uint32x4_t ceqbb =
1294 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1295}
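// Worked example (illustrative): if a = {1.0f, NAN, 3.0f, 4.0f} and b holds no
// NaNs, _mm_cmpord_ps(a, b) yields {0xFFFFFFFF, 0, 0xFFFFFFFF, 0xFFFFFFFF},
// because the self-comparison NAN == NAN fails for lane 1.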
1296
1297// Compares for ordered.
1298// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
1300{
1301 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1302}
1303
1304// Compares for unordered.
1305// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
1307{
1308 uint32x4_t f32a =
1310 uint32x4_t f32b =
1312 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1313}
1314
1315// Compares for unordered.
1316// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
1318{
1319 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1320}
1321
1322// Compares the lower single-precision floating point scalar values of a and b
1323// using an equality operation. :
1324// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1326{
1327 uint32x4_t a_eq_b =
1329 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1330}
1331
1332// Compares the lower single-precision floating point scalar values of a and b
1333// using a greater than or equal operation. :
1334// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1336{
1337 uint32x4_t a_ge_b =
1339 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1340}
1341
1342// Compares the lower single-precision floating point scalar values of a and b
1343// using a greater than operation. :
1344// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1346{
1347 uint32x4_t a_gt_b =
1349 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1350}
1351
1352// Compares the lower single-precision floating point scalar values of a and b
1353// using a less than or equal operation. :
1354// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1356{
1357 uint32x4_t a_le_b =
1359 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1360}
1361
1362// Compares the lower single-precision floating point scalar values of a and b
1363// using a less than operation. :
1364// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
1365// note: the documentation on MSDN is incorrect! If either of the values is
1366// NaN, the docs say you will get a one, but in fact it will return a zero!!
1368{
1369 uint32x4_t a_lt_b =
1371 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1372}
1373
1374// Compares the lower single-precision floating point scalar values of a and b
1375// using an inequality operation. :
1376// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1378{
1379 return !_mm_comieq_ss(a, b);
1380}
1381
1382// Convert packed signed 32-bit integers in b to packed single-precision
1383// (32-bit) floating-point elements, store the results in the lower 2 elements
1384// of dst, and copy the upper 2 packed elements from a to the upper elements of
1385// dst.
1386//
1387// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1388// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1389// dst[95:64] := a[95:64]
1390// dst[127:96] := a[127:96]
1391//
1392// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
1394{
1396 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1397 vget_high_f32(vreinterpretq_f32_m128(a))));
1398}
1399
1400// Convert packed single-precision (32-bit) floating-point elements in a to
1401// packed 32-bit integers, and store the results in dst.
1402//
1403// FOR j := 0 to 1
1404// i := 32*j
1405// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1406// ENDFOR
1407//
1408// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
1410{
1411#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1412 return vreinterpret_m64_s32(
1413 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1414#else
1415 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1417#endif
1418}
1419
1420// Convert the signed 32-bit integer b to a single-precision (32-bit)
1421// floating-point element, store the result in the lower element of dst, and
1422// copy the upper 3 packed elements from a to the upper elements of dst.
1423//
1424// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1425// dst[127:32] := a[127:32]
1426//
1427// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
1429{
1431 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1432}
1433
1434// Convert the lower single-precision (32-bit) floating-point element in a to a
1435// 32-bit integer, and store the result in dst.
1436// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
1438{
1439#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1440 return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1441 0);
1442#else
1443 float32_t data = vgetq_lane_f32(
1445 return (int32_t) data;
1446#endif
1447}
1448
1449// Convert packed 16-bit integers in a to packed single-precision (32-bit)
1450// floating-point elements, and store the results in dst.
1451//
1452// FOR j := 0 to 3
1453// i := j*16
1454// m := j*32
1455// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1456// ENDFOR
1457//
1458// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
1460{
1462 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1463}
1464
1465// Convert packed 32-bit integers in b to packed single-precision (32-bit)
1466// floating-point elements, store the results in the lower 2 elements of dst,
1467// and copy the upper 2 packed elements from a to the upper elements of dst.
1468//
1469// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1470// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1471// dst[95:64] := a[95:64]
1472// dst[127:96] := a[127:96]
1473//
1474// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
1476{
1478 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1479 vget_high_f32(vreinterpretq_f32_m128(a))));
1480}
1481
1482// Convert packed signed 32-bit integers in a to packed single-precision
1483// (32-bit) floating-point elements, store the results in the lower 2 elements
1484// of dst, then convert the packed signed 32-bit integers in b to
1485// single-precision (32-bit) floating-point element, and store the results in
1486// the upper 2 elements of dst.
1487//
1488// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1489// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1490// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1491// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1492//
1493// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
1495{
1496 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1497 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1498}
1499
1500// Convert the lower packed 8-bit integers in a to packed single-precision
1501// (32-bit) floating-point elements, and store the results in dst.
1502//
1503// FOR j := 0 to 3
1504// i := j*8
1505// m := j*32
1506// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1507// ENDFOR
1508//
1509// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
1511{
1512 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1513 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1514}
1515
1516// Convert packed single-precision (32-bit) floating-point elements in a to
1517// packed 16-bit integers, and store the results in dst. Note: this intrinsic
1518// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1519// 0x7FFFFFFF.
1520//
1521// FOR j := 0 to 3
1522// i := 16*j
1523// k := 32*j
1524// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
1525// dst[i+15:i] := 0x7FFF
1526// ELSE
1527// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
1528// FI
1529// ENDFOR
1530//
1531// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
1533{
1534 const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
1535 const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
1536 const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1537 const __m128i maxMask = _mm_castps_si128(
1538 _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
1539 const __m128i betweenMask = _mm_castps_si128(
1540 _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
1541 const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1543 __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
1544 __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
1545 __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1546 __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1547 return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
1548}
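// Worked example (illustrative):
// _mm_cvtps_pi16(_mm_setr_ps(1.0f, -2.0f, 40000.0f, -40000.0f)) yields
// {1, -2, 32767, -32768}: in-range values are converted directly, while
// out-of-range values saturate to INT16_MAX / INT16_MIN.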
1549
1550// Convert packed single-precision (32-bit) floating-point elements in a to
1551// packed 32-bit integers, and store the results in dst.
1552//
1553// FOR j := 0 to 1
1554// i := 32*j
1555// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1556// ENDFOR
1557//
1558// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
1559#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1560
1561// Convert packed single-precision (32-bit) floating-point elements in a to
1562// packed 8-bit integers, and store the results in lower 4 elements of dst.
1563// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1564// between 0x7F and 0x7FFFFFFF.
1565//
1566// FOR j := 0 to 3
1567// i := 8*j
1568// k := 32*j
1569// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
1570// dst[i+7:i] := 0x7F
1571// ELSE
1572// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
1573// FI
1574// ENDFOR
1575//
1576// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
1578{
1579 const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
1580 const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
1581 const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1582 const __m128i maxMask = _mm_castps_si128(
1583 _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
1584 const __m128i betweenMask = _mm_castps_si128(
1585 _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
1586 const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1588 __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
1589 __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
1590 __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1591 __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1592 int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
1593 int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1594 static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
1595 int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1596
1597 return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
1598}
1599
1600// Convert packed unsigned 16-bit integers in a to packed single-precision
1601// (32-bit) floating-point elements, and store the results in dst.
1602//
1603// FOR j := 0 to 3
1604// i := j*16
1605// m := j*32
1606// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
1607// ENDFOR
1608//
1609// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
1610FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1611{
1612 return vreinterpretq_m128_f32(
1613 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1614}
1615
1616// Convert the lower packed unsigned 8-bit integers in a to packed
1617// single-precision (32-bit) floating-point elements, and store the results in
1618// dst.
1619//
1620// FOR j := 0 to 3
1621// i := j*8
1622// m := j*32
1623// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
1624// ENDFOR
1625//
1626// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
1628{
1629 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1630 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1631}
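
// Usage sketch (illustrative addition, not part of the upstream sources):
// widen four 8-bit pixel values to floats and normalize them to [0, 1].
// Only lanes 0..3 of the loaded bytes are consumed by _mm_cvtpu8_ps; the
// helper name is hypothetical.
static inline __m128 u8x4_to_unit_float_example(const uint8_t px[8])
{
    __m64 bytes = vreinterpret_m64_u8(vld1_u8(px));
    float32x4_t f = vreinterpretq_f32_m128(_mm_cvtpu8_ps(bytes));
    return vreinterpretq_m128_f32(vmulq_f32(f, vdupq_n_f32(1.0f / 255.0f)));
}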
1632
1633// Convert the signed 32-bit integer b to a single-precision (32-bit)
1634// floating-point element, store the result in the lower element of dst, and
1635// copy the upper 3 packed elements from a to the upper elements of dst.
1636//
1637// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1638// dst[127:32] := a[127:32]
1639//
1640// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
1641#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1642
1643// Convert the signed 64-bit integer b to a single-precision (32-bit)
1644// floating-point element, store the result in the lower element of dst, and
1645// copy the upper 3 packed elements from a to the upper elements of dst.
1646//
1647// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
1648// dst[127:32] := a[127:32]
1649//
1650// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
1651FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1652{
1653 return vreinterpretq_m128_f32(
1654 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1655}
1656
1657// Copy the lower single-precision (32-bit) floating-point element of a to dst.
1658//
1659// dst[31:0] := a[31:0]
1660//
1661// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
1663{
1664 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1665}
1666
1667// Convert the lower single-precision (32-bit) floating-point element in a to a
1668// 32-bit integer, and store the result in dst.
1669//
1670// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
1671//
1672// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
1673#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1674
1675// Convert the lower single-precision (32-bit) floating-point element in a to a
1676// 64-bit integer, and store the result in dst.
1677//
1678// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
1679//
1680// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
1682{
1683#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1684 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1685#else
1686 float32_t data = vgetq_lane_f32(
1687 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1688 return (int64_t) data;
1689#endif
1690}
1691
1692// Convert packed single-precision (32-bit) floating-point elements in a to
1693// packed 32-bit integers with truncation, and store the results in dst.
1694//
1695// FOR j := 0 to 1
1696// i := 32*j
1697// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1698// ENDFOR
1699//
1700// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
1702{
1703 return vreinterpret_m64_s32(
1704 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1705}
1706
1707// Convert the lower single-precision (32-bit) floating-point element in a to a
1708// 32-bit integer with truncation, and store the result in dst.
1709//
1710// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1711//
1712// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
1714{
1715 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1716}
1717
1718// Convert packed single-precision (32-bit) floating-point elements in a to
1719// packed 32-bit integers with truncation, and store the results in dst.
1720//
1721// FOR j := 0 to 1
1722// i := 32*j
1723// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1724// ENDFOR
1725//
1726// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
1727#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1728
1729// Convert the lower single-precision (32-bit) floating-point element in a to a
1730// 32-bit integer with truncation, and store the result in dst.
1731//
1732// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1733//
1734// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
1735#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1736
1737// Convert the lower single-precision (32-bit) floating-point element in a to a
1738// 64-bit integer with truncation, and store the result in dst.
1739//
1740// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
1741//
1742// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
1744{
1745 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1746}
1747
1748// Divides the four single-precision, floating-point values of a and b.
1749//
1750// r0 := a0 / b0
1751// r1 := a1 / b1
1752// r2 := a2 / b2
1753// r3 := a3 / b3
1754//
1755// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
1757{
1758#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1759 return vreinterpretq_m128_f32(
1760 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1761#else
1762 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1763 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1764#if SSE2NEON_PRECISE_DIV
1765 // Additional Newton-Raphson iteration for accuracy
1766 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1767#endif
1768 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1769#endif
1770}
1771
1772// Divides the scalar single-precision floating point value of a by b.
1773// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
1775{
1776 float32_t value =
1777 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1779 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1780}
1781
1782// Extract a 16-bit integer from a, selected with imm8, and store the result in
1783// the lower element of dst.
1784// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
1785#define _mm_extract_pi16(a, imm) \
1786 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1787
1788// Free aligned memory that was allocated with _mm_malloc.
1789// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
1790FORCE_INLINE void _mm_free(void *addr)
1791{
1792 free(addr);
1793}
1794
1795// Macro: Get the flush zero bits from the MXCSR control and status register.
1796// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1797// _MM_FLUSH_ZERO_OFF
1798// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
1800{
1801 union {
1802 fpcr_bitfield field;
1803#if defined(__aarch64__)
1804 uint64_t value;
1805#else
1806 uint32_t value;
1807#endif
1808 } r;
1809
1810#if defined(__aarch64__)
1811 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
1812#else
1813 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1814#endif
1815
1816 return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1817}
1818
1819// Macro: Get the rounding mode bits from the MXCSR control and status register.
1820// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1821// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1822// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
1824{
1825 union {
1826 fpcr_bitfield field;
1827#if defined(__aarch64__)
1828 uint64_t value;
1829#else
1830 uint32_t value;
1831#endif
1832 } r;
1833
1834#if defined(__aarch64__)
1835 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
1836#else
1837 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1838#endif
1839
1840 if (r.field.bit22) {
1841 return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1842 } else {
1843 return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1844 }
1845}
1846
1847// Copy a to dst, and insert the 16-bit integer i into dst at the location
1848// specified by imm8.
1849// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
1850#define _mm_insert_pi16(a, b, imm) \
1851 __extension__({ \
1852 vreinterpret_m64_s16( \
1853 vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1854 })
1855
1856// Loads four single-precision, floating-point values.
1857// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
1859{
1860 return vreinterpretq_m128_f32(vld1q_f32(p));
1861}
1862
1863// Load a single-precision (32-bit) floating-point element from memory into all
1864// elements of dst.
1865//
1866// dst[31:0] := MEM[mem_addr+31:mem_addr]
1867// dst[63:32] := MEM[mem_addr+31:mem_addr]
1868// dst[95:64] := MEM[mem_addr+31:mem_addr]
1869// dst[127:96] := MEM[mem_addr+31:mem_addr]
1870//
1871// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
1872#define _mm_load_ps1 _mm_load1_ps
1873
1874// Loads a single-precision, floating-point value into the low word and
1875// clears the upper three words.
1876// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
1878{
1879 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1880}
1881
1882// Loads a single single-precision, floating-point value, copying it into all
1883// four words
1884// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
1886{
1887 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1888}
1889
1890// Sets the upper two single-precision, floating-point values with 64
1891// bits of data loaded from the address p; the lower two values are passed
1892// through from a.
1893//
1894// r0 := a0
1895// r1 := a1
1896// r2 := *p0
1897// r3 := *p1
1898//
1899// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
1901{
1903 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1904}
1905
1906// Sets the lower two single-precision, floating-point values with 64
1907// bits of data loaded from the address p; the upper two values are passed
1908// through from a.
1909//
1910// Return Value
1911// r0 := *p0
1912// r1 := *p1
1913// r2 := a2
1914// r3 := a3
1915//
1916// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
1918{
1920 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1921}
1922
1923// Load 4 single-precision (32-bit) floating-point elements from memory into dst
1924// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1925// general-protection exception may be generated.
1926//
1927// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1928// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1929// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1930// dst[127:96] := MEM[mem_addr+31:mem_addr]
1931//
1932// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
1934{
1935 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1936 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1937}
1938
1939// Loads four single-precision, floating-point values.
1940// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
1942{
1943 // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
1944 // equivalent for neon
1945 return vreinterpretq_m128_f32(vld1q_f32(p));
1946}
1947
1948// Load unaligned 16-bit integer from memory into the first element of dst.
1949//
1950// dst[15:0] := MEM[mem_addr+15:mem_addr]
1951// dst[MAX:16] := 0
1952//
1953// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
1955{
1957 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1958}
1959
1960// Load unaligned 64-bit integer from memory into the first element of dst.
1961//
1962// dst[63:0] := MEM[mem_addr+63:mem_addr]
1963// dst[MAX:64] := 0
1964//
1965// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
1967{
1969 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1970}
1971
1972// Allocate aligned blocks of memory.
1973// https://software.intel.com/en-us/
1974// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
1975FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1976{
1977 void *ptr;
1978 if (align == 1)
1979 return malloc(size);
1980 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1981 align = sizeof(void *);
1982 if (!posix_memalign(&ptr, align, size))
1983 return ptr;
1984 return NULL;
1985}
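
// Usage sketch (illustrative addition, not part of the upstream sources):
// allocate a 16-byte aligned scratch buffer, use it for full-width stores,
// and release it with the matching _mm_free(). Buffer size and helper name
// are arbitrary for the example.
static inline void aligned_scratch_example(void)
{
    float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
    if (buf) {
        vst1q_f32(buf, vdupq_n_f32(0.0f)); // 16-byte alignment is guaranteed
        _mm_free(buf);
    }
}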
1986
1987// Conditionally store 8-bit integer elements from a into memory using mask
1988// (elements are not stored when the highest bit is not set in the corresponding
1989// element) and a non-temporal memory hint.
1990// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
1991FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1992{
1993 int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1994 __m128 b = _mm_load_ps((const float *) mem_addr);
1995 int8x8_t masked =
1996 vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1997 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1998 vst1_s8((int8_t *) mem_addr, masked);
1999}
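
// Usage sketch (illustrative addition, not part of the upstream sources):
// overwrite only the first four bytes of a buffer, leaving the rest intact.
// The buffer is 16 bytes because the fallback above reads 16 bytes from
// mem_addr before blending. Helper name is hypothetical.
static inline void masked_store_example(char buf[16])
{
    __m64 data = vreinterpret_m64_s8(vdup_n_s8(0x55));
    static const uint8_t msb[8] = {0x80, 0x80, 0x80, 0x80, 0, 0, 0, 0};
    __m64 mask = vreinterpret_m64_u8(vld1_u8(msb));
    _mm_maskmove_si64(data, mask, buf); // bytes 4..7 of buf are preserved
}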
2000
2001// Conditionally store 8-bit integer elements from a into memory using mask
2002// (elements are not stored when the highest bit is not set in the corresponding
2003// element) and a non-temporal memory hint.
2004// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
2005#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
2006
2007// Compare packed signed 16-bit integers in a and b, and store packed maximum
2008// values in dst.
2009//
2010// FOR j := 0 to 3
2011// i := j*16
2012// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
2013// ENDFOR
2014//
2015// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
2017{
2018 return vreinterpret_m64_s16(
2020}
2021
2022// Computes the maximums of the four single-precision, floating-point values of
2023// a and b.
2024// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
2026{
2027#if SSE2NEON_PRECISE_MINMAX
2028 float32x4_t _a = vreinterpretq_f32_m128(a);
2029 float32x4_t _b = vreinterpretq_f32_m128(b);
2030 return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
2031#else
2032 return vreinterpretq_m128_f32(
2033 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2034#endif
2035}
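
// Illustrative note (addition, not part of the upstream sources): with
// SSE2NEON_PRECISE_MINMAX enabled, the bit-select path above reproduces the
// x86 rule that max(a, b) yields b whenever a > b is false, including for
// NaN inputs, whereas plain vmaxq_f32 propagates the NaN. A minimal check,
// assuming <math.h> is available for NAN; the helper name is hypothetical.
static inline float max_ps_nan_convention_example(void)
{
    __m128 a = vreinterpretq_m128_f32(vdupq_n_f32(NAN));
    __m128 b = vreinterpretq_m128_f32(vdupq_n_f32(1.0f));
    // Precise path: 1.0f (matches x86). Default path: NaN.
    return vgetq_lane_f32(vreinterpretq_f32_m128(_mm_max_ps(a, b)), 0);
}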
2036
2037// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2038// values in dst.
2039//
2040// FOR j := 0 to 7
2041// i := j*8
2042// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
2043// ENDFOR
2044//
2045// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
2047{
2048 return vreinterpret_m64_u8(
2050}
2051
2052// Computes the maximum of the two lower scalar single-precision floating point
2053// values of a and b.
2054// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
2056{
2057 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2059 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2060}
2061
2062// Compare packed signed 16-bit integers in a and b, and store packed minimum
2063// values in dst.
2064//
2065// FOR j := 0 to 3
2066// i := j*16
2067// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
2068// ENDFOR
2069//
2070// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
2072{
2073 return vreinterpret_m64_s16(
2075}
2076
2077// Computes the minima of the four single-precision, floating-point values of a
2078// and b.
2079// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
2081{
2082#if SSE2NEON_PRECISE_MINMAX
2083 float32x4_t _a = vreinterpretq_f32_m128(a);
2084 float32x4_t _b = vreinterpretq_f32_m128(b);
2085 return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2086#else
2087 return vreinterpretq_m128_f32(
2088 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2089#endif
2090}
2091
2092// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2093// values in dst.
2094//
2095// FOR j := 0 to 7
2096// i := j*8
2097// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
2098// ENDFOR
2099//
2100// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
2102{
2103 return vreinterpret_m64_u8(
2105}
2106
2107// Computes the minimum of the two lower scalar single-precision floating point
2108// values of a and b.
2109// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
2111{
2112 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2114 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2115}
2116
2117// Sets the low word to the single-precision, floating-point value of b
2118// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
2120{
2122 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2123 vreinterpretq_f32_m128(a), 0));
2124}
2125
2126// Moves the upper two values of B into the lower two values of A.
2127//
2128// r3 := a3
2129// r2 := a2
2130// r1 := b3
2131// r0 := b2
2133{
2134 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2135 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2136 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2137}
2138
2139// Moves the lower two values of B into the upper two values of A.
2140//
2141// r3 := b1
2142// r2 := b0
2143// r1 := a1
2144// r0 := a0
2146{
2147 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2148 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2149 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2150}
2151
2152// Create mask from the most significant bit of each 8-bit element in a, and
2153// store the result in dst.
2154// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
2156{
2157 uint8x8_t input = vreinterpret_u8_m64(a);
2158#if defined(__aarch64__)
2159 static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2160 uint8x8_t tmp = vshr_n_u8(input, 7);
2161 return vaddv_u8(vshl_u8(tmp, shift));
2162#else
2163 // Refer to the implementation of `_mm_movemask_epi8`
2164 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2165 uint32x2_t paired16 =
2166 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2167 uint8x8_t paired32 =
2168 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2169 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2170#endif
2171}
2172
2173// NEON does not provide this method
2174// Creates a 4-bit mask from the most significant bits of the four
2175// single-precision, floating-point values.
2176// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2178{
2179 uint32x4_t input = vreinterpretq_u32_m128(a);
2180#if defined(__aarch64__)
2181 static const int32x4_t shift = {0, 1, 2, 3};
2182 uint32x4_t tmp = vshrq_n_u32(input, 31);
2183 return vaddvq_u32(vshlq_u32(tmp, shift));
2184#else
2185 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2186 // Shift out everything but the sign bits with a 32-bit unsigned shift
2187 // right.
2188 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2189 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2190 uint8x16_t paired =
2191 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2192 // Extract the result.
2193 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2194#endif
2195}
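
// Usage sketch (illustrative addition, not part of the upstream sources):
// collapse a packed comparison into a 4-bit mask so scalar code can branch
// on it. Helper name is hypothetical.
static inline int any_lane_negative_example(__m128 v)
{
    __m128 zero = vreinterpretq_m128_f32(vdupq_n_f32(0.0f));
    return _mm_movemask_ps(_mm_cmplt_ps(v, zero)) != 0;
}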
2196
2197// Multiplies the four single-precision, floating-point values of a and b.
2198//
2199// r0 := a0 * b0
2200// r1 := a1 * b1
2201// r2 := a2 * b2
2202// r3 := a3 * b3
2203//
2204// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2205FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2206{
2207 return vreinterpretq_m128_f32(
2208 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2209}
2210
2211// Multiply the lower single-precision (32-bit) floating-point element in a and
2212// b, store the result in the lower element of dst, and copy the upper 3 packed
2213// elements from a to the upper elements of dst.
2214//
2215// dst[31:0] := a[31:0] * b[31:0]
2216// dst[127:32] := a[127:32]
2217//
2218// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
2220{
2221 return _mm_move_ss(a, _mm_mul_ps(a, b));
2222}
2223
2224// Multiply the packed unsigned 16-bit integers in a and b, producing
2225// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2226// integers in dst.
2227// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
2229{
2230 return vreinterpret_m64_u16(vshrn_n_u32(
2231 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2232}
2233
2234// Computes the bitwise OR of the four single-precision, floating-point values
2235// of a and b.
2236// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
2238{
2241}
2242
2243// Average packed unsigned 8-bit integers in a and b, and store the results in
2244// dst.
2245//
2246// FOR j := 0 to 7
2247// i := j*8
2248// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2249// ENDFOR
2250//
2251// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2252#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2253
2254// Average packed unsigned 16-bit integers in a and b, and store the results in
2255// dst.
2256//
2257// FOR j := 0 to 3
2258// i := j*16
2259// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2260// ENDFOR
2261//
2262// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2263#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2264
2265// Extract a 16-bit integer from a, selected with imm8, and store the result in
2266// the lower element of dst.
2267// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
2268#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2269
2270// Copy a to dst, and insert the 16-bit integer i into dst at the location
2271// specified by imm8.
2272// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
2273#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2274
2275// Compare packed signed 16-bit integers in a and b, and store packed maximum
2276// values in dst.
2277// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
2278#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2279
2280// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2281// values in dst.
2282// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
2283#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2284
2285// Compare packed signed 16-bit integers in a and b, and store packed minimum
2286// values in dst.
2287// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
2288#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2289
2290// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2291// values in dst.
2292// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
2293#define _m_pminub(a, b) _mm_min_pu8(a, b)
2294
2295// Create mask from the most significant bit of each 8-bit element in a, and
2296// store the result in dst.
2297// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
2298#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2299
2300// Multiply the packed unsigned 16-bit integers in a and b, producing
2301// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2302// integers in dst.
2303// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2304#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2305
2306// Loads one cache line of data from address p to a location closer to the
2307// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
2308FORCE_INLINE void _mm_prefetch(const void *p, int i)
2309{
2310 (void) i;
2311 __builtin_prefetch(p);
2312}
2313
2314// Compute the absolute differences of packed unsigned 8-bit integers in a and
2315// b, then horizontally sum each consecutive 8 differences to produce four
2316// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2317// 16 bits of dst.
2318// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
2319#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2320
2321// Shuffle 16-bit integers in a using the control in imm8, and store the results
2322// in dst.
2323// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
2324#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2325
2326// Compute the approximate reciprocal of packed single-precision (32-bit)
2327// floating-point elements in a, and store the results in dst. The maximum
2328// relative error for this approximation is less than 1.5*2^-12.
2329// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
2331{
2332 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2333 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2334#if SSE2NEON_PRECISE_DIV
2335 // Additional Newton-Raphson iteration for accuracy
2336 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2337#endif
2338 return vreinterpretq_m128_f32(recip);
2339}
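
// Usage sketch (illustrative addition, not part of the upstream sources):
// refine the ~12-bit reciprocal estimate with one extra Newton-Raphson step
// at the call site, r = r * (2 - d * r). Helper name is hypothetical.
static inline __m128 refined_reciprocal_example(__m128 d)
{
    float32x4_t r = vreinterpretq_f32_m128(_mm_rcp_ps(d));
    r = vmulq_f32(r, vsubq_f32(vdupq_n_f32(2.0f),
                               vmulq_f32(vreinterpretq_f32_m128(d), r)));
    return vreinterpretq_m128_f32(r);
}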
2340
2341// Compute the approximate reciprocal of the lower single-precision (32-bit)
2342// floating-point element in a, store the result in the lower element of dst,
2343// and copy the upper 3 packed elements from a to the upper elements of dst. The
2344// maximum relative error for this approximation is less than 1.5*2^-12.
2345//
2346// dst[31:0] := (1.0 / a[31:0])
2347// dst[127:32] := a[127:32]
2348//
2349// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
2351{
2352 return _mm_move_ss(a, _mm_rcp_ps(a));
2353}
2354
2355// Computes the approximations of the reciprocal square roots of the four
2356// single-precision floating point values of in.
2357// The current precision is 1% error.
2358// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
2360{
2361 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2362#if SSE2NEON_PRECISE_SQRT
2363 // Additional Newton-Raphson iteration for accuracy
2364 out = vmulq_f32(
2365 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2366 out = vmulq_f32(
2367 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2368#endif
2369 return vreinterpretq_m128_f32(out);
2370}
2371
2372// Compute the approximate reciprocal square root of the lower single-precision
2373// (32-bit) floating-point element in a, store the result in the lower element
2374// of dst, and copy the upper 3 packed elements from a to the upper elements of
2375// dst.
2376// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
2378{
2379 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2380}
2381
2382// Compute the absolute differences of packed unsigned 8-bit integers in a and
2383// b, then horizontally sum each consecutive 8 differences to produce four
2384// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2385// 16 bits of dst.
2386// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
2388{
2389 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2390 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2391 return vreinterpret_m64_u16(
2392 vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2393}
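
// Usage sketch (illustrative addition, not part of the upstream sources):
// sum of absolute differences between two 8-byte blocks, a common motion
// estimation building block. Helper name is hypothetical.
static inline uint16_t sad_8bytes_example(const uint8_t *p, const uint8_t *q)
{
    __m64 a = vreinterpret_m64_u8(vld1_u8(p));
    __m64 b = vreinterpret_m64_u8(vld1_u8(q));
    return vget_lane_u16(vreinterpret_u16_m64(_mm_sad_pu8(a, b)), 0);
}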
2394
2395// Macro: Set the flush zero bits of the MXCSR control and status register to
2396// the value in unsigned 32-bit integer a. The flush zero may contain any of the
2397// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2398// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
2400{
2401 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2402 // regardless of the value of the FZ bit.
2403 union {
2404 fpcr_bitfield field;
2405#if defined(__aarch64__)
2406 uint64_t value;
2407#else
2408 uint32_t value;
2409#endif
2410 } r;
2411
2412#if defined(__aarch64__)
2413 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
2414#else
2415 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2416#endif
2417
2418 r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2419
2420#if defined(__aarch64__)
2421 __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
2422#else
2423 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2424#endif
2425}
2426
2427// Sets the four single-precision, floating-point values to the four inputs.
2428// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
2429FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2430{
2431 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2432 return vreinterpretq_m128_f32(vld1q_f32(data));
2433}
2434
2435// Sets the four single-precision, floating-point values to w.
2436// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2438{
2439 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2440}
2441
2442// Macro: Set the rounding mode bits of the MXCSR control and status register to
2443// the value in unsigned 32-bit integer a. The rounding mode may contain any of
2444// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2445// _MM_ROUND_TOWARD_ZERO
2446// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
2448{
2449 union {
2450 fpcr_bitfield field;
2451#if defined(__aarch64__)
2452 uint64_t value;
2453#else
2454 uint32_t value;
2455#endif
2456 } r;
2457
2458#if defined(__aarch64__)
2459 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
2460#else
2461 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2462#endif
2463
2464 switch (rounding) {
2465 case _MM_ROUND_TOWARD_ZERO:
2466 r.field.bit22 = 1;
2467 r.field.bit23 = 1;
2468 break;
2469 case _MM_ROUND_DOWN:
2470 r.field.bit22 = 0;
2471 r.field.bit23 = 1;
2472 break;
2473 case _MM_ROUND_UP:
2474 r.field.bit22 = 1;
2475 r.field.bit23 = 0;
2476 break;
2477 default: //_MM_ROUND_NEAREST
2478 r.field.bit22 = 0;
2479 r.field.bit23 = 0;
2480 }
2481
2482#if defined(__aarch64__)
2483 __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
2484#else
2485 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2486#endif
2487}
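
// Usage sketch (illustrative addition, not part of the upstream sources):
// temporarily switch to round-toward-zero and restore the previous mode
// afterwards. Helper name is hypothetical.
static inline void rounding_mode_scope_example(void)
{
    unsigned int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    /* ... conversions that should truncate go here ... */
    _MM_SET_ROUNDING_MODE(saved);
}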
2488
2489// Copy single-precision (32-bit) floating-point element a to the lower element
2490// of dst, and zero the upper 3 elements.
2491// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
2493{
2494 float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
2495 return vreinterpretq_m128_f32(vld1q_f32(data));
2496}
2497
2498// Sets the four single-precision, floating-point values to w.
2499//
2500// r0 := r1 := r2 := r3 := w
2501//
2502// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2504{
2505 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2506}
2507
2508// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2509FORCE_INLINE void _mm_setcsr(unsigned int a)
2510{
2511 _MM_SET_ROUNDING_MODE(a);
2512}
2513
2514// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2516{
2517 return _MM_GET_ROUNDING_MODE();
2518}
2519
2520// Sets the four single-precision, floating-point values to the four inputs in
2521// reverse order.
2522// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
2523FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2524{
2525 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2526 return vreinterpretq_m128_f32(vld1q_f32(data));
2527}
2528
2529// Clears the four single-precision, floating-point values.
2530// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
2532{
2533 return vreinterpretq_m128_f32(vdupq_n_f32(0));
2534}
2535
2536// Shuffle 16-bit integers in a using the control in imm8, and store the results
2537// in dst.
2538// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
2539#if __has_builtin(__builtin_shufflevector)
2540#define _mm_shuffle_pi16(a, imm) \
2541 __extension__({ \
2542 vreinterpret_m64_s16(__builtin_shufflevector( \
2543 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2544 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2545 })
2546#else
2547#define _mm_shuffle_pi16(a, imm) \
2548 __extension__({ \
2549 int16x4_t ret; \
2550 ret = \
2551 vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2552 ret = vset_lane_s16( \
2553 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2554 1); \
2555 ret = vset_lane_s16( \
2556 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2557 2); \
2558 ret = vset_lane_s16( \
2559 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2560 3); \
2561 vreinterpret_m64_s16(ret); \
2562 })
2563#endif
2564
2565// Guarantees that every preceding store is globally visible before any
2566// subsequent store.
2567// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
2569{
2570 __sync_synchronize();
2571}
2572
2573// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2574// int imm)
2575#if __has_builtin(__builtin_shufflevector)
2576#define _mm_shuffle_ps(a, b, imm) \
2577 __extension__({ \
2578 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2579 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2580 float32x4_t _shuf = __builtin_shufflevector( \
2581 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2582 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2583 vreinterpretq_m128_f32(_shuf); \
2584 })
2585#else // generic
2586#define _mm_shuffle_ps(a, b, imm) \
2587 __extension__({ \
2588 __m128 ret; \
2589 switch (imm) { \
2590 case _MM_SHUFFLE(1, 0, 3, 2): \
2591 ret = _mm_shuffle_ps_1032((a), (b)); \
2592 break; \
2593 case _MM_SHUFFLE(2, 3, 0, 1): \
2594 ret = _mm_shuffle_ps_2301((a), (b)); \
2595 break; \
2596 case _MM_SHUFFLE(0, 3, 2, 1): \
2597 ret = _mm_shuffle_ps_0321((a), (b)); \
2598 break; \
2599 case _MM_SHUFFLE(2, 1, 0, 3): \
2600 ret = _mm_shuffle_ps_2103((a), (b)); \
2601 break; \
2602 case _MM_SHUFFLE(1, 0, 1, 0): \
2603 ret = _mm_movelh_ps((a), (b)); \
2604 break; \
2605 case _MM_SHUFFLE(1, 0, 0, 1): \
2606 ret = _mm_shuffle_ps_1001((a), (b)); \
2607 break; \
2608 case _MM_SHUFFLE(0, 1, 0, 1): \
2609 ret = _mm_shuffle_ps_0101((a), (b)); \
2610 break; \
2611 case _MM_SHUFFLE(3, 2, 1, 0): \
2612 ret = _mm_shuffle_ps_3210((a), (b)); \
2613 break; \
2614 case _MM_SHUFFLE(0, 0, 1, 1): \
2615 ret = _mm_shuffle_ps_0011((a), (b)); \
2616 break; \
2617 case _MM_SHUFFLE(0, 0, 2, 2): \
2618 ret = _mm_shuffle_ps_0022((a), (b)); \
2619 break; \
2620 case _MM_SHUFFLE(2, 2, 0, 0): \
2621 ret = _mm_shuffle_ps_2200((a), (b)); \
2622 break; \
2623 case _MM_SHUFFLE(3, 2, 0, 2): \
2624 ret = _mm_shuffle_ps_3202((a), (b)); \
2625 break; \
2626 case _MM_SHUFFLE(3, 2, 3, 2): \
2627 ret = _mm_movehl_ps((b), (a)); \
2628 break; \
2629 case _MM_SHUFFLE(1, 1, 3, 3): \
2630 ret = _mm_shuffle_ps_1133((a), (b)); \
2631 break; \
2632 case _MM_SHUFFLE(2, 0, 1, 0): \
2633 ret = _mm_shuffle_ps_2010((a), (b)); \
2634 break; \
2635 case _MM_SHUFFLE(2, 0, 0, 1): \
2636 ret = _mm_shuffle_ps_2001((a), (b)); \
2637 break; \
2638 case _MM_SHUFFLE(2, 0, 3, 2): \
2639 ret = _mm_shuffle_ps_2032((a), (b)); \
2640 break; \
2641 default: \
2642 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2643 break; \
2644 } \
2645 ret; \
2646 })
2647#endif
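
// Usage sketch (illustrative addition, not part of the upstream sources):
// the imm8 is built with _MM_SHUFFLE(z, y, x, w); the low half of dst takes
// two lanes from a and the high half takes two lanes from b. Helper name is
// hypothetical.
static inline __m128 broadcast_lane0_example(__m128 a)
{
    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // all lanes = a[0]
}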
2648
2649// Computes the approximations of square roots of the four single-precision,
2650// floating-point values of a. First computes reciprocal square roots and then
2651// reciprocals of the four values.
2652//
2653// r0 := sqrt(a0)
2654// r1 := sqrt(a1)
2655// r2 := sqrt(a2)
2656// r3 := sqrt(a3)
2657//
2658// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2660{
2661#if SSE2NEON_PRECISE_SQRT
2662 float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2663
2664 // Test for vrsqrteq_f32(0) -> positive infinity case.
2665 // Change to zero, so that s * 1/sqrt(s) result is zero too.
2666 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2667 const uint32x4_t div_by_zero =
2668 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2669 recip = vreinterpretq_f32_u32(
2670 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2671
2672 // Additional Newton-Raphson iteration for accuracy
2673 recip = vmulq_f32(
2674 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2675 recip);
2676 recip = vmulq_f32(
2677 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2678 recip);
2679
2680 // sqrt(s) = s * 1/sqrt(s)
2681 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2682#elif defined(__aarch64__)
2683 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2684#else
2685 float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2686 float32x4_t sq = vrecpeq_f32(recipsq);
2687 return vreinterpretq_m128_f32(sq);
2688#endif
2689}
2690
2691// Computes the approximation of the square root of the scalar single-precision
2692// floating point value of in.
2693// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2695{
2696 float32_t value =
2697 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2699 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2700}
2701
2702// Stores four single-precision, floating-point values.
2703// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2705{
2706 vst1q_f32(p, vreinterpretq_f32_m128(a));
2707}
2708
2709// Store the lower single-precision (32-bit) floating-point element from a into
2710// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2711// boundary or a general-protection exception may be generated.
2712//
2713// MEM[mem_addr+31:mem_addr] := a[31:0]
2714// MEM[mem_addr+63:mem_addr+32] := a[31:0]
2715// MEM[mem_addr+95:mem_addr+64] := a[31:0]
2716// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2717//
2718// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
2720{
2721 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2722 vst1q_f32(p, vdupq_n_f32(a0));
2723}
2724
2725// Stores the lower single-precision, floating-point value.
2726// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2728{
2729 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2730}
2731
2732// Store the lower single-precision (32-bit) floating-point element from a into
2733// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2734// boundary or a general-protection exception may be generated.
2735//
2736// MEM[mem_addr+31:mem_addr] := a[31:0]
2737// MEM[mem_addr+63:mem_addr+32] := a[31:0]
2738// MEM[mem_addr+95:mem_addr+64] := a[31:0]
2739// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2740//
2741// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
2742#define _mm_store1_ps _mm_store_ps1
2743
2744// Stores the upper two single-precision, floating-point values of a to the
2745// address p.
2746//
2747// *p0 := a2
2748// *p1 := a3
2749//
2750// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2752{
2753 *p = vreinterpret_m64_f32(vget_high_f32(a));
2754}
2755
2756// Stores the lower two single-precision floating point values of a to the
2757// address p.
2758//
2759// *p0 := a0
2760// *p1 := a1
2761//
2762// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2764{
2765 *p = vreinterpret_m64_f32(vget_low_f32(a));
2766}
2767
2768// Store 4 single-precision (32-bit) floating-point elements from a into memory
2769// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2770// general-protection exception may be generated.
2771//
2772// MEM[mem_addr+31:mem_addr] := a[127:96]
2773// MEM[mem_addr+63:mem_addr+32] := a[95:64]
2774// MEM[mem_addr+95:mem_addr+64] := a[63:32]
2775// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2776//
2777// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
2779{
2780 float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2781 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2782 vst1q_f32(p, rev);
2783}
2784
2785// Stores four single-precision, floating-point values.
2786// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2788{
2789 vst1q_f32(p, vreinterpretq_f32_m128(a));
2790}
2791
2792// Stores 16-bits of integer data a at the address p.
2793// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
2795{
2796 vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2797}
2798
2799// Stores 64-bits of integer data a at the address p.
2800// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
2802{
2803 vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2804}
2805
2806// Store 64-bits of integer data from a into memory using a non-temporal memory
2807// hint.
2808// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
2810{
2811 vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2812}
2813
2814// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2815// point elements) from a into memory using a non-temporal memory hint.
2816// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
2818{
2819#if __has_builtin(__builtin_nontemporal_store)
2820 __builtin_nontemporal_store(a, (float32x4_t *) p);
2821#else
2822 vst1q_f32(p, vreinterpretq_f32_m128(a));
2823#endif
2824}
2825
2826// Subtracts the four single-precision, floating-point values of a and b.
2827//
2828// r0 := a0 - b0
2829// r1 := a1 - b1
2830// r2 := a2 - b2
2831// r3 := a3 - b3
2832//
2833// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2834FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2835{
2836 return vreinterpretq_m128_f32(
2837 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2838}
2839
2840// Subtract the lower single-precision (32-bit) floating-point element in b from
2841// the lower single-precision (32-bit) floating-point element in a, store the
2842// result in the lower element of dst, and copy the upper 3 packed elements from
2843// a to the upper elements of dst.
2844//
2845// dst[31:0] := a[31:0] - b[31:0]
2846// dst[127:32] := a[127:32]
2847//
2848// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2850{
2851 return _mm_move_ss(a, _mm_sub_ps(a, b));
2852}
2853
2854// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2855// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2856// transposed matrix in these vectors (row0 now contains column 0, etc.).
2857// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
2858#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2859 do { \
2860 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2861 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2862 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2863 vget_low_f32(ROW23.val[0])); \
2864 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2865 vget_low_f32(ROW23.val[1])); \
2866 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2867 vget_high_f32(ROW23.val[0])); \
2868 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2869 vget_high_f32(ROW23.val[1])); \
2870 } while (0)
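
// Usage sketch (illustrative addition, not part of the upstream sources):
// transpose a row-major 4x4 float matrix in place. Helper name is
// hypothetical.
static inline void transpose_4x4_example(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    _mm_storeu_ps(m + 0, r0);
    _mm_storeu_ps(m + 4, r1);
    _mm_storeu_ps(m + 8, r2);
    _mm_storeu_ps(m + 12, r3);
}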
2871
2872// according to the documentation, these intrinsics behave the same as the
2873// non-'u' versions. We'll just alias them here.
2874#define _mm_ucomieq_ss _mm_comieq_ss
2875#define _mm_ucomige_ss _mm_comige_ss
2876#define _mm_ucomigt_ss _mm_comigt_ss
2877#define _mm_ucomile_ss _mm_comile_ss
2878#define _mm_ucomilt_ss _mm_comilt_ss
2879#define _mm_ucomineq_ss _mm_comineq_ss
2880
2881// Return vector of type __m128i with undefined elements.
2882// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
2884{
2885#if defined(__GNUC__) || defined(__clang__)
2886#pragma GCC diagnostic push
2887#pragma GCC diagnostic ignored "-Wuninitialized"
2888#endif
2889 __m128i a;
2890 return a;
2891#if defined(__GNUC__) || defined(__clang__)
2892#pragma GCC diagnostic pop
2893#endif
2894}
2895
2896// Return vector of type __m128 with undefined elements.
2897// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
2899{
2900#if defined(__GNUC__) || defined(__clang__)
2901#pragma GCC diagnostic push
2902#pragma GCC diagnostic ignored "-Wuninitialized"
2903#endif
2904 __m128 a;
2905 return a;
2906#if defined(__GNUC__) || defined(__clang__)
2907#pragma GCC diagnostic pop
2908#endif
2909}
2910
2911// Selects and interleaves the upper two single-precision, floating-point values
2912// from a and b.
2913//
2914// r0 := a2
2915// r1 := b2
2916// r2 := a3
2917// r3 := b3
2918//
2919// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
2921{
2922#if defined(__aarch64__)
2925#else
2926 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2927 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2928 float32x2x2_t result = vzip_f32(a1, b1);
2929 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2930#endif
2931}
2932
2933// Selects and interleaves the lower two single-precision, floating-point values
2934// from a and b.
2935//
2936// r0 := a0
2937// r1 := b0
2938// r2 := a1
2939// r3 := b1
2940//
2941// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
2943{
2944#if defined(__aarch64__)
2947#else
2948 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2949 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2950 float32x2x2_t result = vzip_f32(a1, b1);
2951 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2952#endif
2953}
2954
2955// Computes bitwise EXOR (exclusive-or) of the four single-precision,
2956// floating-point values of a and b.
2957// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
2958FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2959{
2960 return vreinterpretq_m128_s32(
2961 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2962}
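
// Usage sketch (illustrative addition, not part of the upstream sources):
// negate all four lanes by XOR-ing the IEEE-754 sign bit, a common use of
// _mm_xor_ps. Helper name is hypothetical.
static inline __m128 negate_ps_example(__m128 a)
{
    const __m128 sign_bits = vreinterpretq_m128_u32(vdupq_n_u32(0x80000000u));
    return _mm_xor_ps(a, sign_bits);
}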
2963
2964/* SSE2 */
2965
2966// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2967// unsigned 16-bit integers in b.
2968// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2970{
2973}
2974
2975// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2976// unsigned 32-bit integers in b.
2977//
2978// r0 := a0 + b0
2979// r1 := a1 + b1
2980// r2 := a2 + b2
2981// r3 := a3 + b3
2982//
2983// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2985{
2988}
2989
2990// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2991// unsigned 64-bit integers in b.
2992// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2994{
2997}
2998
2999// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
3000// unsigned 8-bit integers in b.
3001// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
3003{
3006}
3007
3008// Add packed double-precision (64-bit) floating-point elements in a and b, and
3009// store the results in dst.
3010// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
3012{
3013#if defined(__aarch64__)
3014 return vreinterpretq_m128d_f64(
3015 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3016#else
3017 double *da = (double *) &a;
3018 double *db = (double *) &b;
3019 double c[2];
3020 c[0] = da[0] + db[0];
3021 c[1] = da[1] + db[1];
3022 return vld1q_f32((float32_t *) c);
3023#endif
3024}
3025
3026// Add the lower double-precision (64-bit) floating-point element in a and b,
3027// store the result in the lower element of dst, and copy the upper element from
3028// a to the upper element of dst.
3029//
3030// dst[63:0] := a[63:0] + b[63:0]
3031// dst[127:64] := a[127:64]
3032//
3033// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
3035{
3036#if defined(__aarch64__)
3037 return _mm_move_sd(a, _mm_add_pd(a, b));
3038#else
3039 double *da = (double *) &a;
3040 double *db = (double *) &b;
3041 double c[2];
3042 c[0] = da[0] + db[0];
3043 c[1] = da[1];
3044 return vld1q_f32((float32_t *) c);
3045#endif
3046}
3047
3048// Add 64-bit integers a and b, and store the result in dst.
3049//
3050// dst[63:0] := a[63:0] + b[63:0]
3051//
3052// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
3054{
3055 return vreinterpret_m64_s64(
3057}
3058
3059// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3060// and saturates.
3061//
3062// r0 := SignedSaturate(a0 + b0)
3063// r1 := SignedSaturate(a1 + b1)
3064// ...
3065// r7 := SignedSaturate(a7 + b7)
3066//
3067// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3069{
3072}
3073
3074// Add packed signed 8-bit integers in a and b using saturation, and store the
3075// results in dst.
3076//
3077// FOR j := 0 to 15
3078// i := j*8
3079// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3080// ENDFOR
3081//
3082// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
3084{
3087}
3088
3089// Add packed unsigned 16-bit integers in a and b using saturation, and store
3090// the results in dst.
3091// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
3093{
3096}
3097
3098// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3099// b and saturates.
3100// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3102{
3105}
3106
3107// Compute the bitwise AND of packed double-precision (64-bit) floating-point
3108// elements in a and b, and store the results in dst.
3109//
3110// FOR j := 0 to 1
3111// i := j*64
3112// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
3113// ENDFOR
3114//
3115// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
3117{
3120}
3121
3122// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
3123// b.
3124//
3125// r := a & b
3126//
3127// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
3128FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3129{
3130 return vreinterpretq_m128i_s32(
3131 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3132}
3133
3134// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3135// elements in a and then AND with b, and store the results in dst.
3136//
3137// FOR j := 0 to 1
3138// i := j*64
3139// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
3140// ENDFOR
3141//
3142// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
3144{
3145 // *NOTE* argument swap
3148}
3149
3150// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
3151// 128-bit value in a.
3152//
3153// r := (~a) & b
3154//
3155// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
3156FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3157{
3158 return vreinterpretq_m128i_s32(
3159 vbicq_s32(vreinterpretq_s32_m128i(b),
3160 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3161}
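
// Usage sketch (illustrative addition, not part of the upstream sources):
// branchless select, taking bits from b where mask is set and from a
// elsewhere: (mask AND b) OR (NOT mask AND a). Helper name is hypothetical;
// _mm_or_si128 is defined later in this file, but it is already used above
// (e.g. in _mm_cvtps_pi16), so its declaration is assumed to be in scope.
static inline __m128i select_bits_example(__m128i mask, __m128i a, __m128i b)
{
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}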
3162
3163// Computes the average of the 8 unsigned 16-bit integers in a and the 8
3164// unsigned 16-bit integers in b and rounds.
3165//
3166// r0 := (a0 + b0) / 2
3167// r1 := (a1 + b1) / 2
3168// ...
3169// r7 := (a7 + b7) / 2
3170//
3171// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
3173{
3174 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3176}
3177
3178// Computes the average of the 16 unsigned 8-bit integers in a and the 16
3179// unsigned 8-bit integers in b and rounds.
3180//
3181// r0 := (a0 + b0) / 2
3182// r1 := (a1 + b1) / 2
3183// ...
3184// r15 := (a15 + b15) / 2
3185//
3186// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3188{
3191}
3192
3193// Shift a left by imm8 bytes while shifting in zeros, and store the results in
3194// dst.
3195// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
3196#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3197
3198// Shift a right by imm8 bytes while shifting in zeros, and store the results in
3199// dst.
3200// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
3201#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3202
3203// Cast vector of type __m128d to type __m128. This intrinsic is only used for
3204// compilation and does not generate any instructions, thus it has zero latency.
3205// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
3207{
3209}
3210
3211// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3212// compilation and does not generate any instructions, thus it has zero latency.
3213// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
3215{
3217}
3218
3219// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3220// compilation and does not generate any instructions, thus it has zero latency.
3221// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
3223{
3225}
3226
3227// Applies a type cast to reinterpret four 32-bit floating point values passed
3228// in as a 128-bit parameter as packed 32-bit integers.
3229// https://msdn.microsoft.com/en-us/library/bb514099.aspx
3231{
3233}
3234
3235// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3236// compilation and does not generate any instructions, thus it has zero latency.
3237// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
3239{
3240#if defined(__aarch64__)
3241 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3242#else
3244#endif
3245}
3246
3247// Applies a type cast to reinterpret four 32-bit integers passed in as a
3248// 128-bit parameter as packed 32-bit floating point values.
3249// https://msdn.microsoft.com/en-us/library/bb514029.aspx
3251{
3253}
3254
3255// Cache line containing p is flushed and invalidated from all caches in the
3256// coherency domain.
3257// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
3258FORCE_INLINE void _mm_clflush(void const *p)
3259{
3260 (void) p;
3261 // no corollary for Neon?
3262}
3263
3264// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3265// unsigned 16-bit integers in b for equality.
3266// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3268{
3271}
3272
3273// Compare packed 32-bit integers in a and b for equality, and store the results
3274// in dst
3276{
3279}
3280
3281// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3282// unsigned 8-bit integers in b for equality.
3283// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3285{
3288}
3289
3290// Compare packed double-precision (64-bit) floating-point elements in a and b
3291// for equality, and store the results in dst.
3292// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
3294{
3295#if defined(__aarch64__)
3297 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3298#else
3299 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3300 uint32x4_t cmp =
3302 uint32x4_t swapped = vrev64q_u32(cmp);
3303 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3304#endif
3305}
3306
3307// Compare the lower double-precision (64-bit) floating-point elements in a and
3308// b for equality, store the result in the lower element of dst, and copy the
3309// upper element from a to the upper element of dst.
3310// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
3312{
3313 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3314}
3315
3316// Compare packed double-precision (64-bit) floating-point elements in a and b
3317// for greater-than-or-equal, and store the results in dst.
3318// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
3320{
3321#if defined(__aarch64__)
3323 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3324#else
3325 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3326 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3327 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3328 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3329 uint64_t d[2];
3330 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3331 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3332
3333 return vreinterpretq_m128d_u64(vld1q_u64(d));
3334#endif
3335}
3336
3337// Compare the lower double-precision (64-bit) floating-point elements in a and
3338// b for greater-than-or-equal, store the result in the lower element of dst,
3339// and copy the upper element from a to the upper element of dst.
3340// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
3341FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3342{
3343#if defined(__aarch64__)
3344 return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3345#else
3346 // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3347 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3348 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3349 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3350 uint64_t d[2];
3351 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3352 d[1] = a1;
3353
3354 return vreinterpretq_m128d_u64(vld1q_u64(d));
3355#endif
3356}
3357
3358// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3359// in b for greater than.
3360//
3361// r0 := (a0 > b0) ? 0xffff : 0x0
3362// r1 := (a1 > b1) ? 0xffff : 0x0
3363// ...
3364// r7 := (a7 > b7) ? 0xffff : 0x0
3365//
3366// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3367FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3368{
3369 return vreinterpretq_m128i_u16(
3370 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3371}
3372
3373// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3374// in b for greater than.
3375// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3376FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3377{
3378 return vreinterpretq_m128i_u32(
3379 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3380}
3381
3382// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3383// in b for greater than.
3384//
3385// r0 := (a0 > b0) ? 0xff : 0x0
3386// r1 := (a1 > b1) ? 0xff : 0x0
3387// ...
3388// r15 := (a15 > b15) ? 0xff : 0x0
3389//
3390// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3391FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3392{
3393 return vreinterpretq_m128i_u8(
3394 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3395}
3396
3397// Compare packed double-precision (64-bit) floating-point elements in a and b
3398// for greater-than, and store the results in dst.
3399// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
3400FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3401{
3402#if defined(__aarch64__)
3403 return vreinterpretq_m128d_u64(
3404 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3405#else
3406 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3407 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3408 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3409 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3410 uint64_t d[2];
3411 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3412 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3413
3414 return vreinterpretq_m128d_u64(vld1q_u64(d));
3415#endif
3416}
3417
3418// Compare the lower double-precision (64-bit) floating-point elements in a and
3419// b for greater-than, store the result in the lower element of dst, and copy
3420// the upper element from a to the upper element of dst.
3421// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
3422FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3423{
3424#if defined(__aarch64__)
3425 return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3426#else
3427 // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3428 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3429 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3430 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3431 uint64_t d[2];
3432 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3433 d[1] = a1;
3434
3435 return vreinterpretq_m128d_u64(vld1q_u64(d));
3436#endif
3437}
3438
3439// Compare packed double-precision (64-bit) floating-point elements in a and b
3440// for less-than-or-equal, and store the results in dst.
3441// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
3442FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3443{
3444#if defined(__aarch64__)
3445 return vreinterpretq_m128d_u64(
3446 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3447#else
3448 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3449 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3450 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3451 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3452 uint64_t d[2];
3453 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3454 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3455
3456 return vreinterpretq_m128d_u64(vld1q_u64(d));
3457#endif
3458}
3459
3460// Compare the lower double-precision (64-bit) floating-point elements in a and
3461// b for less-than-or-equal, store the result in the lower element of dst, and
3462// copy the upper element from a to the upper element of dst.
3463// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
3464FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3465{
3466#if defined(__aarch64__)
3467 return _mm_move_sd(a, _mm_cmple_pd(a, b));
3468#else
3469 // expand "_mm_cmple_pd()" to reduce unnecessary operations
3470 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3471 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3472 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3473 uint64_t d[2];
3474 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3475 d[1] = a1;
3476
3477 return vreinterpretq_m128d_u64(vld1q_u64(d));
3478#endif
3479}
3480
3481// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3482// in b for less than.
3483//
3484// r0 := (a0 < b0) ? 0xffff : 0x0
3485// r1 := (a1 < b1) ? 0xffff : 0x0
3486// ...
3487// r7 := (a7 < b7) ? 0xffff : 0x0
3488//
3489// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3490FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3491{
3492 return vreinterpretq_m128i_u16(
3493 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3494}
3495
3496
3497// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3498// in b for less than.
3499// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3500FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3501{
3502 return vreinterpretq_m128i_u32(
3503 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3504}
3505
3506// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3507// in b for less than.
3508// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3509FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3510{
3511 return vreinterpretq_m128i_u8(
3512 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3513}
3514
3515// Compare packed double-precision (64-bit) floating-point elements in a and b
3516// for less-than, and store the results in dst.
3517// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
3518FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3519{
3520#if defined(__aarch64__)
3521 return vreinterpretq_m128d_u64(
3522 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3523#else
3524 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3525 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3526 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3527 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3528 uint64_t d[2];
3529 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3530 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3531
3532 return vreinterpretq_m128d_u64(vld1q_u64(d));
3533#endif
3534}
3535
3536// Compare the lower double-precision (64-bit) floating-point elements in a and
3537// b for less-than, store the result in the lower element of dst, and copy the
3538// upper element from a to the upper element of dst.
3539// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
3540FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3541{
3542#if defined(__aarch64__)
3543 return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3544#else
3545 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3546 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3547 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3548 uint64_t d[2];
3549 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3550 d[1] = a1;
3551
3552 return vreinterpretq_m128d_u64(vld1q_u64(d));
3553#endif
3554}
3555
3556// Compare packed double-precision (64-bit) floating-point elements in a and b
3557// for not-equal, and store the results in dst.
3558// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
3559FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3560{
3561#if defined(__aarch64__)
3562 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3563 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3564#else
3565 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3566 uint32x4_t cmp =
3567 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3568 uint32x4_t swapped = vrev64q_u32(cmp);
3569 return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3570#endif
3571}
3572
3573// Compare the lower double-precision (64-bit) floating-point elements in a and
3574// b for not-equal, store the result in the lower element of dst, and copy the
3575// upper element from a to the upper element of dst.
3576// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
3577FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3578{
3579 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3580}
3581
3582// Compare packed double-precision (64-bit) floating-point elements in a and b
3583// for not-greater-than-or-equal, and store the results in dst.
3584// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
3585FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3586{
3587#if defined(__aarch64__)
3588 return vreinterpretq_m128d_u64(veorq_u64(
3589 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3590 vdupq_n_u64(UINT64_MAX)));
3591#else
3592 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3593 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3594 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3595 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3596 uint64_t d[2];
3597 d[0] =
3598 !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3599 d[1] =
3600 !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3601
3602 return vreinterpretq_m128d_u64(vld1q_u64(d));
3603#endif
3604}
3605
3606// Compare the lower double-precision (64-bit) floating-point elements in a and
3607// b for not-greater-than-or-equal, store the result in the lower element of
3608// dst, and copy the upper element from a to the upper element of dst.
3609// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
3610FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3611{
3612 return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3613}
3614
3615// Compare packed double-precision (64-bit) floating-point elements in a and b
3616// for not-greater-than, and store the results in dst.
3617// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd
3618FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3619{
3620#if defined(__aarch64__)
3621 return vreinterpretq_m128d_u64(veorq_u64(
3622 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3623 vdupq_n_u64(UINT64_MAX)));
3624#else
3625 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3626 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3627 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3628 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3629 uint64_t d[2];
3630 d[0] =
3631 !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3632 d[1] =
3633 !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3634
3635 return vreinterpretq_m128d_u64(vld1q_u64(d));
3636#endif
3637}
3638
3639// Compare the lower double-precision (64-bit) floating-point elements in a and
3640// b for not-greater-than, store the result in the lower element of dst, and
3641// copy the upper element from a to the upper element of dst.
3642// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
3643FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3644{
3645 return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3646}
3647
3648// Compare packed double-precision (64-bit) floating-point elements in a and b
3649// for not-less-than-or-equal, and store the results in dst.
3650// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
3651FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3652{
3653#if defined(__aarch64__)
3654 return vreinterpretq_m128d_u64(veorq_u64(
3655 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3656 vdupq_n_u64(UINT64_MAX)));
3657#else
3658 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3659 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3660 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3661 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3662 uint64_t d[2];
3663 d[0] =
3664 !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3665 d[1] =
3666 !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3667
3668 return vreinterpretq_m128d_u64(vld1q_u64(d));
3669#endif
3670}
3671
3672// Compare the lower double-precision (64-bit) floating-point elements in a and
3673// b for not-less-than-or-equal, store the result in the lower element of dst,
3674// and copy the upper element from a to the upper element of dst.
3675// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
3676FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3677{
3678 return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3679}
3680
3681// Compare packed double-precision (64-bit) floating-point elements in a and b
3682// for not-less-than, and store the results in dst.
3683// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
3684FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3685{
3686#if defined(__aarch64__)
3687 return vreinterpretq_m128d_u64(veorq_u64(
3688 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3689 vdupq_n_u64(UINT64_MAX)));
3690#else
3691 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3692 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3693 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3694 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3695 uint64_t d[2];
3696 d[0] =
3697 !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3698 d[1] =
3699 !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3700
3701 return vreinterpretq_m128d_u64(vld1q_u64(d));
3702#endif
3703}
3704
3705// Compare the lower double-precision (64-bit) floating-point elements in a and
3706// b for not-less-than, store the result in the lower element of dst, and copy
3707// the upper element from a to the upper element of dst.
3708// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
3709FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3710{
3711 return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3712}
3713
3714// Compare packed double-precision (64-bit) floating-point elements in a and b
3715// to see if neither is NaN, and store the results in dst.
3716// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
3717FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3718{
3719#if defined(__aarch64__)
3720 // Excluding NaNs, any two floating point numbers can be compared.
3721 uint64x2_t not_nan_a =
3722 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3723 uint64x2_t not_nan_b =
3724 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3725 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3726#else
3727 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3728 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3729 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3730 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3731 uint64_t d[2];
3732 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3733 (*(double *) &b0) == (*(double *) &b0))
3734 ? ~UINT64_C(0)
3735 : UINT64_C(0);
3736 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3737 (*(double *) &b1) == (*(double *) &b1))
3738 ? ~UINT64_C(0)
3739 : UINT64_C(0);
3740
3741 return vreinterpretq_m128d_u64(vld1q_u64(d));
3742#endif
3743}
3744
3745// Compare the lower double-precision (64-bit) floating-point elements in a and
3746// b to see if neither is NaN, store the result in the lower element of dst, and
3747// copy the upper element from a to the upper element of dst.
3748// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
3749FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3750{
3751#if defined(__aarch64__)
3752 return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3753#else
3754 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3755 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3756 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3757 uint64_t d[2];
3758 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3759 (*(double *) &b0) == (*(double *) &b0))
3760 ? ~UINT64_C(0)
3761 : UINT64_C(0);
3762 d[1] = a1;
3763
3764 return vreinterpretq_m128d_u64(vld1q_u64(d));
3765#endif
3766}
3767
3768// Compare packed double-precision (64-bit) floating-point elements in a and b
3769// to see if either is NaN, and store the results in dst.
3770// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
3771FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3772{
3773#if defined(__aarch64__)
3774 // A NaN never compares equal to anything, not even to itself.
3775 uint64x2_t not_nan_a =
3776 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3777 uint64x2_t not_nan_b =
3778 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3779 return vreinterpretq_m128d_s32(
3780 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3781#else
3782 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3783 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3784 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3785 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3786 uint64_t d[2];
3787 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3788 (*(double *) &b0) == (*(double *) &b0))
3789 ? UINT64_C(0)
3790 : ~UINT64_C(0);
3791 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3792 (*(double *) &b1) == (*(double *) &b1))
3793 ? UINT64_C(0)
3794 : ~UINT64_C(0);
3795
3796 return vreinterpretq_m128d_u64(vld1q_u64(d));
3797#endif
3798}
3799
3800// Compare the lower double-precision (64-bit) floating-point elements in a and
3801// b to see if either is NaN, store the result in the lower element of dst, and
3802// copy the upper element from a to the upper element of dst.
3803// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
3804FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3805{
3806#if defined(__aarch64__)
3807 return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3808#else
3809 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3810 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3811 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3812 uint64_t d[2];
3813 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3814 (*(double *) &b0) == (*(double *) &b0))
3815 ? UINT64_C(0)
3816 : ~UINT64_C(0);
3817 d[1] = a1;
3818
3819 return vreinterpretq_m128d_u64(vld1q_u64(d));
3820#endif
3821}
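// Because a NaN never compares equal to itself, _mm_cmpunord_pd(x, x) yields
// an all-ones lane exactly where x holds a NaN. Caller-side sketch (the
// helper name is illustrative only, not part of this header):
static inline __m128d isnan_pd(__m128d x)
{
    return _mm_cmpunord_pd(x, x);  // ~0 in NaN lanes, 0 elsewhere
}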
3822
3823// Compare the lower double-precision (64-bit) floating-point element in a and b
3824// for greater-than-or-equal, and return the boolean result (0 or 1).
3825// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
3826FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3827{
3828#if defined(__aarch64__)
3829 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3830#else
3831 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3832 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3833
3834 return (*(double *) &a0 >= *(double *) &b0);
3835#endif
3836}
3837
3838// Compare the lower double-precision (64-bit) floating-point element in a and b
3839// for greater-than, and return the boolean result (0 or 1).
3840// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
3841FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3842{
3843#if defined(__aarch64__)
3844 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3845#else
3846 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3847 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3848
3849 return (*(double *) &a0 > *(double *) &b0);
3850#endif
3851}
3852
3853// Compare the lower double-precision (64-bit) floating-point element in a and b
3854// for less-than-or-equal, and return the boolean result (0 or 1).
3855// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
3856FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3857{
3858#if defined(__aarch64__)
3859 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3860#else
3861 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3862 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3863
3864 return (*(double *) &a0 <= *(double *) &b0);
3865#endif
3866}
3867
3868// Compare the lower double-precision (64-bit) floating-point element in a and b
3869// for less-than, and return the boolean result (0 or 1).
3870// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
3871FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3872{
3873#if defined(__aarch64__)
3874 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3875#else
3876 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3877 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3878
3879 return (*(double *) &a0 < *(double *) &b0);
3880#endif
3881}
3882
3883// Compare the lower double-precision (64-bit) floating-point element in a and b
3884// for equality, and return the boolean result (0 or 1).
3885// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
3886FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3887{
3888#if defined(__aarch64__)
3889 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3890#else
3891 uint32x4_t a_not_nan =
3893 uint32x4_t b_not_nan =
3895 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3896 uint32x4_t a_eq_b =
3898 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3899 vreinterpretq_u64_u32(a_eq_b));
3900 return vgetq_lane_u64(and_results, 0) & 0x1;
3901#endif
3902}
3903
3904// Compare the lower double-precision (64-bit) floating-point element in a and b
3905// for not-equal, and return the boolean result (0 or 1).
3906// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
3907FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3908{
3909 return !_mm_comieq_sd(a, b);
3910}
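// Unlike the _mm_cmp*_sd intrinsics above, which produce a lane mask, the
// _mm_comi*_sd family returns a plain 0/1 that can be branched on directly.
// Caller-side sketch (the helper name is illustrative only):
static inline int lower_lane_equal(__m128d a, __m128d b)
{
    return _mm_comieq_sd(a, b);  // 1 if a[63:0] == b[63:0], else 0
}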
3911
3912// Convert packed signed 32-bit integers in a to packed double-precision
3913// (64-bit) floating-point elements, and store the results in dst.
3914//
3915// FOR j := 0 to 1
3916// i := j*32
3917// m := j*64
3918// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3919// ENDFOR
3920//
3921// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
3922FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3923{
3924#if defined(__aarch64__)
3925 return vreinterpretq_m128d_f64(
3926 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3927#else
3928 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3929 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3930 return _mm_set_pd(a1, a0);
3931#endif
3932}
3933
3934// Converts the four signed 32-bit integer values of a to single-precision,
3935// floating-point values
3936// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3937FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3938{
3939 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3940}
3941
3942// Convert packed double-precision (64-bit) floating-point elements in a to
3943// packed 32-bit integers, and store the results in dst.
3944//
3945// FOR j := 0 to 1
3946// i := 32*j
3947// k := 64*j
3948// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3949// ENDFOR
3950//
3951// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
3952FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3953{
3954 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3955 double d0 = ((double *) &rnd)[0];
3956 double d1 = ((double *) &rnd)[1];
3957 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3958}
3959
3960// Convert packed double-precision (64-bit) floating-point elements in a to
3961// packed 32-bit integers, and store the results in dst.
3962//
3963// FOR j := 0 to 1
3964// i := 32*j
3965// k := 64*j
3966// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3967// ENDFOR
3968//
3969// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
3970FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3971{
3972 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3973 double d0 = ((double *) &rnd)[0];
3974 double d1 = ((double *) &rnd)[1];
3975 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3976 return vreinterpret_m64_s32(vld1_s32(data));
3977}
3978
3979// Convert packed double-precision (64-bit) floating-point elements in a to
3980// packed single-precision (32-bit) floating-point elements, and store the
3981// results in dst.
3982//
3983// FOR j := 0 to 1
3984// i := 32*j
3985// k := 64*j
3986// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
3987// ENDFOR
3988// dst[127:64] := 0
3989//
3990// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
3991FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3992{
3993#if defined(__aarch64__)
3994 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3995 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3996#else
3997 float a0 = (float) ((double *) &a)[0];
3998 float a1 = (float) ((double *) &a)[1];
3999 return _mm_set_ps(0, 0, a1, a0);
4000#endif
4001}
4002
4003// Convert packed signed 32-bit integers in a to packed double-precision
4004// (64-bit) floating-point elements, and store the results in dst.
4005//
4006// FOR j := 0 to 1
4007// i := j*32
4008// m := j*64
4009// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
4010// ENDFOR
4011//
4012// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
4013FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
4014{
4015#if defined(__aarch64__)
4016 return vreinterpretq_m128d_f64(
4017 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
4018#else
4019 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
4020 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
4021 return _mm_set_pd(a1, a0);
4022#endif
4023}
4024
4025// Converts the four single-precision, floating-point values of a to signed
4026// 32-bit integer values.
4027//
4028// r0 := (int) a0
4029// r1 := (int) a1
4030// r2 := (int) a2
4031// r3 := (int) a3
4032//
4033// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4034// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4035// does not support! It is supported on ARMv8-A however.
4036FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4037{
4038#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
4039 switch (_MM_GET_ROUNDING_MODE()) {
4040 case _MM_ROUND_NEAREST:
4041 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4042 case _MM_ROUND_DOWN:
4043 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
4044 case _MM_ROUND_UP:
4045 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
4046 default: // _MM_ROUND_TOWARD_ZERO
4047 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
4048 }
4049#else
4050 float *f = (float *) &a;
4051 switch (_MM_GET_ROUNDING_MODE()) {
4052 case _MM_ROUND_NEAREST: {
4053 uint32x4_t signmask = vdupq_n_u32(0x80000000);
4054 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4055 vdupq_n_f32(0.5f)); /* +/- 0.5 */
4056 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4057 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4058 int32x4_t r_trunc = vcvtq_s32_f32(
4059 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4060 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4061 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4062 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4063 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4064 float32x4_t delta = vsubq_f32(
4065 vreinterpretq_f32_m128(a),
4066 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4067 uint32x4_t is_delta_half =
4068 vceqq_f32(delta, half); /* delta == +/- 0.5 */
4069 return vreinterpretq_m128i_s32(
4070 vbslq_s32(is_delta_half, r_even, r_normal));
4071 }
4072 case _MM_ROUND_DOWN:
4073 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4074 floorf(f[0]));
4075 case _MM_ROUND_UP:
4076 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
4077 ceilf(f[0]));
4078 default: // _MM_ROUND_TOWARD_ZERO
4079 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4080 (int32_t) f[0]);
4081 }
4082#endif
4083}
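// With the default _MM_ROUND_NEAREST mode this conversion rounds halfway
// cases to even, so the lanes {1.5f, 2.5f, -0.5f, 2.7f} become {2, 2, 0, 3};
// _mm_cvttps_epi32 further below always truncates instead. Caller-side sketch
// (illustrative only):
static inline __m128i round_to_nearest_even_example(void)
{
    __m128 v = _mm_set_ps(2.7f, -0.5f, 2.5f, 1.5f);  // lanes 3..0
    return _mm_cvtps_epi32(v);                       // lanes 0..3 = {2, 2, 0, 3}
}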
4084
4085// Convert packed single-precision (32-bit) floating-point elements in a to
4086// packed double-precision (64-bit) floating-point elements, and store the
4087// results in dst.
4088//
4089// FOR j := 0 to 1
4090// i := 64*j
4091// k := 32*j
4092// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4093// ENDFOR
4094//
4095// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4096FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4097{
4098#if defined(__aarch64__)
4099 return vreinterpretq_m128d_f64(
4100 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4101#else
4102 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4103 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4104 return _mm_set_pd(a1, a0);
4105#endif
4106}
4107
4108// Copy the lower double-precision (64-bit) floating-point element of a to dst.
4109//
4110// dst[63:0] := a[63:0]
4111//
4112// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4113FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4114{
4115#if defined(__aarch64__)
4116 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4117#else
4118 return ((double *) &a)[0];
4119#endif
4120}
4121
4122// Convert the lower double-precision (64-bit) floating-point element in a to a
4123// 32-bit integer, and store the result in dst.
4124//
4125// dst[31:0] := Convert_FP64_To_Int32(a[63:0])
4126//
4127// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
4128FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
4129{
4130#if defined(__aarch64__)
4131 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4132#else
4133 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4134 double ret = ((double *) &rnd)[0];
4135 return (int32_t) ret;
4136#endif
4137}
4138
4139// Convert the lower double-precision (64-bit) floating-point element in a to a
4140// 64-bit integer, and store the result in dst.
4141//
4142// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4143//
4144// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
4145FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4146{
4147#if defined(__aarch64__)
4148 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4149#else
4150 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4151 double ret = ((double *) &rnd)[0];
4152 return (int64_t) ret;
4153#endif
4154}
4155
4156// Convert the lower double-precision (64-bit) floating-point element in a to a
4157// 64-bit integer, and store the result in dst.
4158//
4159// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4160//
4161// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
4162#define _mm_cvtsd_si64x _mm_cvtsd_si64
4163
4164// Convert the lower double-precision (64-bit) floating-point element in b to a
4165// single-precision (32-bit) floating-point element, store the result in the
4166// lower element of dst, and copy the upper 3 packed elements from a to the
4167// upper elements of dst.
4168// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
4169FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4170{
4171#if defined(__aarch64__)
4172 return vreinterpretq_m128_f32(vsetq_lane_f32(
4173 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4174 vreinterpretq_f32_m128(a), 0));
4175#else
4176 return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4177 vreinterpretq_f32_m128(a), 0));
4178#endif
4179}
4180
4181// Copy the lower 32-bit integer in a to dst.
4182//
4183// dst[31:0] := a[31:0]
4184//
4185// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4186FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4187{
4188 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4189}
4190
4191// Copy the lower 64-bit integer in a to dst.
4192//
4193// dst[63:0] := a[63:0]
4194//
4195// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4196FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4197{
4198 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4199}
4200
4201// Copy the lower 64-bit integer in a to dst.
4202// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4203#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4204
4205// Convert the signed 32-bit integer b to a double-precision (64-bit)
4206// floating-point element, store the result in the lower element of dst, and
4207// copy the upper element from a to the upper element of dst.
4208// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
4209FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4210{
4211#if defined(__aarch64__)
4212 return vreinterpretq_m128d_f64(
4213 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4214#else
4215 double bf = (double) b;
4216 return vreinterpretq_m128d_s64(
4217 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4218#endif
4219}
4220
4221// Copy the lower 64-bit integer in a to dst.
4222//
4223// dst[63:0] := a[63:0]
4224//
4225// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4226#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4227
4228// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4229// zero extending the upper bits.
4230//
4231// r0 := a
4232// r1 := 0x0
4233// r2 := 0x0
4234// r3 := 0x0
4235//
4236// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4237FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4238{
4239 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4240}
4241
4242// Convert the signed 64-bit integer b to a double-precision (64-bit)
4243// floating-point element, store the result in the lower element of dst, and
4244// copy the upper element from a to the upper element of dst.
4245// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
4246FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4247{
4248#if defined(__aarch64__)
4249 return vreinterpretq_m128d_f64(
4250 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4251#else
4252 double bf = (double) b;
4253 return vreinterpretq_m128d_s64(
4254 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4255#endif
4256}
4257
4258// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4259// zero extending the upper bits.
4260//
4261// r0 := a
4262// r1 := 0x0
4263FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4264{
4265 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4266}
4267
4268// Copy 64-bit integer a to the lower element of dst, and zero the upper
4269// element.
4270// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
4271#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4272
4273// Convert the signed 64-bit integer b to a double-precision (64-bit)
4274// floating-point element, store the result in the lower element of dst, and
4275// copy the upper element from a to the upper element of dst.
4276// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
4277#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4278
4279// Convert the lower single-precision (32-bit) floating-point element in b to a
4280// double-precision (64-bit) floating-point element, store the result in the
4281// lower element of dst, and copy the upper element from a to the upper element
4282// of dst.
4283//
4284// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
4285// dst[127:64] := a[127:64]
4286//
4287// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
4288FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4289{
4290 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4291#if defined(__aarch64__)
4292 return vreinterpretq_m128d_f64(
4293 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4294#else
4295 return vreinterpretq_m128d_s64(
4296 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4297#endif
4298}
4299
4300// Convert packed double-precision (64-bit) floating-point elements in a to
4301// packed 32-bit integers with truncation, and store the results in dst.
4302// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
4303FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4304{
4305 double a0 = ((double *) &a)[0];
4306 double a1 = ((double *) &a)[1];
4307 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4308}
4309
4310// Convert packed double-precision (64-bit) floating-point elements in a to
4311// packed 32-bit integers with truncation, and store the results in dst.
4312// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
4313FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4314{
4315 double a0 = ((double *) &a)[0];
4316 double a1 = ((double *) &a)[1];
4317 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4318 return vreinterpret_m64_s32(vld1_s32(data));
4319}
4320
4321// Converts the four single-precision, floating-point values of a to signed
4322// 32-bit integer values using truncate.
4323// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4324FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4325{
4326 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4327}
4328
4329// Convert the lower double-precision (64-bit) floating-point element in a to a
4330// 32-bit integer with truncation, and store the result in dst.
4331//
4332// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4333//
4334// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
4335FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4336{
4337 double ret = *((double *) &a);
4338 return (int32_t) ret;
4339}
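// _mm_cvtsd_si32 above honours the current rounding mode, while
// _mm_cvttsd_si32 always truncates toward zero: with the default
// round-to-nearest mode, 2.7 converts to 3 and 2 respectively. Caller-side
// sketch (illustrative only):
static inline int rounding_vs_truncation_example(void)
{
    __m128d v = _mm_set_pd(0.0, 2.7);               // lower lane = 2.7
    return _mm_cvtsd_si32(v) - _mm_cvttsd_si32(v);  // 3 - 2 == 1
}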
4340
4341// Convert the lower double-precision (64-bit) floating-point element in a to a
4342// 64-bit integer with truncation, and store the result in dst.
4343//
4344// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4345//
4346// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
4347FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4348{
4349#if defined(__aarch64__)
4350 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4351#else
4352 double ret = *((double *) &a);
4353 return (int64_t) ret;
4354#endif
4355}
4356
4357// Convert the lower double-precision (64-bit) floating-point element in a to a
4358// 64-bit integer with truncation, and store the result in dst.
4359//
4360// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4361//
4362// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
4363#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4364
4365// Divide packed double-precision (64-bit) floating-point elements in a by
4366// packed elements in b, and store the results in dst.
4367//
4368// FOR j := 0 to 1
4369// i := 64*j
4370// dst[i+63:i] := a[i+63:i] / b[i+63:i]
4371// ENDFOR
4372//
4373// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
4374FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4375{
4376#if defined(__aarch64__)
4377 return vreinterpretq_m128d_f64(
4378 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4379#else
4380 double *da = (double *) &a;
4381 double *db = (double *) &b;
4382 double c[2];
4383 c[0] = da[0] / db[0];
4384 c[1] = da[1] / db[1];
4385 return vld1q_f32((float32_t *) c);
4386#endif
4387}
4388
4389// Divide the lower double-precision (64-bit) floating-point element in a by the
4390// lower double-precision (64-bit) floating-point element in b, store the result
4391// in the lower element of dst, and copy the upper element from a to the upper
4392// element of dst.
4393// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
4394FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4395{
4396#if defined(__aarch64__)
4397 float64x2_t tmp =
4398 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4399 return vreinterpretq_m128d_f64(
4400 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4401#else
4402 return _mm_move_sd(a, _mm_div_pd(a, b));
4403#endif
4404}
4405
4406// Extracts the selected signed or unsigned 16-bit integer from a and zero
4407// extends.
4408// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
4409// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4410#define _mm_extract_epi16(a, imm) \
4411 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4412
4413// Inserts the least significant 16 bits of b into the selected 16-bit integer
4414// of a.
4415// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
4416// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4417// __constrange(0,8) int imm)
4418#define _mm_insert_epi16(a, b, imm) \
4419 __extension__({ \
4420 vreinterpretq_m128i_s16( \
4421 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4422 })
4423
4424// Loads two double-precision, floating-point values from 16-byte aligned
4425// memory.
4426//
4427// dst[127:0] := MEM[mem_addr+127:mem_addr]
4428//
4429// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
4430FORCE_INLINE __m128d _mm_load_pd(const double *p)
4431{
4432#if defined(__aarch64__)
4433 return vreinterpretq_m128d_f64(vld1q_f64(p));
4434#else
4435 const float *fp = (const float *) p;
4436 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4437 return vreinterpretq_m128d_f32(vld1q_f32(data));
4438#endif
4439}
4440
4441// Load a double-precision (64-bit) floating-point element from memory into both
4442// elements of dst.
4443//
4444// dst[63:0] := MEM[mem_addr+63:mem_addr]
4445// dst[127:64] := MEM[mem_addr+63:mem_addr]
4446//
4447// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4448#define _mm_load_pd1 _mm_load1_pd
4449
4450// Load a double-precision (64-bit) floating-point element from memory into the
4451// lower of dst, and zero the upper element. mem_addr does not need to be
4452// aligned on any particular boundary.
4453//
4454// dst[63:0] := MEM[mem_addr+63:mem_addr]
4455// dst[127:64] := 0
4456//
4457// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
4458FORCE_INLINE __m128d _mm_load_sd(const double *p)
4459{
4460#if defined(__aarch64__)
4461 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4462#else
4463 const float *fp = (const float *) p;
4464 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4465 return vreinterpretq_m128d_f32(vld1q_f32(data));
4466#endif
4467}
4468
4469// Loads a 128-bit value.
4470// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4471FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4472{
4473 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4474}
4475
4476// Load a double-precision (64-bit) floating-point element from memory into both
4477// elements of dst.
4478//
4479// dst[63:0] := MEM[mem_addr+63:mem_addr]
4480// dst[127:64] := MEM[mem_addr+63:mem_addr]
4481//
4482// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4483FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4484{
4485#if defined(__aarch64__)
4486 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4487#else
4488 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4489#endif
4490}
4491
4492// Load a double-precision (64-bit) floating-point element from memory into the
4493// upper element of dst, and copy the lower element from a to dst. mem_addr does
4494// not need to be aligned on any particular boundary.
4495//
4496// dst[63:0] := a[63:0]
4497// dst[127:64] := MEM[mem_addr+63:mem_addr]
4498//
4499// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4500FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4501{
4502#if defined(__aarch64__)
4503 return vreinterpretq_m128d_f64(
4504 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4505#else
4506 return vreinterpretq_m128d_f32(vcombine_f32(
4507 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4508#endif
4509}
4510
4511// Load 64-bit integer from memory into the first element of dst.
4512// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
4513FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4514{
4515 /* Load the lower 64 bits of the value pointed to by p into the
4516 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4517 */
4518 return vreinterpretq_m128i_s32(
4519 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4520}
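// A common caller-side use is pulling eight bytes from (possibly unaligned)
// memory and zero-extending them to 16-bit lanes; the helper name and the use
// of _mm_unpacklo_epi8/_mm_setzero_si128 from elsewhere in this header are
// illustrative only:
static inline __m128i load8_zero_extend_u16(const unsigned char *p)
{
    __m128i lo = _mm_loadl_epi64((const __m128i *) p);  // bytes in the low half
    return _mm_unpacklo_epi8(lo, _mm_setzero_si128());  // widen to eight u16 lanes
}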
4521
4522// Load a double-precision (64-bit) floating-point element from memory into the
4523// lower element of dst, and copy the upper element from a to dst. mem_addr does
4524// not need to be aligned on any particular boundary.
4525//
4526// dst[63:0] := MEM[mem_addr+63:mem_addr]
4527// dst[127:64] := a[127:64]
4528//
4529// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
4530FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4531{
4532#if defined(__aarch64__)
4533 return vreinterpretq_m128d_f64(
4534 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4535#else
4536 return vreinterpretq_m128d_f32(
4537 vcombine_f32(vld1_f32((const float *) p),
4538 vget_high_f32(vreinterpretq_f32_m128d(a))));
4539#endif
4540}
4541
4542// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4543// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4544// general-protection exception may be generated.
4545//
4546// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
4547// dst[127:64] := MEM[mem_addr+63:mem_addr]
4548//
4549// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
4550FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4551{
4552#if defined(__aarch64__)
4553 float64x2_t v = vld1q_f64(p);
4554 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4555#else
4556 int64x2_t v = vld1q_s64((const int64_t *) p);
4557 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4558#endif
4559}
4560
4561// Loads two double-precision, floating-point values from unaligned memory.
4562// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
4563FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4564{
4565 return _mm_load_pd(p);
4566}
4567
4568// Loads a 128-bit value.
4569// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4570FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4571{
4572 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4573}
4574
4575// Load unaligned 32-bit integer from memory into the first element of dst.
4576//
4577// dst[31:0] := MEM[mem_addr+31:mem_addr]
4578// dst[MAX:32] := 0
4579//
4580// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4581FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4582{
4583 return vreinterpretq_m128i_s32(
4584 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4585}
4586
4587// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4588// integers from b.
4589//
4590// r0 := (a0 * b0) + (a1 * b1)
4591// r1 := (a2 * b2) + (a3 * b3)
4592// r2 := (a4 * b4) + (a5 * b5)
4593// r3 := (a6 * b6) + (a7 * b7)
4594// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
4595FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4596{
4597 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4598 vget_low_s16(vreinterpretq_s16_m128i(b)));
4599 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4600 vget_high_s16(vreinterpretq_s16_m128i(b)));
4601
4602 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4603 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4604
4605 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4606}
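// _mm_madd_epi16 is the usual building block for 16-bit dot products: each
// 32-bit result lane holds a[2i]*b[2i] + a[2i+1]*b[2i+1]. A caller-side
// accumulation step might look like this (helper name illustrative only):
static inline __m128i dot_product_step(__m128i acc, __m128i a, __m128i b)
{
    // acc[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1] for i = 0..3
    return _mm_add_epi32(acc, _mm_madd_epi16(a, b));
}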
4607
4608// Conditionally store 8-bit integer elements from a into memory using mask
4609// (elements are not stored when the highest bit is not set in the corresponding
4610// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4611// on any particular boundary.
4612// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
4613FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4614{
4615 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4616 __m128 b = _mm_load_ps((const float *) mem_addr);
4617 int8x16_t masked =
4618 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4619 vreinterpretq_s8_m128(b));
4620 vst1q_s8((int8_t *) mem_addr, masked);
4621}
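// Note that this emulation performs a full 16-byte load, blend and store
// rather than a byte-granular non-temporal store, so all 16 bytes at mem_addr
// should be readable and writable. Caller-side sketch (the helper name is
// illustrative only):
static inline void store_masked_bytes(__m128i value, __m128i byte_mask, char *dst)
{
    // dst[i] = value[i] for every byte whose mask element has bit 7 set
    _mm_maskmoveu_si128(value, byte_mask, dst);
}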
4622
4623// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4624// signed 16-bit integers from b.
4625// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4626FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4627{
4628 return vreinterpretq_m128i_s16(
4629 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4630}
4631
4632// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4633// 16 unsigned 8-bit integers from b.
4634// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4635FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4636{
4637 return vreinterpretq_m128i_u8(
4638 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4639}
4640
4641// Compare packed double-precision (64-bit) floating-point elements in a and b,
4642// and store packed maximum values in dst.
4643// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
4644FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4645{
4646#if defined(__aarch64__)
4647#if SSE2NEON_PRECISE_MINMAX
4648 float64x2_t _a = vreinterpretq_f64_m128d(a);
4649 float64x2_t _b = vreinterpretq_f64_m128d(b);
4650 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4651#else
4652 return vreinterpretq_m128d_f64(
4653 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4654#endif
4655#else
4656 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4657 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4658 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4659 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4660 uint64_t d[2];
4661 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4662 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4663
4664 return vreinterpretq_m128d_u64(vld1q_u64(d));
4665#endif
4666}
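// SSE defines packed max per lane as effectively (a > b) ? a : b, so a NaN in
// b is returned as-is, whereas vmaxq_f64 handles NaNs differently; the
// SSE2NEON_PRECISE_MINMAX compare-and-select path above appears to exist to
// reproduce that x86 behaviour. A scalar reference of the x86 semantics
// (sketch only):
static inline double sse_style_max(double a, double b)
{
    return a > b ? a : b;  // returns b whenever the comparison is false, NaN included
}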
4667
4668// Compare the lower double-precision (64-bit) floating-point elements in a and
4669// b, store the maximum value in the lower element of dst, and copy the upper
4670// element from a to the upper element of dst.
4671// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
4672FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4673{
4674#if defined(__aarch64__)
4675 return _mm_move_sd(a, _mm_max_pd(a, b));
4676#else
4677 double *da = (double *) &a;
4678 double *db = (double *) &b;
4679 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4680 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4681#endif
4682}
4683
4684// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4685// signed 16-bit integers from b.
4686// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4687FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4688{
4689 return vreinterpretq_m128i_s16(
4690 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4691}
4692
4693// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4694// 16 unsigned 8-bit integers from b.
4695// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
4696FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4697{
4698 return vreinterpretq_m128i_u8(
4699 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4700}
4701
4702// Compare packed double-precision (64-bit) floating-point elements in a and b,
4703// and store packed minimum values in dst.
4704// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
4705FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4706{
4707#if defined(__aarch64__)
4708#if SSE2NEON_PRECISE_MINMAX
4709 float64x2_t _a = vreinterpretq_f64_m128d(a);
4710 float64x2_t _b = vreinterpretq_f64_m128d(b);
4711 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4712#else
4713 return vreinterpretq_m128d_f64(
4714 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4715#endif
4716#else
4717 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4718 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4719 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4720 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4721 uint64_t d[2];
4722 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4723 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4724 return vreinterpretq_m128d_u64(vld1q_u64(d));
4725#endif
4726}
4727
4728// Compare the lower double-precision (64-bit) floating-point elements in a and
4729// b, store the minimum value in the lower element of dst, and copy the upper
4730// element from a to the upper element of dst.
4731// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
4732FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4733{
4734#if defined(__aarch64__)
4735 return _mm_move_sd(a, _mm_min_pd(a, b));
4736#else
4737 double *da = (double *) &a;
4738 double *db = (double *) &b;
4739 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4740 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4741#endif
4742}
4743
4744// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4745// upper element.
4746//
4747// dst[63:0] := a[63:0]
4748// dst[127:64] := 0
4749//
4750// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
4751FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4752{
4753 return vreinterpretq_m128i_s64(
4754 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4755}
4756
4757// Move the lower double-precision (64-bit) floating-point element from b to the
4758// lower element of dst, and copy the upper element from a to the upper element
4759// of dst.
4760//
4761// dst[63:0] := b[63:0]
4762// dst[127:64] := a[127:64]
4763//
4764// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
4765FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4766{
4767 return vreinterpretq_m128d_f32(
4768 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4769 vget_high_f32(vreinterpretq_f32_m128d(a))));
4770}
4771
4772// NEON does not provide a version of this function.
4773// Creates a 16-bit mask from the most significant bits of the 16 signed or
4774// unsigned 8-bit integers in a and zero extends the upper bits.
4775// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
4776FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4777{
4778 // Use increasingly wide shifts+adds to collect the sign bits
4779 // together.
4780 // Since the widening shifts would be rather confusing to follow in little
4781 // endian, everything will be illustrated in big endian order instead. This
4782 // has a different result - the bits would actually be reversed on a big
4783 // endian machine.
4784
4785 // Starting input (only half the elements are shown):
4786 // 89 ff 1d c0 00 10 99 33
4787 uint8x16_t input = vreinterpretq_u8_m128i(a);
4788
4789 // Shift out everything but the sign bits with an unsigned shift right.
4790 //
4791 // Bytes of the vector:
4792 // 89 ff 1d c0 00 10 99 33
4793 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4794 // | | | | | | | |
4795 // 01 01 00 01 00 00 01 00
4796 //
4797 // Bits of first important lane(s):
4798 // 10001001 (89)
4799 // \______
4800 // |
4801 // 00000001 (01)
4802 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4803
4804 // Merge the even lanes together with a 16-bit unsigned shift right + add.
4805 // 'xx' represents garbage data which will be ignored in the final result.
4806 // In the important bytes, the add functions like a binary OR.
4807 //
4808 // 01 01 00 01 00 00 01 00
4809 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4810 // \| \| \| \|
4811 // xx 03 xx 01 xx 00 xx 02
4812 //
4813 // 00000001 00000001 (01 01)
4814 // \_______ |
4815 // \|
4816 // xxxxxxxx xxxxxx11 (xx 03)
4817 uint32x4_t paired16 =
4818 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4819
4820 // Repeat with a wider 32-bit shift + add.
4821 // xx 03 xx 01 xx 00 xx 02
4822 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4823 // 14))
4824 // \| \|
4825 // xx xx xx 0d xx xx xx 02
4826 //
4827 // 00000011 00000001 (03 01)
4828 // \\_____ ||
4829 // '----.\||
4830 // xxxxxxxx xxxx1101 (xx 0d)
4831 uint64x2_t paired32 =
4832 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4833
4834 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4835 // lanes. xx xx xx 0d xx xx xx 02
4836 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4837 // 28))
4838 // \|
4839 // xx xx xx xx xx xx xx d2
4840 //
4841 // 00001101 00000010 (0d 02)
4842 // \ \___ | |
4843 // '---. \| |
4844 // xxxxxxxx 11010010 (xx d2)
4845 uint8x16_t paired64 =
4846 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4847
4848 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4849 // xx xx xx xx xx xx xx d2
4850 // || return paired64[0]
4851 // d2
4852 // Note: Little endian would return the correct value 4b (01001011) instead.
4853 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4854}
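// Typical use of the byte mask: test whether every byte of two vectors
// matches, e.g. as the inner step of a memcmp/strchr-style loop. Caller-side
// sketch (the helper name is illustrative only):
static inline int all_bytes_equal(__m128i a, __m128i b)
{
    // each matching byte contributes one mask bit; 16 matches give 0xFFFF
    return _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF;
}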
4855
4856// Set each bit of mask dst based on the most significant bit of the
4857// corresponding packed double-precision (64-bit) floating-point element in a.
4858// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
4859FORCE_INLINE int _mm_movemask_pd(__m128d a)
4860{
4861 uint64x2_t input = vreinterpretq_u64_m128d(a);
4862 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4863 return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4864}
4865
4866// Copy the lower 64-bit integer in a to dst.
4867//
4868// dst[63:0] := a[63:0]
4869//
4870// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
4871FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4872{
4873 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4874}
4875
4876// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4877// element.
4878//
4879// dst[63:0] := a[63:0]
4880// dst[127:64] := 0
4881//
4882// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
4883FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4884{
4885 return vreinterpretq_m128i_s64(
4886 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4887}
4888
4889// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4890// a and b, and store the unsigned 64-bit results in dst.
4891//
4892// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
4893// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
4894FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4895{
4896 // vmull_u32 upcasts instead of masking, so we downcast.
4897 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4898 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4899 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4900}
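// Only the even 32-bit lanes (0 and 2) take part in this multiply; to cover
// the odd lanes as well, shift them down and multiply again. Caller-side
// sketch computing the products of lanes 1 and 3 (helper name illustrative
// only):
static inline __m128i mul_odd_lanes_u32(__m128i a, __m128i b)
{
    // lanes 1 and 3 move into positions 0 and 2, with upper halves zeroed
    return _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
}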
4901
4902// Multiply packed double-precision (64-bit) floating-point elements in a and b,
4903// and store the results in dst.
4904// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
4905FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4906{
4907#if defined(__aarch64__)
4908 return vreinterpretq_m128d_f64(
4909 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4910#else
4911 double *da = (double *) &a;
4912 double *db = (double *) &b;
4913 double c[2];
4914 c[0] = da[0] * db[0];
4915 c[1] = da[1] * db[1];
4916 return vld1q_f32((float32_t *) c);
4917#endif
4918}
4919
4920// Multiply the lower double-precision (64-bit) floating-point element in a and
4921// b, store the result in the lower element of dst, and copy the upper element
4922// from a to the upper element of dst.
4923// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
4924FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4925{
4926 return _mm_move_sd(a, _mm_mul_pd(a, b));
4927}
4928
4929// Multiply the low unsigned 32-bit integers from a and b, and store the
4930// unsigned 64-bit result in dst.
4931//
4932// dst[63:0] := a[31:0] * b[31:0]
4933//
4934// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
4935FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4936{
4937 return vreinterpret_m64_u64(vget_low_u64(
4938 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4939}
4940
4941// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4942// integers from b.
4943//
4944// r0 := (a0 * b0)[31:16]
4945// r1 := (a1 * b1)[31:16]
4946// ...
4947// r7 := (a7 * b7)[31:16]
4948//
4949// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
4950FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4951{
4952 /* FIXME: issue with large values because of result saturation */
4953 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4954 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4955 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4956 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4957 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4958 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4959 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4960 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4961 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4962 uint16x8x2_t r =
4963 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4964 return vreinterpretq_m128i_u16(r.val[1]);
4965}
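// Illustrative usage (editorial sketch, not part of the upstream header): each lane
// keeps only the upper 16 bits of the full 32-bit product.
//
//   __m128i a = _mm_set1_epi16(20000);
//   __m128i b = _mm_set1_epi16(30000);
//   __m128i r = _mm_mulhi_epi16(a, b); // 20000*30000 = 600000000 = 0x23C34600,
//                                      // so every lane of r holds 0x23C3 (9155)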
4966
4967// Multiply the packed unsigned 16-bit integers in a and b, producing
4968// intermediate 32-bit integers, and store the high 16 bits of the intermediate
4969// integers in dst.
4970// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4971FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4972{
4973 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4974 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4975 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4976#if defined(__aarch64__)
4977 uint32x4_t ab7654 =
4978 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4979 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4980 vreinterpretq_u16_u32(ab7654));
4981 return vreinterpretq_m128i_u16(r);
4982#else
4983 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4984 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4985 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4986 uint16x8x2_t r =
4987 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4988 return vreinterpretq_m128i_u16(r.val[1]);
4989#endif
4990}
4991
4992// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
4993// unsigned 16-bit integers from b.
4994//
4995// r0 := (a0 * b0)[15:0]
4996// r1 := (a1 * b1)[15:0]
4997// ...
4998// r7 := (a7 * b7)[15:0]
4999//
5000// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
5001FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
5002{
5003 return vreinterpretq_m128i_s16(
5004 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5005}
5006
5007// Compute the bitwise OR of packed double-precision (64-bit) floating-point
5008// elements in a and b, and store the results in dst.
5009// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
5010FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
5011{
5012 return vreinterpretq_m128d_s64(
5013 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
5014}
5015
5016// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
5017//
5018// r := a | b
5019//
5020// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
5021FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
5022{
5023 return vreinterpretq_m128i_s32(
5024 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5025}
5026
5027// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
5028// saturates.
5029// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
5030FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
5031{
5032 return vreinterpretq_m128i_s8(
5033 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
5034 vqmovn_s16(vreinterpretq_s16_m128i(b))));
5035}
5036
5037// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
5038// and saturates.
5039//
5040// r0 := SignedSaturate(a0)
5041// r1 := SignedSaturate(a1)
5042// r2 := SignedSaturate(a2)
5043// r3 := SignedSaturate(a3)
5044// r4 := SignedSaturate(b0)
5045// r5 := SignedSaturate(b1)
5046// r6 := SignedSaturate(b2)
5047// r7 := SignedSaturate(b3)
5048//
5049// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
5050FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5051{
5052 return vreinterpretq_m128i_s16(
5053 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5054 vqmovn_s32(vreinterpretq_s32_m128i(b))));
5055}
5056
5057// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
5058// integers and saturates.
5059//
5060// r0 := UnsignedSaturate(a0)
5061// r1 := UnsignedSaturate(a1)
5062// ...
5063// r7 := UnsignedSaturate(a7)
5064// r8 := UnsignedSaturate(b0)
5065// r9 := UnsignedSaturate(b1)
5066// ...
5067// r15 := UnsignedSaturate(b7)
5068//
5069// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5070FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5071{
5072 return vreinterpretq_m128i_u8(
5073 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5074 vqmovun_s16(vreinterpretq_s16_m128i(b))));
5075}
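// Illustrative usage (editorial sketch, not part of the upstream header): values are
// clamped to the unsigned 8-bit range before packing.
//
//   __m128i a = _mm_set1_epi16(300);    // > 255, saturates to 255
//   __m128i b = _mm_set1_epi16(-5);     // < 0, saturates to 0
//   __m128i r = _mm_packus_epi16(a, b); // low 8 bytes = 255, high 8 bytes = 0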
5076
5077// Pause the processor. This is typically used in spin-wait loops and depending
5078// on the x86 processor typical values are in the 40-100 cycle range. The
5079// 'yield' instruction isn't a good fit because it's effectively a nop on most
5080// Arm cores. Experience with several databases has shown an 'isb' is
5081// a reasonable approximation.
5082FORCE_INLINE void _mm_pause()
5083{
5084 __asm__ __volatile__("isb\n");
5085}
5086
5087// Compute the absolute differences of packed unsigned 8-bit integers in a and
5088// b, then horizontally sum each consecutive 8 differences to produce two
5089// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
5090// 16 bits of 64-bit elements in dst.
5091// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
5092FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
5093{
5094 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
5095 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
5096}
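// Illustrative usage (editorial sketch, not part of the upstream header): each 64-bit
// half of dst receives the sum of the eight absolute byte differences of that half.
//
//   __m128i a = _mm_set1_epi8(10);
//   __m128i b = _mm_set1_epi8(3);
//   __m128i r = _mm_sad_epu8(a, b); // each 64-bit element holds 8 * |10 - 3| = 56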
5097
5098// Sets the 8 signed 16-bit integer values.
5099// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
5100FORCE_INLINE __m128i _mm_set_epi16(short i7,
5101 short i6,
5102 short i5,
5103 short i4,
5104 short i3,
5105 short i2,
5106 short i1,
5107 short i0)
5108{
5109 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
5110 return vreinterpretq_m128i_s16(vld1q_s16(data));
5111}
5112
5113// Sets the 4 signed 32-bit integer values.
5114// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
5115FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
5116{
5117 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
5118 return vreinterpretq_m128i_s32(vld1q_s32(data));
5119}
5120
5121// Returns the __m128i structure with its two 64-bit integer values
5122// initialized to the values of the two 64-bit integers passed in.
5123// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5124FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
5125{
5126 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
5127}
5128
5129// Returns the __m128i structure with its two 64-bit integer values
5130// initialized to the values of the two 64-bit integers passed in.
5131// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5132FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
5133{
5134 return vreinterpretq_m128i_s64(
5135 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5136}
5137
5138// Sets the 16 signed 8-bit integer values.
5139// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
5140FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
5141 signed char b14,
5142 signed char b13,
5143 signed char b12,
5144 signed char b11,
5145 signed char b10,
5146 signed char b9,
5147 signed char b8,
5148 signed char b7,
5149 signed char b6,
5150 signed char b5,
5151 signed char b4,
5152 signed char b3,
5153 signed char b2,
5154 signed char b1,
5155 signed char b0)
5156{
5157 int8_t ALIGN_STRUCT(16)
5158 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5159 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5160 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5161 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5162 return (__m128i) vld1q_s8(data);
5163}
5164
5165// Set packed double-precision (64-bit) floating-point elements in dst with the
5166// supplied values.
5167// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
5168FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
5169{
5170 double ALIGN_STRUCT(16) data[2] = {e0, e1};
5171#if defined(__aarch64__)
5172 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5173#else
5174 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
5175#endif
5176}
5177
5178// Broadcast double-precision (64-bit) floating-point value a to all elements of
5179// dst.
5180// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
5181#define _mm_set_pd1 _mm_set1_pd
5182
5183// Copy double-precision (64-bit) floating-point element a to the lower element
5184// of dst, and zero the upper element.
5185// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
5186FORCE_INLINE __m128d _mm_set_sd(double a)
5187{
5188 return _mm_set_pd(0, a);
5189}
5190
5191// Sets the 8 signed 16-bit integer values to w.
5192//
5193// r0 := w
5194// r1 := w
5195// ...
5196// r7 := w
5197//
5198// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
5199FORCE_INLINE __m128i _mm_set1_epi16(short w)
5200{
5201 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5202}
5203
5204// Sets the 4 signed 32-bit integer values to i.
5205//
5206// r0 := i
5207// r1 := i
5208// r2 := i
5209// r3 := i
5210//
5211// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
5212FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5213{
5214 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5215}
5216
5217// Sets the 2 signed 64-bit integer values to i.
5218// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
5219FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5220{
5221 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
5222}
5223
5224// Sets the 2 signed 64-bit integer values to i.
5225// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
5226FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5227{
5228 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5229}
5230
5231// Sets the 16 signed 8-bit integer values to b.
5232//
5233// r0 := b
5234// r1 := b
5235// ...
5236// r15 := b
5237//
5238// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
5239FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5240{
5241 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5242}
5243
5244// Broadcast double-precision (64-bit) floating-point value a to all elements of
5245// dst.
5246// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
5247FORCE_INLINE __m128d _mm_set1_pd(double d)
5248{
5249#if defined(__aarch64__)
5250 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5251#else
5252 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
5253#endif
5254}
5255
5256// Sets the 8 signed 16-bit integer values in reverse order.
5257//
5258// Return Value
5259// r0 := w0
5260// r1 := w1
5261// ...
5262// r7 := w7
5263FORCE_INLINE __m128i _mm_setr_epi16(short w0,
5264 short w1,
5265 short w2,
5266 short w3,
5267 short w4,
5268 short w5,
5269 short w6,
5270 short w7)
5271{
5272 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5273 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5274}
5275
5276// Sets the 4 signed 32-bit integer values in reverse order
5277// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
5278FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5279{
5280 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5281 return vreinterpretq_m128i_s32(vld1q_s32(data));
5282}
5283
5284// Set packed 64-bit integers in dst with the supplied values in reverse order.
5285// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
5286FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5287{
5288 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5289}
5290
5291// Sets the 16 signed 8-bit integer values in reverse order.
5292// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
5293FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5294 signed char b1,
5295 signed char b2,
5296 signed char b3,
5297 signed char b4,
5298 signed char b5,
5299 signed char b6,
5300 signed char b7,
5301 signed char b8,
5302 signed char b9,
5303 signed char b10,
5304 signed char b11,
5305 signed char b12,
5306 signed char b13,
5307 signed char b14,
5308 signed char b15)
5309{
5310 int8_t ALIGN_STRUCT(16)
5311 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5312 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5313 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5314 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5315 return (__m128i) vld1q_s8(data);
5316}
5317
5318// Set packed double-precision (64-bit) floating-point elements in dst with the
5319// supplied values in reverse order.
5320// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
5321FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5322{
5323 return _mm_set_pd(e0, e1);
5324}
5325
5326// Return vector of type __m128d with all elements set to zero.
5327// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
5328FORCE_INLINE __m128d _mm_setzero_pd(void)
5329{
5330#if defined(__aarch64__)
5331 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5332#else
5333 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5334#endif
5335}
5336
5337// Sets the 128-bit value to zero
5338// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
5339FORCE_INLINE __m128i _mm_setzero_si128(void)
5340{
5341 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5342}
5343
5344// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
5345// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
5346// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5347// __constrange(0,255) int imm)
5348#if __has_builtin(__builtin_shufflevector)
5349#define _mm_shuffle_epi32(a, imm) \
5350 __extension__({ \
5351 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5352 int32x4_t _shuf = __builtin_shufflevector( \
5353 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5354 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5355 vreinterpretq_m128i_s32(_shuf); \
5356 })
5357#else // generic
5358#define _mm_shuffle_epi32(a, imm) \
5359 __extension__({ \
5360 __m128i ret; \
5361 switch (imm) { \
5362 case _MM_SHUFFLE(1, 0, 3, 2): \
5363 ret = _mm_shuffle_epi_1032((a)); \
5364 break; \
5365 case _MM_SHUFFLE(2, 3, 0, 1): \
5366 ret = _mm_shuffle_epi_2301((a)); \
5367 break; \
5368 case _MM_SHUFFLE(0, 3, 2, 1): \
5369 ret = _mm_shuffle_epi_0321((a)); \
5370 break; \
5371 case _MM_SHUFFLE(2, 1, 0, 3): \
5372 ret = _mm_shuffle_epi_2103((a)); \
5373 break; \
5374 case _MM_SHUFFLE(1, 0, 1, 0): \
5375 ret = _mm_shuffle_epi_1010((a)); \
5376 break; \
5377 case _MM_SHUFFLE(1, 0, 0, 1): \
5378 ret = _mm_shuffle_epi_1001((a)); \
5379 break; \
5380 case _MM_SHUFFLE(0, 1, 0, 1): \
5381 ret = _mm_shuffle_epi_0101((a)); \
5382 break; \
5383 case _MM_SHUFFLE(2, 2, 1, 1): \
5384 ret = _mm_shuffle_epi_2211((a)); \
5385 break; \
5386 case _MM_SHUFFLE(0, 1, 2, 2): \
5387 ret = _mm_shuffle_epi_0122((a)); \
5388 break; \
5389 case _MM_SHUFFLE(3, 3, 3, 2): \
5390 ret = _mm_shuffle_epi_3332((a)); \
5391 break; \
5392 case _MM_SHUFFLE(0, 0, 0, 0): \
5393 ret = _mm_shuffle_epi32_splat((a), 0); \
5394 break; \
5395 case _MM_SHUFFLE(1, 1, 1, 1): \
5396 ret = _mm_shuffle_epi32_splat((a), 1); \
5397 break; \
5398 case _MM_SHUFFLE(2, 2, 2, 2): \
5399 ret = _mm_shuffle_epi32_splat((a), 2); \
5400 break; \
5401 case _MM_SHUFFLE(3, 3, 3, 3): \
5402 ret = _mm_shuffle_epi32_splat((a), 3); \
5403 break; \
5404 default: \
5405 ret = _mm_shuffle_epi32_default((a), (imm)); \
5406 break; \
5407 } \
5408 ret; \
5409 })
5410#endif
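// Illustrative usage (editorial sketch, not part of the upstream header): the imm8
// control is usually built with _MM_SHUFFLE(z, y, x, w), where w selects dst element 0
// and z selects dst element 3.
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);                     // elements: 1, 2, 3, 4
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // elements: 4, 3, 2, 1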
5411
5412// Shuffle double-precision (64-bit) floating-point elements using the control
5413// in imm8, and store the results in dst.
5414//
5415// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
5416// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
5417//
5418// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
5419#if __has_builtin(__builtin_shufflevector)
5420#define _mm_shuffle_pd(a, b, imm8) \
5421 vreinterpretq_m128d_s64(__builtin_shufflevector( \
5422 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5423 ((imm8 & 0x2) >> 1) + 2))
5424#else
5425#define _mm_shuffle_pd(a, b, imm8) \
5426 _mm_castsi128_pd(_mm_set_epi64x( \
5427 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5428 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5429#endif
5430
5431// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5432// __constrange(0,255) int imm)
5433#if __has_builtin(__builtin_shufflevector)
5434#define _mm_shufflehi_epi16(a, imm) \
5435 __extension__({ \
5436 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5437 int16x8_t _shuf = __builtin_shufflevector( \
5438 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5439 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5440 (((imm) >> 6) & 0x3) + 4); \
5441 vreinterpretq_m128i_s16(_shuf); \
5442 })
5443#else // generic
5444#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5445#endif
5446
5447// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5448// __constrange(0,255) int imm)
5449#if __has_builtin(__builtin_shufflevector)
5450#define _mm_shufflelo_epi16(a, imm) \
5451 __extension__({ \
5452 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5453 int16x8_t _shuf = __builtin_shufflevector( \
5454 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5455 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5456 vreinterpretq_m128i_s16(_shuf); \
5457 })
5458#else // generic
5459#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5460#endif
5461
5462// Shift packed 16-bit integers in a left by count while shifting in zeros, and
5463// store the results in dst.
5464//
5465// FOR j := 0 to 7
5466// i := j*16
5467// IF count[63:0] > 15
5468// dst[i+15:i] := 0
5469// ELSE
5470// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
5471// FI
5472// ENDFOR
5473//
5474// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
5475FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5476{
5477 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5478 if (_sse2neon_unlikely(c & ~15))
5479 return _mm_setzero_si128();
5480
5481 int16x8_t vc = vdupq_n_s16((int16_t) c);
5482 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5483}
5484
5485// Shift packed 32-bit integers in a left by count while shifting in zeros, and
5486// store the results in dst.
5487//
5488// FOR j := 0 to 3
5489// i := j*32
5490// IF count[63:0] > 31
5491// dst[i+31:i] := 0
5492// ELSE
5493// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
5494// FI
5495// ENDFOR
5496//
5497// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
5498FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5499{
5500 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5501 if (_sse2neon_unlikely(c & ~31))
5502 return _mm_setzero_si128();
5503
5504 int32x4_t vc = vdupq_n_s32((int32_t) c);
5505 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5506}
5507
5508// Shift packed 64-bit integers in a left by count while shifting in zeros, and
5509// store the results in dst.
5510//
5511// FOR j := 0 to 1
5512// i := j*64
5513// IF count[63:0] > 63
5514// dst[i+63:i] := 0
5515// ELSE
5516// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
5517// FI
5518// ENDFOR
5519//
5520// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
5521FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5522{
5523 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5524 if (_sse2neon_unlikely(c & ~63))
5525 return _mm_setzero_si128();
5526
5527 int64x2_t vc = vdupq_n_s64((int64_t) c);
5528 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5529}
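// Illustrative usage for the _mm_sll_epi16/32/64 family (editorial sketch, not part of
// the upstream header): the whole low 64 bits of `count` form one shift amount that is
// applied to every lane, and a count wider than the lane zeroes the result.
//
//   __m128i v = _mm_set1_epi32(1);
//   __m128i r = _mm_sll_epi32(v, _mm_cvtsi32_si128(4));  // every lane becomes 16
//   __m128i z = _mm_sll_epi32(v, _mm_cvtsi32_si128(40)); // count > 31, all lanes are 0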
5530
5531// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5532// store the results in dst.
5533//
5534// FOR j := 0 to 7
5535// i := j*16
5536// IF imm8[7:0] > 15
5537// dst[i+15:i] := 0
5538// ELSE
5539// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
5540// FI
5541// ENDFOR
5542//
5543// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
5544FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5545{
5546 if (_sse2neon_unlikely(imm & ~15))
5547 return _mm_setzero_si128();
5548 return vreinterpretq_m128i_s16(
5549 vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
5550}
5551
5552// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5553// store the results in dst.
5554//
5555// FOR j := 0 to 3
5556// i := j*32
5557// IF imm8[7:0] > 31
5558// dst[i+31:i] := 0
5559// ELSE
5560// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
5561// FI
5562// ENDFOR
5563//
5564// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
5565FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5566{
5567 if (_sse2neon_unlikely(imm & ~31))
5568 return _mm_setzero_si128();
5569 return vreinterpretq_m128i_s32(
5570 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5571}
5572
5573// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5574// store the results in dst.
5575//
5576// FOR j := 0 to 1
5577// i := j*64
5578// IF imm8[7:0] > 63
5579// dst[i+63:i] := 0
5580// ELSE
5581// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
5582// FI
5583// ENDFOR
5584//
5585// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
5586FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5587{
5588 if (_sse2neon_unlikely(imm & ~63))
5589 return _mm_setzero_si128();
5590 return vreinterpretq_m128i_s64(
5591 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5592}
5593
5594// Shift a left by imm8 bytes while shifting in zeros, and store the results in
5595// dst.
5596//
5597// tmp := imm8[7:0]
5598// IF tmp > 15
5599// tmp := 16
5600// FI
5601// dst[127:0] := a[127:0] << (tmp*8)
5602//
5603// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
5604FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
5605{
5606 if (_sse2neon_unlikely(imm & ~15))
5607 return _mm_setzero_si128();
5608 uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)};
5609 return vreinterpretq_m128i_u8(
5610 vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
5611}
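// Illustrative usage (editorial sketch, not part of the upstream header): the shift is
// by whole bytes across the full 128-bit register, not per lane.
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1); // elements: 1, 2, 3, 4
//   __m128i r = _mm_slli_si128(v, 4);      // elements: 0, 1, 2, 3 (one 32-bit slot up)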
5612
5613// Compute the square root of packed double-precision (64-bit) floating-point
5614// elements in a, and store the results in dst.
5615// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
5616FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5617{
5618#if defined(__aarch64__)
5619 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5620#else
5621 double a0 = sqrt(((double *) &a)[0]);
5622 double a1 = sqrt(((double *) &a)[1]);
5623 return _mm_set_pd(a1, a0);
5624#endif
5625}
5626
5627// Compute the square root of the lower double-precision (64-bit) floating-point
5628// element in b, store the result in the lower element of dst, and copy the
5629// upper element from a to the upper element of dst.
5630// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
5631FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5632{
5633#if defined(__aarch64__)
5634 return _mm_move_sd(a, _mm_sqrt_pd(b));
5635#else
5636 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5637#endif
5638}
5639
5640// Shift packed 16-bit integers in a right by count while shifting in sign bits,
5641// and store the results in dst.
5642//
5643// FOR j := 0 to 7
5644// i := j*16
5645// IF count[63:0] > 15
5646// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5647// ELSE
5648// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
5649// FI
5650// ENDFOR
5651//
5652// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
5653FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5654{
5655 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5656 if (_sse2neon_unlikely(c & ~15))
5657 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5658 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5659}
5660
5661// Shift packed 32-bit integers in a right by count while shifting in sign bits,
5662// and store the results in dst.
5663//
5664// FOR j := 0 to 3
5665// i := j*32
5666// IF count[63:0] > 31
5667// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5668// ELSE
5669// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
5670// FI
5671// ENDFOR
5672//
5673// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
5674FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5675{
5676 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5677 if (_sse2neon_unlikely(c & ~31))
5678 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5679 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5680}
5681
5682// Shift packed 16-bit integers in a right by imm8 while shifting in sign
5683// bits, and store the results in dst.
5684//
5685// FOR j := 0 to 7
5686// i := j*16
5687// IF imm8[7:0] > 15
5688// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5689// ELSE
5690// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
5691// FI
5692// ENDFOR
5693//
5694// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
5695FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5696{
5697 const int count = (imm & ~15) ? 15 : imm;
5698 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5699}
5700
5701// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5702// and store the results in dst.
5703//
5704// FOR j := 0 to 3
5705// i := j*32
5706// IF imm8[7:0] > 31
5707// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5708// ELSE
5709// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
5710// FI
5711// ENDFOR
5712//
5713// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
5714// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5715#define _mm_srai_epi32(a, imm) \
5716 __extension__({ \
5717 __m128i ret; \
5718 if (_sse2neon_unlikely((imm) == 0)) { \
5719 ret = a; \
5720 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5721 ret = vreinterpretq_m128i_s32( \
5722 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
5723 } else { \
5724 ret = vreinterpretq_m128i_s32( \
5725 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5726 } \
5727 ret; \
5728 })
5729
5730// Shift packed 16-bit integers in a right by count while shifting in zeros, and
5731// store the results in dst.
5732//
5733// FOR j := 0 to 7
5734// i := j*16
5735// IF count[63:0] > 15
5736// dst[i+15:i] := 0
5737// ELSE
5738// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
5739// FI
5740// ENDFOR
5741//
5742// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
5743FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5744{
5745 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5746 if (_sse2neon_unlikely(c & ~15))
5747 return _mm_setzero_si128();
5748
5749 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5750 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5751}
5752
5753// Shift packed 32-bit integers in a right by count while shifting in zeros, and
5754// store the results in dst.
5755//
5756// FOR j := 0 to 3
5757// i := j*32
5758// IF count[63:0] > 31
5759// dst[i+31:i] := 0
5760// ELSE
5761// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
5762// FI
5763// ENDFOR
5764//
5765// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
5766FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5767{
5768 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5769 if (_sse2neon_unlikely(c & ~31))
5770 return _mm_setzero_si128();
5771
5772 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5773 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5774}
5775
5776// Shift packed 64-bit integers in a right by count while shifting in zeros, and
5777// store the results in dst.
5778//
5779// FOR j := 0 to 1
5780// i := j*64
5781// IF count[63:0] > 63
5782// dst[i+63:i] := 0
5783// ELSE
5784// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
5785// FI
5786// ENDFOR
5787//
5788// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
5789FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5790{
5791 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5792 if (_sse2neon_unlikely(c & ~63))
5793 return _mm_setzero_si128();
5794
5795 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5796 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5797}
5798
5799// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5800// store the results in dst.
5801//
5802// FOR j := 0 to 7
5803// i := j*16
5804// IF imm8[7:0] > 15
5805// dst[i+15:i] := 0
5806// ELSE
5807// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
5808// FI
5809// ENDFOR
5810//
5811// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
5812#define _mm_srli_epi16(a, imm) \
5813 __extension__({ \
5814 __m128i ret; \
5815 if (_sse2neon_unlikely((imm) & ~15)) { \
5816 ret = _mm_setzero_si128(); \
5817 } else { \
5818 ret = vreinterpretq_m128i_u16( \
5819 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
5820 } \
5821 ret; \
5822 })
5823
5824// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5825// store the results in dst.
5826//
5827// FOR j := 0 to 3
5828// i := j*32
5829// IF imm8[7:0] > 31
5830// dst[i+31:i] := 0
5831// ELSE
5832// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
5833// FI
5834// ENDFOR
5835//
5836// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
5837// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5838#define _mm_srli_epi32(a, imm) \
5839 __extension__({ \
5840 __m128i ret; \
5841 if (_sse2neon_unlikely((imm) & ~31)) { \
5842 ret = _mm_setzero_si128(); \
5843 } else { \
5844 ret = vreinterpretq_m128i_u32( \
5845 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
5846 } \
5847 ret; \
5848 })
5849
5850// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5851// store the results in dst.
5852//
5853// FOR j := 0 to 1
5854// i := j*64
5855// IF imm8[7:0] > 63
5856// dst[i+63:i] := 0
5857// ELSE
5858// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
5859// FI
5860// ENDFOR
5861//
5862// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
5863#define _mm_srli_epi64(a, imm) \
5864 __extension__({ \
5865 __m128i ret; \
5866 if (_sse2neon_unlikely((imm) & ~63)) { \
5867 ret = _mm_setzero_si128(); \
5868 } else { \
5869 ret = vreinterpretq_m128i_u64( \
5870 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
5871 } \
5872 ret; \
5873 })
5874
5875// Shift a right by imm8 bytes while shifting in zeros, and store the results in
5876// dst.
5877//
5878// tmp := imm8[7:0]
5879// IF tmp > 15
5880// tmp := 16
5881// FI
5882// dst[127:0] := a[127:0] >> (tmp*8)
5883//
5884// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
5885FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
5886{
5887 if (_sse2neon_unlikely(imm & ~15))
5888 return _mm_setzero_si128();
5889 uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)};
5890 return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
5891}
5892
5893// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5894// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5895// or a general-protection exception may be generated.
5896// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
5897FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5898{
5899#if defined(__aarch64__)
5900 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5901#else
5902 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5903#endif
5904}
5905
5906// Store the lower double-precision (64-bit) floating-point element from a into
5907// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5908// boundary or a general-protection exception may be generated.
5909// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
5910FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5911{
5912#if defined(__aarch64__)
5913 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5914 vst1q_f64((float64_t *) mem_addr,
5915 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5916#else
5917 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5918 vst1q_f32((float32_t *) mem_addr,
5919 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5920#endif
5921}
5922
5923// Store the lower double-precision (64-bit) floating-point element from a into
5924// memory. mem_addr does not need to be aligned on any particular boundary.
5925// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
5926FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5927{
5928#if defined(__aarch64__)
5929 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5930#else
5931 vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5932#endif
5933}
5934
5935// Stores four 32-bit integer values (as a __m128i value) at the address p.
5936// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
5937FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5938{
5939 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5940}
5941
5942// Store the lower double-precision (64-bit) floating-point element from a into
5943// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5944// boundary or a general-protection exception may be generated.
5945// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
5946#define _mm_store1_pd _mm_store_pd1
5947
5948// Store the upper double-precision (64-bit) floating-point element from a into
5949// memory.
5950//
5951// MEM[mem_addr+63:mem_addr] := a[127:64]
5952//
5953// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
5954FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5955{
5956#if defined(__aarch64__)
5957 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5958#else
5959 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5960#endif
5961}
5962
5963// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
5964// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
5965FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5966{
5967 vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
5968}
5969
5970// Store the lower double-precision (64-bit) floating-point element from a into
5971// memory.
5972//
5973// MEM[mem_addr+63:mem_addr] := a[63:0]
5974//
5975// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
5976FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5977{
5978#if defined(__aarch64__)
5979 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5980#else
5981 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5982#endif
5983}
5984
5985// Store 2 double-precision (64-bit) floating-point elements from a into memory
5986// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5987// general-protection exception may be generated.
5988//
5989// MEM[mem_addr+63:mem_addr] := a[127:64]
5990// MEM[mem_addr+127:mem_addr+64] := a[63:0]
5991//
5992// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
5993FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5994{
5995 float32x4_t f = vreinterpretq_f32_m128d(a);
5996 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5997}
5998
5999// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
6000// elements) from a into memory. mem_addr does not need to be aligned on any
6001// particular boundary.
6002// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
6003FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
6004{
6005 _mm_store_pd(mem_addr, a);
6006}
6007
6008// Stores 128-bits of integer data a at the address p.
6009// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
6010FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
6011{
6012 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
6013}
6014
6015// Stores 32-bits of integer data a at the address p.
6016// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
6017FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
6018{
6019 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
6020}
6021
6022// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
6023// elements) from a into memory using a non-temporal memory hint. mem_addr must
6024// be aligned on a 16-byte boundary or a general-protection exception may be
6025// generated.
6026// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
6027FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
6028{
6029#if __has_builtin(__builtin_nontemporal_store)
6030 __builtin_nontemporal_store(a, (float32x4_t *) p);
6031#elif defined(__aarch64__)
6032 vst1q_f64(p, vreinterpretq_f64_m128d(a));
6033#else
6034 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
6035#endif
6036}
6037
6038// Stores the data in a to the address p without polluting the caches. If the
6039// cache line containing address p is already in the cache, the cache will be
6040// updated.
6041// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
6042FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
6043{
6044#if __has_builtin(__builtin_nontemporal_store)
6045 __builtin_nontemporal_store(a, p);
6046#else
6047 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
6048#endif
6049}
6050
6051// Store 32-bit integer a into memory using a non-temporal hint to minimize
6052// cache pollution. If the cache line containing address mem_addr is already in
6053// the cache, the cache will be updated.
6054// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
6055FORCE_INLINE void _mm_stream_si32(int *p, int a)
6056{
6057 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
6058}
6059
6060// Store 64-bit integer a into memory using a non-temporal hint to minimize
6061// cache pollution. If the cache line containing address mem_addr is already in
6062// the cache, the cache will be updated.
6063// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
6064FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
6065{
6066 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
6067}
6068
6069// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
6070// store the results in dst.
6071// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
6072FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
6073{
6074 return vreinterpretq_m128i_s16(
6075 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6076}
6077
6078// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
6079// unsigned 32-bit integers of a.
6080//
6081// r0 := a0 - b0
6082// r1 := a1 - b1
6083// r2 := a2 - b2
6084// r3 := a3 - b3
6085//
6086// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
6087FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
6088{
6089 return vreinterpretq_m128i_s32(
6090 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6091}
6092
6093// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
6094// and store the results in dst.
6095// r0 := a0 - b0
6096// r1 := a1 - b1
6097FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
6098{
6099 return vreinterpretq_m128i_s64(
6100 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
6101}
6102
6103// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
6104// store the results in dst.
6105// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
6106FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
6107{
6108 return vreinterpretq_m128i_s8(
6109 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6110}
6111
6112// Subtract packed double-precision (64-bit) floating-point elements in b from
6113// packed double-precision (64-bit) floating-point elements in a, and store the
6114// results in dst.
6115//
6116// FOR j := 0 to 1
6117// i := j*64
6118// dst[i+63:i] := a[i+63:i] - b[i+63:i]
6119// ENDFOR
6120//
6121// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
6122FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
6123{
6124#if defined(__aarch64__)
6125 return vreinterpretq_m128d_f64(
6126 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6127#else
6128 double *da = (double *) &a;
6129 double *db = (double *) &b;
6130 double c[2];
6131 c[0] = da[0] - db[0];
6132 c[1] = da[1] - db[1];
6133 return vld1q_f32((float32_t *) c);
6134#endif
6135}
6136
6137// Subtract the lower double-precision (64-bit) floating-point element in b from
6138// the lower double-precision (64-bit) floating-point element in a, store the
6139// result in the lower element of dst, and copy the upper element from a to the
6140// upper element of dst.
6141// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
6142FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
6143{
6144 return _mm_move_sd(a, _mm_sub_pd(a, b));
6145}
6146
6147// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
6148//
6149// dst[63:0] := a[63:0] - b[63:0]
6150//
6151// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
6152FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
6153{
6154 return vreinterpret_m64_s64(
6155 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
6156}
6157
6158// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
6159// of a and saturates.
6160//
6161// r0 := SignedSaturate(a0 - b0)
6162// r1 := SignedSaturate(a1 - b1)
6163// ...
6164// r7 := SignedSaturate(a7 - b7)
6165//
6166// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
6167FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
6168{
6169 return vreinterpretq_m128i_s16(
6170 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6171}
6172
6173// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
6174// of a and saturates.
6175//
6176// r0 := SignedSaturate(a0 - b0)
6177// r1 := SignedSaturate(a1 - b1)
6178// ...
6179// r15 := SignedSaturate(a15 - b15)
6180//
6181// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
6182FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
6183{
6184 return vreinterpretq_m128i_s8(
6185 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6186}
6187
6188// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
6189// integers of a and saturates.
6190// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
6191FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
6192{
6193 return vreinterpretq_m128i_u16(
6194 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
6195}
6196
6197// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
6198// integers of a and saturates.
6199//
6200// r0 := UnsignedSaturate(a0 - b0)
6201// r1 := UnsignedSaturate(a1 - b1)
6202// ...
6203// r15 := UnsignedSaturate(a15 - b15)
6204//
6205// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
6206FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
6207{
6208 return vreinterpretq_m128i_u8(
6209 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
6210}
6211
6212#define _mm_ucomieq_sd _mm_comieq_sd
6213#define _mm_ucomige_sd _mm_comige_sd
6214#define _mm_ucomigt_sd _mm_comigt_sd
6215#define _mm_ucomile_sd _mm_comile_sd
6216#define _mm_ucomilt_sd _mm_comilt_sd
6217#define _mm_ucomineq_sd _mm_comineq_sd
6218
6219// Return vector of type __m128d with undefined elements.
6220// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
6221FORCE_INLINE __m128d _mm_undefined_pd(void)
6222{
6223#if defined(__GNUC__) || defined(__clang__)
6224#pragma GCC diagnostic push
6225#pragma GCC diagnostic ignored "-Wuninitialized"
6226#endif
6227 __m128d a;
6228 return a;
6229#if defined(__GNUC__) || defined(__clang__)
6230#pragma GCC diagnostic pop
6231#endif
6232}
6233
6234// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
6235// upper 4 signed or unsigned 16-bit integers in b.
6236//
6237// r0 := a4
6238// r1 := b4
6239// r2 := a5
6240// r3 := b5
6241// r4 := a6
6242// r5 := b6
6243// r6 := a7
6244// r7 := b7
6245//
6246// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
6247FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6248{
6249#if defined(__aarch64__)
6250 return vreinterpretq_m128i_s16(
6251 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6252#else
6253 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6254 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6255 int16x4x2_t result = vzip_s16(a1, b1);
6256 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6257#endif
6258}
6259
6260// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6261// upper 2 signed or unsigned 32-bit integers in b.
6262// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
6263FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6264{
6265#if defined(__aarch64__)
6266 return vreinterpretq_m128i_s32(
6267 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6268#else
6269 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6270 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6271 int32x2x2_t result = vzip_s32(a1, b1);
6272 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6273#endif
6274}
6275
6276// Interleaves the upper signed or unsigned 64-bit integer in a with the
6277// upper signed or unsigned 64-bit integer in b.
6278//
6279// r0 := a1
6280// r1 := b1
6281FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6282{
6283 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6284 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6285 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6286}
6287
6288// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6289// 8 signed or unsigned 8-bit integers in b.
6290//
6291// r0 := a8
6292// r1 := b8
6293// r2 := a9
6294// r3 := b9
6295// ...
6296// r14 := a15
6297// r15 := b15
6298//
6299// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
6300FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6301{
6302#if defined(__aarch64__)
6303 return vreinterpretq_m128i_s8(
6304 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6305#else
6306 int8x8_t a1 =
6307 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6308 int8x8_t b1 =
6309 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6310 int8x8x2_t result = vzip_s8(a1, b1);
6311 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6312#endif
6313}
6314
6315// Unpack and interleave double-precision (64-bit) floating-point elements from
6316// the high half of a and b, and store the results in dst.
6317//
6318// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6319// dst[63:0] := src1[127:64]
6320// dst[127:64] := src2[127:64]
6321// RETURN dst[127:0]
6322// }
6323// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6324//
6325// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
6326FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6327{
6328#if defined(__aarch64__)
6329 return vreinterpretq_m128d_f64(
6330 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6331#else
6332 return vreinterpretq_m128d_s64(
6333 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6334 vget_high_s64(vreinterpretq_s64_m128d(b))));
6335#endif
6336}
6337
6338// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6339// lower 4 signed or unsigned 16-bit integers in b.
6340//
6341// r0 := a0
6342// r1 := b0
6343// r2 := a1
6344// r3 := b1
6345// r4 := a2
6346// r5 := b2
6347// r6 := a3
6348// r7 := b3
6349//
6350// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6351FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6352{
6353#if defined(__aarch64__)
6354 return vreinterpretq_m128i_s16(
6355 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6356#else
6357 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6358 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6359 int16x4x2_t result = vzip_s16(a1, b1);
6360 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6361#endif
6362}
6363
6364// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
6365// lower 2 signed or unsigned 32-bit integers in b.
6366//
6367// r0 := a0
6368// r1 := b0
6369// r2 := a1
6370// r3 := b1
6371//
6372// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6373FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6374{
6375#if defined(__aarch64__)
6376 return vreinterpretq_m128i_s32(
6377 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6378#else
6379 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6380 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6381 int32x2x2_t result = vzip_s32(a1, b1);
6382 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6383#endif
6384}
6385
6386FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6387{
6388 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6389 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6390 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6391}
6392
6393// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6394// 8 signed or unsigned 8-bit integers in b.
6395//
6396// r0 := a0
6397// r1 := b0
6398// r2 := a1
6399// r3 := b1
6400// ...
6401// r14 := a7
6402// r15 := b7
6403//
6404// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6405FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6406{
6407#if defined(__aarch64__)
6408 return vreinterpretq_m128i_s8(
6409 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6410#else
6411 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6412 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6413 int8x8x2_t result = vzip_s8(a1, b1);
6414 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6415#endif
6416}
6417
6418// Unpack and interleave double-precision (64-bit) floating-point elements from
6419// the low half of a and b, and store the results in dst.
6420//
6421// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6422// dst[63:0] := src1[63:0]
6423// dst[127:64] := src2[63:0]
6424// RETURN dst[127:0]
6425// }
6426// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6427//
6428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
6429FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6430{
6431#if defined(__aarch64__)
6432 return vreinterpretq_m128d_f64(
6433 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6434#else
6435 return vreinterpretq_m128d_s64(
6436 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6437 vget_low_s64(vreinterpretq_s64_m128d(b))));
6438#endif
6439}
6440
6441// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6442// elements in a and b, and store the results in dst.
6443//
6444// FOR j := 0 to 1
6445// i := j*64
6446// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6447// ENDFOR
6448//
6449// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
6450FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6451{
6452 return vreinterpretq_m128d_s64(
6453 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6454}
6455
6456// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
6457// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
6458FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6459{
6460 return vreinterpretq_m128i_s32(
6461 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6462}
6463
6464/* SSE3 */
6465
6466// Alternatively add and subtract packed double-precision (64-bit)
6467// floating-point elements in a to/from packed elements in b, and store the
6468// results in dst.
6469//
6470// FOR j := 0 to 1
6471// i := j*64
6472// IF ((j & 1) == 0)
6473// dst[i+63:i] := a[i+63:i] - b[i+63:i]
6474// ELSE
6475// dst[i+63:i] := a[i+63:i] + b[i+63:i]
6476// FI
6477// ENDFOR
6478//
6479// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
6480FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6481{
6482 _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
6483#if defined(__aarch64__)
6484 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6485 vreinterpretq_f64_m128d(b),
6486 vreinterpretq_f64_m128d(mask)));
6487#else
6488 return _mm_add_pd(_mm_mul_pd(b, mask), a);
6489#endif
6490}
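// Illustrative usage (editorial sketch, not part of the upstream header): the lower
// element is a subtraction and the upper element is an addition.
//
//   __m128d a = _mm_set_pd(2.0, 1.0);   // upper = 2.0, lower = 1.0
//   __m128d b = _mm_set_pd(20.0, 10.0); // upper = 20.0, lower = 10.0
//   __m128d r = _mm_addsub_pd(a, b);    // lower = 1.0 - 10.0 = -9.0, upper = 2.0 + 20.0 = 22.0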
6491
6492// Alternatively add and subtract packed single-precision (32-bit)
6493// floating-point elements in a to/from packed elements in b, and store the
6494// results in dst.
6495// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
6496FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6497{
6498 _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
6499#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6500 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6501 vreinterpretq_f32_m128(mask),
6502 vreinterpretq_f32_m128(b)));
6503#else
6504 return _mm_add_ps(_mm_mul_ps(b, mask), a);
6505#endif
6506}
6507
6508// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6509// elements in a and b, and pack the results in dst.
6510// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
6511FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6512{
6513#if defined(__aarch64__)
6514 return vreinterpretq_m128d_f64(
6515 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6516#else
6517 double *da = (double *) &a;
6518 double *db = (double *) &b;
6519 double c[] = {da[0] + da[1], db[0] + db[1]};
6520 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6521#endif
6522}
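// Illustrative usage (editorial sketch, not part of the upstream header): each output
// element is the sum of one input's pair of elements.
//
//   __m128d a = _mm_set_pd(2.0, 1.0);   // a = {1.0, 2.0}
//   __m128d b = _mm_set_pd(20.0, 10.0); // b = {10.0, 20.0}
//   __m128d r = _mm_hadd_pd(a, b);      // r = {1.0 + 2.0, 10.0 + 20.0} = {3.0, 30.0}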
6523
6524// Computes pairwise add of each argument as single-precision, floating-point
6525// values a and b.
6526// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
6527FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6528{
6529#if defined(__aarch64__)
6530 return vreinterpretq_m128_f32(
6531 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6532#else
6533 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6534 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6535 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6536 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6537 return vreinterpretq_m128_f32(
6538 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6539#endif
6540}
6541
6542// Horizontally subtract adjacent pairs of double-precision (64-bit)
6543// floating-point elements in a and b, and pack the results in dst.
6544// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
6545FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
6546{
6547#if defined(__aarch64__)
6548 float64x2_t a = vreinterpretq_f64_m128d(_a);
6549 float64x2_t b = vreinterpretq_f64_m128d(_b);
6550 return vreinterpretq_m128d_f64(
6551 vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
6552#else
6553 double *da = (double *) &_a;
6554 double *db = (double *) &_b;
6555 double c[] = {da[0] - da[1], db[0] - db[1]};
6556 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6557#endif
6558}
6559
6560// Horizontally subtract adjacent pairs of single-precision (32-bit)
6561// floating-point elements in a and b, and pack the results in dst.
6562// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
6563FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6564{
6565 float32x4_t a = vreinterpretq_f32_m128(_a);
6566 float32x4_t b = vreinterpretq_f32_m128(_b);
6567#if defined(__aarch64__)
6568 return vreinterpretq_m128_f32(
6569 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6570#else
6571 float32x4x2_t c = vuzpq_f32(a, b);
6572 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6573#endif
6574}
6575
6576// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
6577// may perform better than _mm_loadu_si128 when the data crosses a cache line
6578// boundary.
6579//
6580// dst[127:0] := MEM[mem_addr+127:mem_addr]
6581//
6582// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
6583#define _mm_lddqu_si128 _mm_loadu_si128
6584
6585// Load a double-precision (64-bit) floating-point element from memory into both
6586// elements of dst.
6587//
6588// dst[63:0] := MEM[mem_addr+63:mem_addr]
6589// dst[127:64] := MEM[mem_addr+63:mem_addr]
6590//
6591// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
6592#define _mm_loaddup_pd _mm_load1_pd
6593
6594// Duplicate the low double-precision (64-bit) floating-point element from a,
6595// and store the results in dst.
6596// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
6597FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6598{
6599#if defined(__aarch64__)
6600 return vreinterpretq_m128d_f64(
6601 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6602#else
6603 return vreinterpretq_m128d_u64(
6604 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6605#endif
6606}
6607
6608// Duplicate odd-indexed single-precision (32-bit) floating-point elements
6609// from a, and store the results in dst.
6610// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
6611FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6612{
6613#if __has_builtin(__builtin_shufflevector)
6614 return vreinterpretq_m128_f32(__builtin_shufflevector(
6615 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6616#else
6617 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6618 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6619 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6620 return vreinterpretq_m128_f32(vld1q_f32(data));
6621#endif
6622}
6623
6624// Duplicate even-indexed single-precision (32-bit) floating-point elements
6625// from a, and store the results in dst.
6626// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
6627FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6628{
6629#if __has_builtin(__builtin_shufflevector)
6630 return vreinterpretq_m128_f32(__builtin_shufflevector(
6631 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6632#else
6633 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6634 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6635 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6636 return vreinterpretq_m128_f32(vld1q_f32(data));
6637#endif
6638}
6639
6640/* SSSE3 */
6641
6642// Compute the absolute value of packed signed 16-bit integers in a, and store
6643// the unsigned results in dst.
6644//
6645// FOR j := 0 to 7
6646// i := j*16
6647// dst[i+15:i] := ABS(a[i+15:i])
6648// ENDFOR
6649//
6650// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
6651FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6652{
6653 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6654}
6655
6656// Compute the absolute value of packed signed 32-bit integers in a, and store
6657// the unsigned results in dst.
6658//
6659// FOR j := 0 to 3
6660// i := j*32
6661// dst[i+31:i] := ABS(a[i+31:i])
6662// ENDFOR
6663//
6664// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
6665FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6666{
6667 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6668}
6669
6670// Compute the absolute value of packed signed 8-bit integers in a, and store
6671// the unsigned results in dst.
6672//
6673// FOR j := 0 to 15
6674// i := j*8
6675// dst[i+7:i] := ABS(a[i+7:i])
6676// ENDFOR
6677//
6678// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
6679FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6680{
6681 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6682}
6683
6684// Compute the absolute value of packed signed 16-bit integers in a, and store
6685// the unsigned results in dst.
6686//
6687// FOR j := 0 to 3
6688// i := j*16
6689// dst[i+15:i] := ABS(a[i+15:i])
6690// ENDFOR
6691//
6692// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
6693FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6694{
6695 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6696}
6697
6698// Compute the absolute value of packed signed 32-bit integers in a, and store
6699// the unsigned results in dst.
6700//
6701// FOR j := 0 to 1
6702// i := j*32
6703// dst[i+31:i] := ABS(a[i+31:i])
6704// ENDFOR
6705//
6706// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
6707FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6708{
6709 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6710}
6711
6712// Compute the absolute value of packed signed 8-bit integers in a, and store
6713// the unsigned results in dst.
6714//
6715// FOR j := 0 to 7
6716// i := j*8
6717// dst[i+7:i] := ABS(a[i+7:i])
6718// ENDFOR
6719//
6720// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
6721FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6722{
6723 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6724}
6725
6726// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6727// the result right by imm8 bytes, and store the low 16 bytes in dst.
6728//
6729// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
6730// dst[127:0] := tmp[127:0]
6731//
6732// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
6733FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
6734{
6735 if (_sse2neon_unlikely(imm & ~31))
6736 return _mm_setzero_si128();
6737 int idx;
6738 uint8x16_t tmp[2];
6739 if (imm >= 16) {
6740 idx = imm - 16;
6741 tmp[0] = vreinterpretq_u8_m128i(a);
6742 tmp[1] = vdupq_n_u8(0);
6743 } else {
6744 idx = imm;
6745 tmp[0] = vreinterpretq_u8_m128i(b);
6746 tmp[1] = vreinterpretq_u8_m128i(a);
6747 }
6748 return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx));
6749}
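// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). A common pattern is forming a byte-shifted window
// across two adjacent 16-byte blocks, e.g. for unaligned streaming reads:
static inline __m128i sse2neon_example_alignr_window(__m128i hi, __m128i lo)
{
    // Concatenate hi:lo and shift right by 4 bytes: the result holds
    // bytes 4..15 of lo followed by bytes 0..3 of hi.
    return _mm_alignr_epi8(hi, lo, 4);
}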
6750
6751// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6752// the result right by imm8 bytes, and store the low 8 bytes in dst.
6753//
6754// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
6755// dst[63:0] := tmp[63:0]
6756//
6757// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
6758#define _mm_alignr_pi8(a, b, imm) \
6759 __extension__({ \
6760 __m64 ret; \
6761 if (_sse2neon_unlikely((imm) >= 16)) { \
6762 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6763 } else { \
6764 uint8x8_t tmp_low, tmp_high; \
6765 if ((imm) >= 8) { \
6766 const int idx = (imm) -8; \
6767 tmp_low = vreinterpret_u8_m64(a); \
6768 tmp_high = vdup_n_u8(0); \
6769 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6770 } else { \
6771 const int idx = (imm); \
6772 tmp_low = vreinterpret_u8_m64(b); \
6773 tmp_high = vreinterpret_u8_m64(a); \
6774 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6775 } \
6776 } \
6777 ret; \
6778 })
6779
6780// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6781// signed 16-bit results in dst.
6782FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6783{
6784 int16x8_t a = vreinterpretq_s16_m128i(_a);
6785 int16x8_t b = vreinterpretq_s16_m128i(_b);
6786#if defined(__aarch64__)
6787 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6788#else
6789 return vreinterpretq_m128i_s16(
6790 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6791 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6792#endif
6793}
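// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). Repeated _mm_hadd_epi16 calls give a log2(n)-step
// horizontal reduction of all eight 16-bit lanes:
static inline int16_t sse2neon_example_reduce_add_epi16(__m128i v)
{
    // Each round halves the number of distinct partial sums; after three
    // rounds lane 0 holds the (wrapping) sum of all eight input lanes.
    v = _mm_hadd_epi16(v, v);
    v = _mm_hadd_epi16(v, v);
    v = _mm_hadd_epi16(v, v);
    return (int16_t) _mm_extract_epi16(v, 0);
}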
6794
6795// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6796// signed 32-bit results in dst.
6797FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6798{
6799 int32x4_t a = vreinterpretq_s32_m128i(_a);
6800 int32x4_t b = vreinterpretq_s32_m128i(_b);
6801 return vreinterpretq_m128i_s32(
6802 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6803 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6804}
6805
6806// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6807// signed 16-bit results in dst.
6808// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
6810{
6811 return vreinterpret_m64_s16(
6812 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6813}
6814
6815// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6816// signed 32-bit results in dst.
6817// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
6819{
6820 return vreinterpret_m64_s32(
6821 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6822}
6823
6824// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6825// saturation, and pack the signed 16-bit results in dst.
6826FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6827{
6828#if defined(__aarch64__)
6829 int16x8_t a = vreinterpretq_s16_m128i(_a);
6830 int16x8_t b = vreinterpretq_s16_m128i(_b);
6831 return vreinterpretq_s64_s16(
6832 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6833#else
6834 int32x4_t a = vreinterpretq_s32_m128i(_a);
6835 int32x4_t b = vreinterpretq_s32_m128i(_b);
6836 // Interleave using vshrn/vmovn
6837 // [a0|a2|a4|a6|b0|b2|b4|b6]
6838 // [a1|a3|a5|a7|b1|b3|b5|b7]
6839 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6840 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6841 // Saturated add
6842 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6843#endif
6844}
6845
6846// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6847// saturation, and pack the signed 16-bit results in dst.
6848// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
6850{
6851 int16x4_t a = vreinterpret_s16_m64(_a);
6852 int16x4_t b = vreinterpret_s16_m64(_b);
6853#if defined(__aarch64__)
6854 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6855#else
6856 int16x4x2_t res = vuzp_s16(a, b);
6857 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6858#endif
6859}
6860
6861// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6862// the signed 16-bit results in dst.
6863// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
6865{
6866 int16x8_t a = vreinterpretq_s16_m128i(_a);
6867 int16x8_t b = vreinterpretq_s16_m128i(_b);
6868#if defined(__aarch64__)
6869 return vreinterpretq_m128i_s16(
6870 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6871#else
6872 int16x8x2_t c = vuzpq_s16(a, b);
6873 return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6874#endif
6875}
6876
6877// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6878// the signed 32-bit results in dst.
6879// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
6881{
6882 int32x4_t a = vreinterpretq_s32_m128i(_a);
6883 int32x4_t b = vreinterpretq_s32_m128i(_b);
6884#if defined(__aarch64__)
6885 return vreinterpretq_m128i_s32(
6886 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6887#else
6888 int32x4x2_t c = vuzpq_s32(a, b);
6889 return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6890#endif
6891}
6892
6893// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6894// the signed 16-bit results in dst.
6895// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
6897{
6898 int16x4_t a = vreinterpret_s16_m64(_a);
6899 int16x4_t b = vreinterpret_s16_m64(_b);
6900#if defined(__aarch64__)
6901 return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6902#else
6903 int16x4x2_t c = vuzp_s16(a, b);
6904 return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6905#endif
6906}
6907
6908// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6909// the signed 32-bit results in dst.
6910// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi32
6911FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6912{
6913 int32x2_t a = vreinterpret_s32_m64(_a);
6914 int32x2_t b = vreinterpret_s32_m64(_b);
6915#if defined(__aarch64__)
6916 return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6917#else
6918 int32x2x2_t c = vuzp_s32(a, b);
6919 return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6920#endif
6921}
6922
6923// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6924// using saturation, and pack the signed 16-bit results in dst.
6925// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
6926FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6927{
6928 int16x8_t a = vreinterpretq_s16_m128i(_a);
6929 int16x8_t b = vreinterpretq_s16_m128i(_b);
6930#if defined(__aarch64__)
6931 return vreinterpretq_m128i_s16(
6932 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6933#else
6934 int16x8x2_t c = vuzpq_s16(a, b);
6935 return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6936#endif
6937}
6938
6939// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6940// using saturation, and pack the signed 16-bit results in dst.
6941// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
6943{
6944 int16x4_t a = vreinterpret_s16_m64(_a);
6945 int16x4_t b = vreinterpret_s16_m64(_b);
6946#if defined(__aarch64__)
6947 return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6948#else
6949 int16x4x2_t c = vuzp_s16(a, b);
6950 return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6951#endif
6952}
6953
6954// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6955// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6956// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6957// and pack the saturated results in dst.
6958//
6959// FOR j := 0 to 7
6960// i := j*16
6961// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
6962// a[i+7:i]*b[i+7:i] )
6963// ENDFOR
6964FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6965{
6966#if defined(__aarch64__)
6967 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6968 int8x16_t b = vreinterpretq_s8_m128i(_b);
6969 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6970 vmovl_s8(vget_low_s8(b)));
6971 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6972 vmovl_s8(vget_high_s8(b)));
6973 return vreinterpretq_m128i_s16(
6974 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6975#else
6976 // This would be much simpler if x86 would choose to zero extend OR sign
6977 // extend, not both. This could probably be optimized better.
6978 uint16x8_t a = vreinterpretq_u16_m128i(_a);
6979 int16x8_t b = vreinterpretq_s16_m128i(_b);
6980
6981 // Zero extend a
6982 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6983 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6984
6985 // Sign extend by shifting left then shifting right.
6986 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6987 int16x8_t b_odd = vshrq_n_s16(b, 8);
6988
6989 // multiply
6990 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6991 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6992
6993 // saturated add
6994 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6995#endif
6996}
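// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). Combined with _mm_madd_epi16, this intrinsic forms
// the usual u8 x s8 dot-product kernel:
static inline __m128i sse2neon_example_u8s8_dot4(__m128i a_u8, __m128i b_s8)
{
    // Pairs of byte products, saturated to 16 bits ...
    __m128i pairs = _mm_maddubs_epi16(a_u8, b_s8);
    // ... then widened and summed once more, giving four 32-bit sums of
    // four byte products each.
    return _mm_madd_epi16(pairs, _mm_set1_epi16(1));
}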
6997
6998// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6999// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
7000// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
7001// pack the saturated results in dst.
7002// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
7004{
7005 uint16x4_t a = vreinterpret_u16_m64(_a);
7006 int16x4_t b = vreinterpret_s16_m64(_b);
7007
7008 // Zero extend a
7009 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
7010 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
7011
7012 // Sign extend by shifting left then shifting right.
7013 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
7014 int16x4_t b_odd = vshr_n_s16(b, 8);
7015
7016 // multiply
7017 int16x4_t prod1 = vmul_s16(a_even, b_even);
7018 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
7019
7020 // saturated add
7021 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
7022}
7023
7024// Multiply packed signed 16-bit integers in a and b, producing intermediate
7025// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
7026// the packed 16-bit integers in dst.
7027//
7028// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
7029// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
7030// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
7031// ...
7032// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
7033FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
7034{
7035 // Has issues due to saturation
7036 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
7037
7038 // Multiply
7039 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
7040 vget_low_s16(vreinterpretq_s16_m128i(b)));
7041 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
7042 vget_high_s16(vreinterpretq_s16_m128i(b)));
7043
7044 // Rounding narrowing shift right
7045 // narrow = (int16_t)((mul + 16384) >> 15);
7046 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
7047 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
7048
7049 // Join together
7050 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
7051}
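// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). _mm_mulhrs_epi16 is a rounded Q15 fixed-point
// multiply, e.g. scaling samples by a fractional gain:
static inline __m128i sse2neon_example_q15_scale(__m128i samples,
                                                 int16_t gain_q15)
{
    // Each lane becomes round(samples[i] * gain_q15 / 32768).
    return _mm_mulhrs_epi16(samples, _mm_set1_epi16(gain_q15));
}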
7052
7053// Multiply packed signed 16-bit integers in a and b, producing intermediate
7054// signed 32-bit integers. Truncate each intermediate integer to the 18 most
7055// significant bits, round by adding 1, and store bits [16:1] to dst.
7056// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
7058{
7059 int32x4_t mul_extend =
7060 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
7061
7062 // Rounding narrowing shift right
7063 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
7064}
7065
7066// Shuffle packed 8-bit integers in a according to shuffle control mask in the
7067// corresponding 8-bit element of b, and store the results in dst.
7068// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
7069FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
7070{
7071 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
7072 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
7073 uint8x16_t idx_masked =
7074 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
7075#if defined(__aarch64__)
7076 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
7077#elif defined(__GNUC__)
7078 int8x16_t ret;
7079 // %e and %f represent the even and odd D registers
7080 // respectively.
7081 __asm__ __volatile__(
7082 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
7083 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
7084 : [ret] "=&w"(ret)
7085 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
7086 return vreinterpretq_m128i_s8(ret);
7087#else
7088 // ARMv7-A fallback: split the table into two D registers and look up with vtbl2
7089 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7090 return vreinterpretq_m128i_s8(
7091 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7092 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
7093#endif
7094}
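// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). _mm_shuffle_epi8 is a byte-wise table lookup; a
// classic use is reversing the byte order of every 32-bit lane:
static inline __m128i sse2neon_example_bswap32(__m128i v)
{
    // Index byte j selects source byte a[j & 0x0F]; an index byte with its
    // high bit set would zero that output byte instead.
    const __m128i rev = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4,
                                      11, 10, 9, 8, 15, 14, 13, 12);
    return _mm_shuffle_epi8(v, rev);
}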
7095
7096// Shuffle packed 8-bit integers in a according to shuffle control mask in the
7097// corresponding 8-bit element of b, and store the results in dst.
7098//
7099// FOR j := 0 to 7
7100// i := j*8
7101// IF b[i+7] == 1
7102// dst[i+7:i] := 0
7103// ELSE
7104// index[2:0] := b[i+2:i]
7105// dst[i+7:i] := a[index*8+7:index*8]
7106// FI
7107// ENDFOR
7108//
7109// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
7111{
7112 const int8x8_t controlMask =
7113 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
7114 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
7115 return vreinterpret_m64_s8(res);
7116}
7117
7118// Negate packed 16-bit integers in a when the corresponding signed
7119// 16-bit integer in b is negative, and store the results in dst.
7120// Elements in dst are zeroed out when the corresponding element
7121// in b is zero.
7122//
7123// for i in 0..7
7124// if b[i] < 0
7125// r[i] := -a[i]
7126// else if b[i] == 0
7127// r[i] := 0
7128// else
7129// r[i] := a[i]
7130// fi
7131// done
7132FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
7133{
7134 int16x8_t a = vreinterpretq_s16_m128i(_a);
7135 int16x8_t b = vreinterpretq_s16_m128i(_b);
7136
7137 // signed shift right: faster than vclt
7138 // (b < 0) ? 0xFFFF : 0
7139 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
7140 // (b == 0) ? 0xFFFF : 0
7141#if defined(__aarch64__)
7142 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
7143#else
7144 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
7145#endif
7146
7147 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
7148 // 'a') based on ltMask
7149 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
7150 // res = masked & (~zeroMask)
7151 int16x8_t res = vbicq_s16(masked, zeroMask);
7152 return vreinterpretq_m128i_s16(res);
7153}
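// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). Passing the same vector as both operands turns
// _mm_sign_epi16 into an absolute value:
static inline __m128i sse2neon_example_abs_via_sign(__m128i a)
{
    // Negative lanes are negated, zero lanes stay zero, positive lanes pass
    // through, which is exactly _mm_abs_epi16(a).
    return _mm_sign_epi16(a, a);
}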
7154
7155// Negate packed 32-bit integers in a when the corresponding signed
7156// 32-bit integer in b is negative, and store the results in dst.
7157// Elements in dst are zeroed out when the corresponding element
7158// in b is zero.
7159//
7160// for i in 0..3
7161// if b[i] < 0
7162// r[i] := -a[i]
7163// else if b[i] == 0
7164// r[i] := 0
7165// else
7166// r[i] := a[i]
7167// fi
7168// done
7170{
7171 int32x4_t a = vreinterpretq_s32_m128i(_a);
7172 int32x4_t b = vreinterpretq_s32_m128i(_b);
7173
7174 // signed shift right: faster than vclt
7175 // (b < 0) ? 0xFFFFFFFF : 0
7176 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
7177
7178 // (b == 0) ? 0xFFFFFFFF : 0
7179#if defined(__aarch64__)
7180 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
7181#else
7182 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
7183#endif
7184
7185 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
7186 // 'a') based on ltMask
7187 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
7188 // res = masked & (~zeroMask)
7189 int32x4_t res = vbicq_s32(masked, zeroMask);
7190 return vreinterpretq_m128i_s32(res);
7191}
7192
7193// Negate packed 8-bit integers in a when the corresponding signed
7194// 8-bit integer in b is negative, and store the results in dst.
7195// Elements in dst are zeroed out when the corresponding element
7196// in b is zero.
7197//
7198// for i in 0..15
7199// if b[i] < 0
7200// r[i] := -a[i]
7201// else if b[i] == 0
7202// r[i] := 0
7203// else
7204// r[i] := a[i]
7205// fi
7206// done
7208{
7209 int8x16_t a = vreinterpretq_s8_m128i(_a);
7210 int8x16_t b = vreinterpretq_s8_m128i(_b);
7211
7212 // signed shift right: faster than vclt
7213 // (b < 0) ? 0xFF : 0
7214 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
7215
7216 // (b == 0) ? 0xFF : 0
7217#if defined(__aarch64__)
7218 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
7219#else
7220 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
7221#endif
7222
7223 // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
7224 // based on ltMask
7225 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
7226 // res = masked & (~zeroMask)
7227 int8x16_t res = vbicq_s8(masked, zeroMask);
7228
7229 return vreinterpretq_m128i_s8(res);
7230}
7231
7232// Negate packed 16-bit integers in a when the corresponding signed 16-bit
7233// integer in b is negative, and store the results in dst. Elements in dst are
7234// zeroed out when the corresponding element in b is zero.
7235//
7236// FOR j := 0 to 3
7237// i := j*16
7238// IF b[i+15:i] < 0
7239// dst[i+15:i] := -(a[i+15:i])
7240// ELSE IF b[i+15:i] == 0
7241// dst[i+15:i] := 0
7242// ELSE
7243// dst[i+15:i] := a[i+15:i]
7244// FI
7245// ENDFOR
7246//
7247// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
7249{
7250 int16x4_t a = vreinterpret_s16_m64(_a);
7251 int16x4_t b = vreinterpret_s16_m64(_b);
7252
7253 // signed shift right: faster than vclt
7254 // (b < 0) ? 0xFFFF : 0
7255 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7256
7257 // (b == 0) ? 0xFFFF : 0
7258#if defined(__aarch64__)
7259 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7260#else
7261 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7262#endif
7263
7264 // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
7265 // based on ltMask
7266 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7267 // res = masked & (~zeroMask)
7268 int16x4_t res = vbic_s16(masked, zeroMask);
7269
7270 return vreinterpret_m64_s16(res);
7271}
7272
7273// Negate packed 32-bit integers in a when the corresponding signed 32-bit
7274// integer in b is negative, and store the results in dst. Elements in dst are
7275// zeroed out when the corresponding element in b is zero.
7276//
7277// FOR j := 0 to 1
7278// i := j*32
7279// IF b[i+31:i] < 0
7280// dst[i+31:i] := -(a[i+31:i])
7281// ELSE IF b[i+31:i] == 0
7282// dst[i+31:i] := 0
7283// ELSE
7284// dst[i+31:i] := a[i+31:i]
7285// FI
7286// ENDFOR
7287//
7288// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
7290{
7291 int32x2_t a = vreinterpret_s32_m64(_a);
7292 int32x2_t b = vreinterpret_s32_m64(_b);
7293
7294 // signed shift right: faster than vclt
7295 // (b < 0) ? 0xFFFFFFFF : 0
7296 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7297
7298 // (b == 0) ? 0xFFFFFFFF : 0
7299#if defined(__aarch64__)
7300 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7301#else
7302 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7303#endif
7304
7305 // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
7306 // based on ltMask
7307 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7308 // res = masked & (~zeroMask)
7309 int32x2_t res = vbic_s32(masked, zeroMask);
7310
7311 return vreinterpret_m64_s32(res);
7312}
7313
7314// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
7315// in b is negative, and store the results in dst. Elements in dst are zeroed out
7316// when the corresponding element in b is zero.
7317//
7318// FOR j := 0 to 7
7319// i := j*8
7320// IF b[i+7:i] < 0
7321// dst[i+7:i] := -(a[i+7:i])
7322// ELSE IF b[i+7:i] == 0
7323// dst[i+7:i] := 0
7324// ELSE
7325// dst[i+7:i] := a[i+7:i]
7326// FI
7327// ENDFOR
7328//
7329// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
7331{
7332 int8x8_t a = vreinterpret_s8_m64(_a);
7333 int8x8_t b = vreinterpret_s8_m64(_b);
7334
7335 // signed shift right: faster than vclt
7336 // (b < 0) ? 0xFF : 0
7337 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7338
7339 // (b == 0) ? 0xFF : 0
7340#if defined(__aarch64__)
7341 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7342#else
7343 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7344#endif
7345
7346 // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
7347 // based on ltMask
7348 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7349 // res = masked & (~zeroMask)
7350 int8x8_t res = vbic_s8(masked, zeroMask);
7351
7352 return vreinterpret_m64_s8(res);
7353}
7354
7355/* SSE4.1 */
7356
7357// Blend packed 16-bit integers from a and b using control mask imm8, and store
7358// the results in dst.
7359//
7360// FOR j := 0 to 7
7361// i := j*16
7362// IF imm8[j]
7363// dst[i+15:i] := b[i+15:i]
7364// ELSE
7365// dst[i+15:i] := a[i+15:i]
7366// FI
7367// ENDFOR
7368// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
7369// __constrange(0,255) int imm)
7370#define _mm_blend_epi16(a, b, imm) \
7371 __extension__({ \
7372 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
7373 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
7374 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
7375 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
7376 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
7377 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
7378 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
7379 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7380 uint16x8_t _mask_vec = vld1q_u16(_mask); \
7381 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7382 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7383 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7384 })
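// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). Bit j of the immediate selects lane j from b, so
// 0xAA interleaves the even lanes of a with the odd lanes of b:
static inline __m128i sse2neon_example_blend_odd_lanes(__m128i a, __m128i b)
{
    // imm8 = 0xAA = 0b10101010: lanes 1, 3, 5, 7 come from b, the rest from a.
    return _mm_blend_epi16(a, b, 0xAA);
}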
7385
7386// Blend packed double-precision (64-bit) floating-point elements from a and b
7387// using control mask imm8, and store the results in dst.
7388// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
7389#define _mm_blend_pd(a, b, imm) \
7390 __extension__({ \
7391 const uint64_t _mask[2] = { \
7392 ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7393 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7394 uint64x2_t _mask_vec = vld1q_u64(_mask); \
7395 uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7396 uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7397 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7398 })
7399
7400// Blend packed single-precision (32-bit) floating-point elements from a and b
7401// using mask, and store the results in dst.
7402// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
7404{
7405 const uint32_t ALIGN_STRUCT(16)
7406 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7407 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7408 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7409 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7410 uint32x4_t mask = vld1q_u32(data);
7411 float32x4_t a = vreinterpretq_f32_m128(_a);
7412 float32x4_t b = vreinterpretq_f32_m128(_b);
7413 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7414}
7415
7416// Blend packed 8-bit integers from a and b using mask, and store the results in
7417// dst.
7418//
7419// FOR j := 0 to 15
7420// i := j*8
7421// IF mask[i+7]
7422// dst[i+7:i] := b[i+7:i]
7423// ELSE
7424// dst[i+7:i] := a[i+7:i]
7425// FI
7426// ENDFOR
7428{
7429 // Use a signed shift right to create a mask with the sign bit
7430 uint8x16_t mask =
7431 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7432 uint8x16_t a = vreinterpretq_u8_m128i(_a);
7433 uint8x16_t b = vreinterpretq_u8_m128i(_b);
7434 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7435}
7436
7437// Blend packed double-precision (64-bit) floating-point elements from a and b
7438// using mask, and store the results in dst.
7439// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
7441{
7442 uint64x2_t mask =
7443 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7444#if defined(__aarch64__)
7445 float64x2_t a = vreinterpretq_f64_m128d(_a);
7446 float64x2_t b = vreinterpretq_f64_m128d(_b);
7447 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7448#else
7449 uint64x2_t a = vreinterpretq_u64_m128d(_a);
7450 uint64x2_t b = vreinterpretq_u64_m128d(_b);
7451 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7452#endif
7453}
7454
7455// Blend packed single-precision (32-bit) floating-point elements from a and b
7456// using mask, and store the results in dst.
7457// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
7458FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7459{
7460 // Use a signed shift right to create a mask with the sign bit
7461 uint32x4_t mask =
7462 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7463 float32x4_t a = vreinterpretq_f32_m128(_a);
7464 float32x4_t b = vreinterpretq_f32_m128(_b);
7465 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7466}
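// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). Only the sign bit of each mask lane matters, so
// _mm_blendv_ps pairs naturally with a comparison for branch-free selection:
static inline __m128 sse2neon_example_clamp_negatives_to_zero(__m128 x)
{
    // Where x < 0 the comparison produces an all-ones lane (sign bit set),
    // so zero is selected; elsewhere x passes through unchanged.
    __m128 zero = _mm_setzero_ps();
    return _mm_blendv_ps(x, zero, _mm_cmplt_ps(x, zero));
}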
7467
7468// Round the packed double-precision (64-bit) floating-point elements in a up
7469// to an integer value, and store the results as packed double-precision
7470// floating-point elements in dst.
7471// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
7473{
7474#if defined(__aarch64__)
7475 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7476#else
7477 double *f = (double *) &a;
7478 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7479#endif
7480}
7481
7482// Round the packed single-precision (32-bit) floating-point elements in a up to
7483// an integer value, and store the results as packed single-precision
7484// floating-point elements in dst.
7485// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
7486FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7487{
7488#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7489 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7490#else
7491 float *f = (float *) &a;
7492 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7493#endif
7494}
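// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). One common use of _mm_ceil_ps is rounding values up
// to the next multiple of a step size:
static inline __m128 sse2neon_example_round_up_to_step(__m128 x, __m128 step)
{
    // ceil(x / step) * step for each lane, e.g. snapping sizes to a grid.
    return _mm_mul_ps(_mm_ceil_ps(_mm_div_ps(x, step)), step);
}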
7495
7496// Round the lower double-precision (64-bit) floating-point element in b up to
7497// an integer value, store the result as a double-precision floating-point
7498// element in the lower element of dst, and copy the upper element from a to the
7499// upper element of dst.
7500// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
7502{
7503 return _mm_move_sd(a, _mm_ceil_pd(b));
7504}
7505
7506// Round the lower single-precision (32-bit) floating-point element in b up to
7507// an integer value, store the result as a single-precision floating-point
7508// element in the lower element of dst, and copy the upper 3 packed elements
7509// from a to the upper elements of dst.
7510//
7511// dst[31:0] := CEIL(b[31:0])
7512// dst[127:32] := a[127:32]
7513//
7514// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
7516{
7517 return _mm_move_ss(a, _mm_ceil_ps(b));
7518}
7519
7520// Compare packed 64-bit integers in a and b for equality, and store the results
7521// in dst
7523{
7524#if defined(__aarch64__)
7527#else
7528 // ARMv7 lacks vceqq_u64
7529 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
7530 uint32x4_t cmp =
7532 uint32x4_t swapped = vrev64q_u32(cmp);
7533 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7534#endif
7535}
7536
7537// Converts the four signed 16-bit integers in the lower 64 bits to four signed
7538// 32-bit integers.
7540{
7542 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7543}
7544
7545// Converts the two signed 16-bit integers in the lower 32 bits to two signed
7546// 64-bit integers.
7547FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7548{
7549 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7550 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7551 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7552 return vreinterpretq_m128i_s64(s64x2);
7553}
7554
7555// Converts the two signed 32-bit integers in the lower 64 bits to two signed
7556// 64-bit integers.
7558{
7560 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7561}
7562
7563// Converts the eight signed 8-bit integers in the lower 64 bits to eight
7564// signed 16-bit integers.
7565FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
7566{
7567 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7568 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7569 return vreinterpretq_m128i_s16(s16x8);
7570}
7571
7572// Converts the four signed 8-bit integers in the lower 32 bits to four
7573// signed 32-bit integers.
7574FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7575{
7576 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7577 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7578 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7579 return vreinterpretq_m128i_s32(s32x4);
7580}
7581
7582// Converts the two signed 8-bit integers in the lower 16 bits to two
7583// signed 64-bit integers.
7584FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7585{
7586 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
7587 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7588 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7589 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7590 return vreinterpretq_m128i_s64(s64x2);
7591}
7592
7593// Converts the four unsigned 16-bit integers in the lower 64 bits to four
7594// unsigned 32-bit integers.
7596{
7598 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7599}
7600
7601// Converts the two unsigned 16-bit integers in the lower 32 bits to two
7602// unsigned 64-bit integers.
7604{
7605 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7606 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7607 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7608 return vreinterpretq_m128i_u64(u64x2);
7609}
7610
7611// Converts the two unsigned 32-bit integers in the lower 64 bits to two
7612// unsigned 64-bit integers.
7614{
7616 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7617}
7618
7619// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7620// and store the results in dst.
7621// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
7623{
7624 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
7625 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7626 return vreinterpretq_m128i_u16(u16x8);
7627}
7628
7629// Converts the four unsigned 8-bit integers in the lower 32 bits to four
7630// unsigned 32-bit integers.
7631// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
7633{
7634 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
7635 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7636 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7637 return vreinterpretq_m128i_u32(u32x4);
7638}
7639
7640// Converts the two unsigned 8-bit integers in the lower 16 bits to two
7641// unsigned 64-bit integers.
7643{
7644 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
7645 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7646 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7647 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7648 return vreinterpretq_m128i_u64(u64x2);
7649}
7650
7651// Conditionally multiply the packed double-precision (64-bit) floating-point
7652// elements in a and b using the high 4 bits in imm8, sum the four products, and
7653// conditionally store the sum in dst using the low 4 bits of imm8.
7654// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
7656{
7657 // Generate mask value from constant immediate bit value
7658 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7659 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7660#if !SSE2NEON_PRECISE_DP
7661 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7662 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7663#endif
7664 // Conditional multiplication
7665#if !SSE2NEON_PRECISE_DP
7666 __m128d mul = _mm_mul_pd(a, b);
7667 const __m128d mulMask =
7668 _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
7669 __m128d tmp = _mm_and_pd(mul, mulMask);
7670#else
7671#if defined(__aarch64__)
7672 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7673 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7674 : 0;
7675 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7676 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7677 : 0;
7678#else
7679 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
7680 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
7681#endif
7682 __m128d tmp = _mm_set_pd(d1, d0);
7683#endif
7684 // Sum the products
7685#if defined(__aarch64__)
7686 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7687#else
7688 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
7689#endif
7690 // Conditionally store the sum
7691 const __m128d sumMask =
7692 _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
7693 __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
7694 return res;
7695}
7696
7697// Conditionally multiply the packed single-precision (32-bit) floating-point
7698// elements in a and b using the high 4 bits in imm8, sum the four products,
7699// and conditionally store the sum in dst using the low 4 bits of imm.
7700// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
7701FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7702{
7703#if defined(__aarch64__)
7704 /* shortcuts */
7705 if (imm == 0xFF) {
7706 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7707 }
7708 if (imm == 0x7F) {
7709 float32x4_t m = _mm_mul_ps(a, b);
7710 m[3] = 0;
7711 return _mm_set1_ps(vaddvq_f32(m));
7712 }
7713#endif
7714
7715 float s = 0, c = 0;
7716 float32x4_t f32a = vreinterpretq_f32_m128(a);
7717 float32x4_t f32b = vreinterpretq_f32_m128(b);
7718
7719 /* To improve the accuracy of the floating-point summation, the Kahan
7720 * (compensated) summation algorithm is used for each accumulation.
7721 */
7722 if (imm & (1 << 4))
7723 _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7724 if (imm & (1 << 5))
7725 _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7726 if (imm & (1 << 6))
7727 _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7728 if (imm & (1 << 7))
7729 _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7730 s += c;
7731
7732 float32x4_t res = {
7733 (imm & 0x1) ? s : 0,
7734 (imm & 0x2) ? s : 0,
7735 (imm & 0x4) ? s : 0,
7736 (imm & 0x8) ? s : 0,
7737 };
7738 return vreinterpretq_m128_f32(res);
7739}
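// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). With imm = 0x71 the three low lanes are multiplied
// and summed and the result lands in lane 0, the usual 3-component dot
// product:
static inline float sse2neon_example_dot3(__m128 a, __m128 b)
{
    // High nibble 0x7: multiply lanes 0..2; low nibble 0x1: write the sum
    // to lane 0 only.
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
}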
7740
7741// Extracts the selected signed or unsigned 32-bit integer from a and zero
7742// extends.
7743// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7744#define _mm_extract_epi32(a, imm) \
7745 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7746
7747// Extracts the selected signed or unsigned 64-bit integer from a and zero
7748// extends.
7749// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7750#define _mm_extract_epi64(a, imm) \
7751 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7752
7753// Extracts the selected signed or unsigned 8-bit integer from a and zero
7754// extends.
7755// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
7756// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
7757#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7758
7759// Extracts the selected single-precision (32-bit) floating-point from a.
7760// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7761#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7762
7763// Round the packed double-precision (64-bit) floating-point elements in a down
7764// to an integer value, and store the results as packed double-precision
7765// floating-point elements in dst.
7766// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
7768{
7769#if defined(__aarch64__)
7770 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7771#else
7772 double *f = (double *) &a;
7773 return _mm_set_pd(floor(f[1]), floor(f[0]));
7774#endif
7775}
7776
7777// Round the packed single-precision (32-bit) floating-point elements in a down
7778// to an integer value, and store the results as packed single-precision
7779// floating-point elements in dst.
7780// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
7782{
7783#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7784 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7785#else
7786 float *f = (float *) &a;
7787 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7788#endif
7789}
7790
7791// Round the lower double-precision (64-bit) floating-point element in b down to
7792// an integer value, store the result as a double-precision floating-point
7793// element in the lower element of dst, and copy the upper element from a to the
7794// upper element of dst.
7795// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
7797{
7798 return _mm_move_sd(a, _mm_floor_pd(b));
7799}
7800
7801// Round the lower single-precision (32-bit) floating-point element in b down to
7802// an integer value, store the result as a single-precision floating-point
7803// element in the lower element of dst, and copy the upper 3 packed elements
7804// from a to the upper elements of dst.
7805//
7806// dst[31:0] := FLOOR(b[31:0])
7807// dst[127:32] := a[127:32]
7808//
7809// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
7811{
7812 return _mm_move_ss(a, _mm_floor_ps(b));
7813}
7814
7815// Inserts the least significant 32 bits of b into the selected 32-bit integer
7816// of a.
7817// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7818// __constrange(0,4) int imm)
7819#define _mm_insert_epi32(a, b, imm) \
7820 __extension__({ \
7821 vreinterpretq_m128i_s32( \
7822 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7823 })
7824
7825// Inserts the least significant 64 bits of b into the selected 64-bit integer
7826// of a.
7827// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7828// __constrange(0,2) int imm)
7829#define _mm_insert_epi64(a, b, imm) \
7830 __extension__({ \
7831 vreinterpretq_m128i_s64( \
7832 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7833 })
7834
7835// Inserts the least significant 8 bits of b into the selected 8-bit integer
7836// of a.
7837// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7838// __constrange(0,16) int imm)
7839#define _mm_insert_epi8(a, b, imm) \
7840 __extension__({ \
7841 vreinterpretq_m128i_s8( \
7842 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7843 })
7844
7845// Copy a to tmp, then insert a single-precision (32-bit) floating-point
7846// element from b into tmp using the control in imm8. Store tmp to dst using
7847// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7848// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
7849#define _mm_insert_ps(a, b, imm8) \
7850 __extension__({ \
7851 float32x4_t tmp1 = \
7852 vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
7853 vreinterpretq_f32_m128(a), 0); \
7854 float32x4_t tmp2 = \
7855 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
7856 ((imm8 >> 4) & 0x3)); \
7857 const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7858 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7859 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7860 ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
7861 uint32x4_t mask = vld1q_u32(data); \
7862 float32x4_t all_zeros = vdupq_n_f32(0); \
7863 \
7864 vreinterpretq_m128_f32( \
7865 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
7866 })
7867
7868// epi versions of min/max
7869// Computes the pairwise maximums of the four signed 32-bit integer values of a
7870// and b.
7871//
7872// A 128-bit parameter that can be defined with the following equations:
7873// r0 := (a0 > b0) ? a0 : b0
7874// r1 := (a1 > b1) ? a1 : b1
7875// r2 := (a2 > b2) ? a2 : b2
7876// r3 := (a3 > b3) ? a3 : b3
7877//
7878// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
7879FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7880{
7881 return vreinterpretq_m128i_s32(
7882 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7883}
7884
7885// Compare packed signed 8-bit integers in a and b, and store packed maximum
7886// values in dst.
7887// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
7889{
7892}
7893
7894// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7895// values in dst.
7896// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
7898{
7901}
7902
7903// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7904// values in dst.
7905// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7907{
7910}
7911
7912// Computes the pairwise minima of the four signed 32-bit integer values of a
7913// and b.
7914//
7915// A 128-bit parameter that can be defined with the following equations:
7916// r0 := (a0 < b0) ? a0 : b0
7917// r1 := (a1 < b1) ? a1 : b1
7918// r2 := (a2 < b2) ? a2 : b2
7919// r3 := (a3 < b3) ? a3 : b3
7920//
7921// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
7922FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7923{
7924 return vreinterpretq_m128i_s32(
7925 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7926}
7927
7928// Compare packed signed 8-bit integers in a and b, and store packed minimum
7929// values in dst.
7930// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
7932{
7935}
7936
7937// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7938// values in dst.
7939// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
7941{
7944}
7945
7946// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7947// values in dst.
7948// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
7949FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7950{
7951 return vreinterpretq_m128i_u32(
7952 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7953}
7954
7955// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7956// in a, store the minimum and index in dst, and zero the remaining bits in dst.
7957//
7958// index[2:0] := 0
7959// min[15:0] := a[15:0]
7960// FOR j := 0 to 7
7961// i := j*16
7962// IF a[i+15:i] < min[15:0]
7963// index[2:0] := j
7964// min[15:0] := a[i+15:i]
7965// FI
7966// ENDFOR
7967// dst[15:0] := min[15:0]
7968// dst[18:16] := index[2:0]
7969// dst[127:19] := 0
7970//
7971// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
7972FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7973{
7974 __m128i dst;
7975 uint16_t min, idx = 0;
7976 // Find the minimum value
7977#if defined(__aarch64__)
7978 min = vminvq_u16(vreinterpretq_u16_m128i(a));
7979#else
7980 __m64 tmp;
7982 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7983 vget_high_u16(vreinterpretq_u16_m128i(a))));
7985 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7987 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7988 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7989#endif
7990 // Get the index of the minimum value
7991 int i;
7992 for (i = 0; i < 8; i++) {
7993 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7994 idx = (uint16_t) i;
7995 break;
7996 }
7997 a = _mm_srli_si128(a, 2);
7998 }
7999 // Generate result
8000 dst = _mm_setzero_si128();
8001 dst = vreinterpretq_m128i_u16(
8002 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
8003 dst = vreinterpretq_m128i_u16(
8004 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
8005 return dst;
8006}
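// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). The packed result of _mm_minpos_epu16 is typically
// unpacked like this:
static inline void sse2neon_example_minpos(__m128i v,
                                           uint16_t *min_val,
                                           uint16_t *min_idx)
{
    __m128i r = _mm_minpos_epu16(v);
    // Lane 0 holds the minimum value, lane 1 (bits 18:16) holds its index.
    *min_val = (uint16_t) _mm_extract_epi16(r, 0);
    *min_idx = (uint16_t) _mm_extract_epi16(r, 1);
}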
8007
8008// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
8009// 8-bit integers in a compared to those in b, and store the 16-bit results in
8010// dst. Eight SADs are performed using one quadruplet from b and eight
8011// quadruplets from a. One quadruplet is selected from b starting at on the
8012// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
8013// integers selected from a starting at the offset specified in imm8.
8014// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
8015FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
8016{
8017 uint8x16_t _a, _b;
8018
8019 switch (imm & 0x4) {
8020 case 0:
8021 // do nothing
8022 _a = vreinterpretq_u8_m128i(a);
8023 break;
8024 case 4:
8025 _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
8027 break;
8028 default:
8029#if defined(__GNUC__) || defined(__clang__)
8030 __builtin_unreachable();
8031#endif
8032 break;
8033 }
8034
8035 switch (imm & 0x3) {
8036 case 0:
8037 _b = vreinterpretq_u8_u32(
8038 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
8039 break;
8040 case 1:
8041 _b = vreinterpretq_u8_u32(
8042 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
8043 break;
8044 case 2:
8045 _b = vreinterpretq_u8_u32(
8046 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
8047 break;
8048 case 3:
8049 _b = vreinterpretq_u8_u32(
8050 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
8051 break;
8052 default:
8053#if defined(__GNUC__) || defined(__clang__)
8054 __builtin_unreachable();
8055#endif
8056 break;
8057 }
8058
8059 int16x8_t c04, c15, c26, c37;
8060 uint8x8_t low_b = vget_low_u8(_b);
8061 c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8062 _a = vextq_u8(_a, _a, 1);
8063 c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8064 _a = vextq_u8(_a, _a, 1);
8065 c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8066 _a = vextq_u8(_a, _a, 1);
8067 c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8068#if defined(__aarch64__)
8069 // |0|4|2|6|
8070 c04 = vpaddq_s16(c04, c26);
8071 // |1|5|3|7|
8072 c15 = vpaddq_s16(c15, c37);
8073
8074 int32x4_t trn1_c =
8075 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8076 int32x4_t trn2_c =
8077 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8078 return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
8079 vreinterpretq_s16_s32(trn2_c)));
8080#else
8081 int16x4_t c01, c23, c45, c67;
8082 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
8083 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
8084 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
8085 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
8086
8087 return vreinterpretq_m128i_s16(
8088 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
8089#endif
8090}
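// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). The eight SADs produced here feed naturally into
// _mm_minpos_epu16 when searching for the best-matching byte offset, as in
// block-based motion estimation:
static inline int sse2neon_example_best_match_offset(__m128i ref, __m128i cur)
{
    // SADs between cur's first quadruplet and ref's quadruplets at byte
    // offsets 0..7 (imm = 0 selects offset 0 in both operands).
    __m128i sads = _mm_mpsadbw_epu8(ref, cur, 0);
    // _mm_minpos_epu16 returns the smallest SAD in lane 0 and its index in
    // lane 1.
    return _mm_extract_epi16(_mm_minpos_epu16(sads), 1);
}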
8091
8092// Multiply the low signed 32-bit integers from each packed 64-bit element in
8093// a and b, and store the signed 64-bit results in dst.
8094//
8095// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
8096// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
8098{
8099 // vmull_s32 upcasts instead of masking, so we downcast.
8100 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
8101 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
8102 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
8103}
8104
8105// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
8106// unsigned 32-bit integers from b.
8107// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
8109{
8112}
8113
8114// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
8115// integers and saturates.
8116//
8117// r0 := UnsignedSaturate(a0)
8118// r1 := UnsignedSaturate(a1)
8119// r2 := UnsignedSaturate(a2)
8120// r3 := UnsignedSaturate(a3)
8121// r4 := UnsignedSaturate(b0)
8122// r5 := UnsignedSaturate(b1)
8123// r6 := UnsignedSaturate(b2)
8124// r7 := UnsignedSaturate(b3)
8126{
8128 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
8129 vqmovun_s32(vreinterpretq_s32_m128i(b))));
8130}
8131
8132// Round the packed double-precision (64-bit) floating-point elements in a using
8133// the rounding parameter, and store the results as packed double-precision
8134// floating-point elements in dst.
8135// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
8137{
8138#if defined(__aarch64__)
8139 switch (rounding) {
8141 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
8143 return _mm_floor_pd(a);
8145 return _mm_ceil_pd(a);
8147 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
8148 default: //_MM_FROUND_CUR_DIRECTION
8149 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
8150 }
8151#else
8152 double *v_double = (double *) &a;
8153
8154 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8155 (rounding == _MM_FROUND_CUR_DIRECTION &&
8157 double res[2], tmp;
8158 for (int i = 0; i < 2; i++) {
8159 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
8160 double roundDown = floor(tmp); // Round down value
8161 double roundUp = ceil(tmp); // Round up value
8162 double diffDown = tmp - roundDown;
8163 double diffUp = roundUp - tmp;
8164 if (diffDown < diffUp) {
8165 /* If it's closer to the round down value, then use it */
8166 res[i] = roundDown;
8167 } else if (diffDown > diffUp) {
8168 /* If it's closer to the round up value, then use it */
8169 res[i] = roundUp;
8170 } else {
8171 /* If it's equidistant between round up and round down value,
8172 * pick the one which is an even number */
8173 double half = roundDown / 2;
8174 if (half != floor(half)) {
8175 /* If the round down value is odd, return the round up value
8176 */
8177 res[i] = roundUp;
8178 } else {
8179 /* If the round up value is odd, return the round down value
8180 */
8181 res[i] = roundDown;
8182 }
8183 }
8184 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
8185 }
8186 return _mm_set_pd(res[1], res[0]);
8187 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8188 (rounding == _MM_FROUND_CUR_DIRECTION &&
8190 return _mm_floor_pd(a);
8191 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8192 (rounding == _MM_FROUND_CUR_DIRECTION &&
8194 return _mm_ceil_pd(a);
8195 }
8196 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
8197 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
8198#endif
8199}
8200
8201// Round the packed single-precision (32-bit) floating-point elements in a using
8202// the rounding parameter, and store the results as packed single-precision
8203// floating-point elements in dst.
8204// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
8205FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
8206{
8207#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
8208 switch (rounding) {
8209 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
8210 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
8211 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
8212 return _mm_floor_ps(a);
8213 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
8214 return _mm_ceil_ps(a);
8215 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
8216 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
8217 default: //_MM_FROUND_CUR_DIRECTION
8218 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
8219 }
8220#else
8221 float *v_float = (float *) &a;
8222
8223 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8224 (rounding == _MM_FROUND_CUR_DIRECTION &&
8226 uint32x4_t signmask = vdupq_n_u32(0x80000000);
8227 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
8228 vdupq_n_f32(0.5f)); /* +/- 0.5 */
8229 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
8230 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
8231 int32x4_t r_trunc = vcvtq_s32_f32(
8232 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
8233 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
8234 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
8235 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
8236 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
8237 float32x4_t delta = vsubq_f32(
8239 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
8240 uint32x4_t is_delta_half =
8241 vceqq_f32(delta, half); /* delta == +/- 0.5 */
8243 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
8244 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8245 (rounding == _MM_FROUND_CUR_DIRECTION &&
8247 return _mm_floor_ps(a);
8248 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8249 (rounding == _MM_FROUND_CUR_DIRECTION &&
8251 return _mm_ceil_ps(a);
8252 }
8253 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
8254 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
8255 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
8256 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
8257#endif
8258}
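// Usage sketch (illustrative only; the helper below is hypothetical and not
// part of this header). The rounding-control constants are normally combined
// with _MM_FROUND_NO_EXC, e.g. round-to-nearest-even:
static inline __m128 sse2neon_example_round_nearest(__m128 x)
{
    // Ties round to the even integer, matching x86 round-to-nearest.
    return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}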
8259
8260// Round the lower double-precision (64-bit) floating-point element in b using
8261// the rounding parameter, store the result as a double-precision floating-point
8262// element in the lower element of dst, and copy the upper element from a to the
8263// upper element of dst.
8264// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
8266{
8267 return _mm_move_sd(a, _mm_round_pd(b, rounding));
8268}
8269
8270// Round the lower single-precision (32-bit) floating-point element in b using
8271// the rounding parameter, store the result as a single-precision floating-point
8272// element in the lower element of dst, and copy the upper 3 packed elements
8273// from a to the upper elements of dst. Rounding is done according to the
8274// rounding[3:0] parameter, which can be one of:
8275// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
8276// suppress exceptions
8277// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
8278// suppress exceptions
8279// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
8280// exceptions
8281// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
8282// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
8283// _MM_SET_ROUNDING_MODE
8284// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
8286{
8287 return _mm_move_ss(a, _mm_round_ps(b, rounding));
8288}
8289
8290// Load 128-bits of integer data from memory into dst using a non-temporal
8291// memory hint. mem_addr must be aligned on a 16-byte boundary or a
8292// general-protection exception may be generated.
8293//
8294// dst[127:0] := MEM[mem_addr+127:mem_addr]
8295//
8296// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
8298{
8299#if __has_builtin(__builtin_nontemporal_store)
8300 return __builtin_nontemporal_load(p);
8301#else
8302 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
8303#endif
8304}
8305
8306// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
8307// all 1's, and return 1 if the result is zero, otherwise return 0.
8308// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
8310{
8311 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
8312 ~(uint64_t) 0;
8313}
8314
8315// Compute the bitwise AND of 128 bits (representing integer data) in a and
8316// mask, and return 1 if the result is zero, otherwise return 0.
8317// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
8319{
8320 int64x2_t a_and_mask =
8322 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
8323}
8324
8325// Compute the bitwise AND of 128 bits (representing integer data) in a and
8326// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
8327// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
8328// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
8329// otherwise return 0.
8330// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros
8331FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
8332{
8333 uint64x2_t zf =
8335 uint64x2_t cf =
8337 uint64x2_t result = vandq_u64(zf, cf);
8338 return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
8339}
8340
8341// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8342// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8343// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8344// otherwise set CF to 0. Return the CF value.
8345// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
8347{
8348 int64x2_t s64 =
8349 vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
8351 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8352}
8353
8354// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8355// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8356// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8357// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
8358// otherwise return 0.
8359// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
8360#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
8361
8362// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8363// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8364// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8365// otherwise set CF to 0. Return the ZF value.
8366// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
8367FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
8368{
8369 int64x2_t s64 =
8370 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
8371 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8372}
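// Illustrative usage sketch (not part of the upstream sse2neon API): how the
// ZF/CF-style predicates above are commonly used to classify the bits of a
// vector selected by a mask. The helper name is hypothetical.
#if 0 /* usage sketch, illustrative only */
static void example_test_flags(__m128i value, __m128i mask)
{
    if (_mm_testz_si128(value, mask)) {
        /* every bit selected by mask is 0 in value (ZF == 1) */
    }
    if (_mm_testc_si128(value, mask)) {
        /* every bit selected by mask is 1 in value (CF == 1) */
    }
    if (_mm_test_mix_ones_zeros(value, mask)) {
        /* the selected bits contain both 0s and 1s */
    }
}
#endif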
8373
8374/* SSE4.2 */
8375
8376// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
8377// in b for greater than.
8378FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8379{
8380#if defined(__aarch64__)
8381 return vreinterpretq_m128i_u64(
8382 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8383#else
8384 return vreinterpretq_m128i_s64(vshrq_n_s64(
8385 vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
8386 63));
8387#endif
8388}
8389
8390// Starting with the initial value in crc, accumulates a CRC32 value for
8391// unsigned 16-bit integer v.
8392// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
8393FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8394{
8395#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8396 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8397 : [c] "+r"(crc)
8398 : [v] "r"(v));
8399#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8400 crc = __crc32ch(crc, v);
8401#else
8402 crc = _mm_crc32_u8(crc, v & 0xff);
8403 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
8404#endif
8405 return crc;
8406}
8407
8408// Starting with the initial value in crc, accumulates a CRC32 value for
8409// unsigned 32-bit integer v.
8410// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
8411FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8412{
8413#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8414 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8415 : [c] "+r"(crc)
8416 : [v] "r"(v));
8417#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8418 crc = __crc32cw(crc, v);
8419#else
8420 crc = _mm_crc32_u16(crc, v & 0xffff);
8421 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
8422#endif
8423 return crc;
8424}
8425
8426// Starting with the initial value in crc, accumulates a CRC32 value for
8427// unsigned 64-bit integer v.
8428// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
8429FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8430{
8431#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8432 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8433 : [c] "+r"(crc)
8434 : [v] "r"(v));
8435#else
8436 crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
8437 crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8438#endif
8439 return crc;
8440}
8441
8442// Starting with the initial value in crc, accumulates a CRC32 value for
8443// unsigned 8-bit integer v.
8444// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
8445FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8446{
8447#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8448 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8449 : [c] "+r"(crc)
8450 : [v] "r"(v));
8451#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8452 crc = __crc32cb(crc, v);
8453#else
8454 crc ^= v;
8455 for (int bit = 0; bit < 8; bit++) {
8456 if (crc & 1)
8457 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
8458 else
8459 crc = (crc >> 1);
8460 }
8461#endif
8462 return crc;
8463}
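// Illustrative usage sketch (not part of the upstream sse2neon API): chaining
// the CRC32-C accumulators above over a byte buffer. The 0xFFFFFFFF initial
// value and final inversion follow common CRC32-C convention and are an
// assumption of this example, not something mandated by the intrinsics.
#if 0 /* usage sketch, illustrative only */
static uint32_t crc32c_bytes(const uint8_t *data, size_t len)
{
    uint32_t crc = 0xFFFFFFFF;
    for (size_t i = 0; i < len; i++)
        crc = _mm_crc32_u8(crc, data[i]); /* accumulate one byte at a time */
    return ~crc;
}
#endif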
8464
8465/* AES */
8466
8467#if !defined(__ARM_FEATURE_CRYPTO)
8468/* clang-format off */
8469#define SSE2NEON_AES_DATA(w) \
8470 { \
8471 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8472 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8473 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8474 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8475 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8476 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8477 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8478 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8479 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8480 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8481 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8482 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8483 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8484 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8485 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8486 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8487 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8488 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8489 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8490 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8491 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8492 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8493 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8494 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8495 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8496 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8497 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8498 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8499 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8500 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8501 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8502 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8503 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8504 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8505 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8506 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8507 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8508 }
8509/* clang-format on */
8510
8511/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8512#define SSE2NEON_AES_H0(x) (x)
8513static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
8514#undef SSE2NEON_AES_H0
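// Illustrative sketch (not part of sse2neon) of the X-macro pattern used here:
// one data list is expanded through different per-element macros, yielding the
// raw S-box above and the combined lookup tables further below. All names in
// this sketch are hypothetical.
#if 0 /* illustration only */
#define EXAMPLE_DATA(w) { w(1), w(2), w(3) }
#define EXAMPLE_ID(x) (x)
#define EXAMPLE_SQUARE(x) ((x) * (x))
static const int example_raw[3]     = EXAMPLE_DATA(EXAMPLE_ID);     /* {1, 2, 3} */
static const int example_squared[3] = EXAMPLE_DATA(EXAMPLE_SQUARE); /* {1, 4, 9} */
#undef EXAMPLE_DATA
#undef EXAMPLE_ID
#undef EXAMPLE_SQUARE
#endif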
8515
8516// In the absence of crypto extensions, implement aesenc using regular neon
8517// intrinsics instead. See:
8518// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8519// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
8520// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
8521// for more information. Reproduced with permission of the author.
8522FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
8523{
8524#if defined(__aarch64__)
8525 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
8526 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
8527 0xc, 0x1, 0x6, 0xb};
8528 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8529 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
8530
8531 uint8x16_t v;
8532 uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
8533
8534 // shift rows
8535 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8536
8537 // sub bytes
8538 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
8539 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
8540 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
8541 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
8542
8543 // mix columns
8544 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8545 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8546 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8547
8548 // add round key
8549 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8550
8551#else /* ARMv7-A NEON implementation */
8552#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8553 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8554 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8555#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8556#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8557#define SSE2NEON_AES_U0(p) \
8558 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8559#define SSE2NEON_AES_U1(p) \
8560 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8561#define SSE2NEON_AES_U2(p) \
8562 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8563#define SSE2NEON_AES_U3(p) \
8564 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8565 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8566 SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
8567 SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
8568 SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
8569 SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
8570 };
8571#undef SSE2NEON_AES_B2W
8572#undef SSE2NEON_AES_F2
8573#undef SSE2NEON_AES_F3
8574#undef SSE2NEON_AES_U0
8575#undef SSE2NEON_AES_U1
8576#undef SSE2NEON_AES_U2
8577#undef SSE2NEON_AES_U3
8578
8579 uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
8580 uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
8581 uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
8582 uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
8583
8584 __m128i out = _mm_set_epi32(
8585 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8586 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8587 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8588 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8589 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8590 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8591 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8592 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8593
8594 return _mm_xor_si128(out, RoundKey);
8595#endif
8596}
8597
8598// Perform the last round of an AES encryption flow on data (state) in a using
8599// the round key in RoundKey, and store the result in dst.
8600// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8601FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8602{
8603 /* FIXME: optimized for NEON */
8604 uint8_t v[4][4] = {
8605 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
8606 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
8607 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
8608 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
8609 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
8610 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
8611 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
8612 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
8613 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
8614 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
8615 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
8616 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
8617 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
8618 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
8619 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
8620 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
8621 };
8622 for (int i = 0; i < 16; i++)
8623 vreinterpretq_nth_u8_m128i(a, i) =
8624 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
8625 return a;
8626}
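// Illustrative usage sketch (not part of the upstream sse2neon API): the usual
// way _mm_aesenc_si128 / _mm_aesenclast_si128 are chained to encrypt a single
// AES-128 block, assuming the 11 round keys rk[0..10] have already been
// expanded (e.g. with _mm_aeskeygenassist_si128 below). The helper name is
// hypothetical.
#if 0 /* usage sketch, illustrative only */
static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]);        /* initial AddRoundKey */
    for (int round = 1; round <= 9; round++)
        block = _mm_aesenc_si128(block, rk[round]);
    return _mm_aesenclast_si128(block, rk[10]); /* final round: no MixColumns */
}
#endif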
8627
8628// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8629// This instruction generates a round key for AES encryption. See
8630// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8631// for details.
8632//
8633// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
8634FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
8635{
8636 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
8637 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
8638 for (int i = 0; i < 4; ++i) {
8639 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
8640 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
8641 }
8642 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8643 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8644}
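// Illustrative usage sketch (not part of the upstream sse2neon API): one step
// of the standard AES-128 key-expansion pattern built on top of
// _mm_aeskeygenassist_si128. The helper name is hypothetical; the shuffle,
// shift and xor intrinsics are all provided elsewhere in this header.
#if 0 /* usage sketch, illustrative only */
static __m128i aes128_expand_step(__m128i prev_key, __m128i assist)
{
    /* broadcast the word carrying SubWord(RotWord(W3)) ^ rcon */
    assist = _mm_shuffle_epi32(assist, _MM_SHUFFLE(3, 3, 3, 3));
    prev_key = _mm_xor_si128(prev_key, _mm_slli_si128(prev_key, 4));
    prev_key = _mm_xor_si128(prev_key, _mm_slli_si128(prev_key, 4));
    prev_key = _mm_xor_si128(prev_key, _mm_slli_si128(prev_key, 4));
    return _mm_xor_si128(prev_key, assist);
}
/* e.g. rk[1] = aes128_expand_step(rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01)); */
#endif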
8645#undef SSE2NEON_AES_DATA
8646
8647#else /* __ARM_FEATURE_CRYPTO */
8648// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
8649// AESMC and then manually applying the real key as an xor operation. This
8650// unfortunately means an additional xor op; the compiler should be able to
8651// optimize this away for repeated calls however. See
8652// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
8653// for more details.
8654FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8655{
8656 return vreinterpretq_m128i_u8(
8657 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
8658 vreinterpretq_u8_m128i(b));
8659}
8660
8661// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8662FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8663{
8664 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8665 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8666 RoundKey);
8667}
8668
8669FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8670{
8671 // AESE does ShiftRows and SubBytes on A
8672 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
8673
8674 uint8x16_t dest = {
8675 // Undo ShiftRows step from AESE and extract X1 and X3
8676 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
8677 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
8678 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
8679 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
8680 };
8681 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
8682 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
8683}
8684#endif
8685
8686/* Others */
8687
8688// Perform a carry-less multiplication of two 64-bit integers, selected from a
8689// and b according to imm8, and store the results in dst.
8690// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
8691FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
8692{
8693 uint64x2_t a = vreinterpretq_u64_m128i(_a);
8694 uint64x2_t b = vreinterpretq_u64_m128i(_b);
8695 switch (imm & 0x11) {
8696 case 0x00:
8697 return vreinterpretq_m128i_u64(
8698 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
8699 case 0x01:
8700 return vreinterpretq_m128i_u64(
8701 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
8702 case 0x10:
8703 return vreinterpretq_m128i_u64(
8704 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
8705 case 0x11:
8706 return vreinterpretq_m128i_u64(
8707 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
8708 default:
8709 abort();
8710 }
8711}
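// Illustrative usage sketch (not part of the upstream sse2neon API): how the
// imm8 selector above picks operand halves. Bit 0 selects the low (0) or high
// (1) 64-bit half of the first operand, bit 4 does the same for the second.
#if 0 /* usage sketch, illustrative only */
static void example_clmul(__m128i a, __m128i b)
{
    __m128i lo_lo = _mm_clmulepi64_si128(a, b, 0x00); /* a[63:0]   * b[63:0]   */
    __m128i hi_lo = _mm_clmulepi64_si128(a, b, 0x01); /* a[127:64] * b[63:0]   */
    __m128i lo_hi = _mm_clmulepi64_si128(a, b, 0x10); /* a[63:0]   * b[127:64] */
    __m128i hi_hi = _mm_clmulepi64_si128(a, b, 0x11); /* a[127:64] * b[127:64] */
    (void) lo_lo; (void) hi_lo; (void) lo_hi; (void) hi_hi;
}
#endif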
8712
8713FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
8714{
8715 union {
8716 fpcr_bitfield field;
8717#if defined(__aarch64__)
8718 uint64_t value;
8719#else
8720 uint32_t value;
8721#endif
8722 } r;
8723
8724#if defined(__aarch64__)
8725 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
8726#else
8727 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
8728#endif
8729
8730 return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
8731}
8732
8733// Count the number of bits set to 1 in unsigned 32-bit integer a, and
8734// return that count in dst.
8735// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
8736FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
8737{
8738#if defined(__aarch64__)
8739#if __has_builtin(__builtin_popcount)
8740 return __builtin_popcount(a);
8741#else
8742 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
8743#endif
8744#else
8745 uint32_t count = 0;
8746 uint8x8_t input_val, count8x8_val;
8747 uint16x4_t count16x4_val;
8748 uint32x2_t count32x2_val;
8749
8750 input_val = vld1_u8((uint8_t *) &a);
8751 count8x8_val = vcnt_u8(input_val);
8752 count16x4_val = vpaddl_u8(count8x8_val);
8753 count32x2_val = vpaddl_u16(count16x4_val);
8754
8755 vst1_u32(&count, count32x2_val);
8756 return count;
8757#endif
8758}
8759
8760// Count the number of bits set to 1 in unsigned 64-bit integer a, and
8761// return that count in dst.
8762// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
8763FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
8764{
8765#if defined(__aarch64__)
8766#if __has_builtin(__builtin_popcountll)
8767 return __builtin_popcountll(a);
8768#else
8769 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
8770#endif
8771#else
8772 uint64_t count = 0;
8773 uint8x8_t input_val, count8x8_val;
8774 uint16x4_t count16x4_val;
8775 uint32x2_t count32x2_val;
8776 uint64x1_t count64x1_val;
8777
8778 input_val = vld1_u8((uint8_t *) &a);
8779 count8x8_val = vcnt_u8(input_val);
8780 count16x4_val = vpaddl_u8(count8x8_val);
8781 count32x2_val = vpaddl_u16(count16x4_val);
8782 count64x1_val = vpaddl_u32(count32x2_val);
8783 vst1_u64(&count, count64x1_val);
8784 return count;
8785#endif
8786}
8787
8788FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
8789{
8790 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
8791 // regardless of the value of the FZ bit.
8792 union {
8793 fpcr_bitfield field;
8794#if defined(__aarch64__)
8795 uint64_t value;
8796#else
8797 uint32_t value;
8798#endif
8799 } r;
8800
8801#if defined(__aarch64__)
8802 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
8803#else
8804 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
8805#endif
8806
8807 r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
8808
8809#if defined(__aarch64__)
8810 __asm__ __volatile__("msr FPCR, %0" ::"r"(r.value)); /* write */
8811#else
8812 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r.value)); /* write */
8813#endif
8814}
8815
8816// Return the current 64-bit value of the processor's time-stamp counter.
8817// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
8818
8819FORCE_INLINE uint64_t _rdtsc(void)
8820{
8821#if defined(__aarch64__)
8822 uint64_t val;
8823
8824 /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
8825 * system counter is at least 56 bits wide; from Armv8.6, the counter
8826 * must be 64 bits wide. So the system counter could be less than 64
8827 * bits wide, in which case it is reported with the flag
8828 * 'cap_user_time_short' set.
8829 */
8830 asm volatile("mrs %0, cntvct_el0" : "=r"(val));
8831
8832 return val;
8833#else
8834 uint32_t pmccntr, pmuseren, pmcntenset;
8835 // Read the user mode Performance Monitoring Unit (PMU)
8836 // User Enable Register (PMUSERENR) access permissions.
8837 asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
8838 if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
8839 asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
8840 if (pmcntenset & 0x80000000UL) { // Is it counting?
8841 asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
8842 // The counter is set up to count every 64th cycle
8843 return (uint64_t) (pmccntr) << 6;
8844 }
8845 }
8846
8847 // Fall back to a syscall as we can't enable PMUSERENR in user mode.
8848 struct timeval tv;
8849 gettimeofday(&tv, NULL);
8850 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
8851#endif
8852}
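// Illustrative usage sketch (not part of the upstream sse2neon API): timing a
// code region with _rdtsc. On AArch64 the value comes from the generic timer
// (cntvct_el0), which ticks at the frequency reported by cntfrq_el0 rather
// than at the CPU clock, so deltas are self-consistent but are not x86 TSC
// cycles. The helper name is hypothetical.
#if 0 /* usage sketch, illustrative only */
static uint64_t example_time_region(void (*fn)(void))
{
    uint64_t start = _rdtsc();
    fn();
    return _rdtsc() - start; /* elapsed ticks of the underlying counter */
}
#endif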
8853
8854#if defined(__GNUC__) || defined(__clang__)
8855#pragma pop_macro("ALIGN_STRUCT")
8856#pragma pop_macro("FORCE_INLINE")
8857#endif
8858
8859#if defined(__GNUC__) && !defined(__clang__)
8860#pragma GCC pop_options
8861#endif
8862
8863#endif
8864// clang-format on