// Helper macros used by HWY_NEON_DEF_FUNCTION to build a wrapper function
// generically over its arity (1..3 vector arguments). Specialized op
// families (casts, compares, etc.) temporarily redefine these.

// HWY_NEON_BUILD_TPL_* is the "template<...>" prefix of the function
// (empty for the plain 1/2/3-argument forms).
#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

// HWY_NEON_BUILD_RET_* is the return type.
#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

// HWY_NEON_BUILD_PARAM_* is the parameter list of the wrapper.
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

// HWY_NEON_BUILD_ARG_* is the argument list forwarded to the intrinsic
// (the underlying NEON register is the .raw member).
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
// Forces evaluation (macro expansion) of the intrinsic name and arguments
// before the call is formed, so token-pasted names like vaddq_u8 resolve.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)

// Defines one wrapper function `name` of arity selected by `args`
// (1/2/3 or a named op suffix), forwarding to the NEON intrinsic
// prefix##infix##suffix. The HWY_NEON_BUILD_* helpers chosen via `args`
// supply the template prefix, return type, parameter list and arguments.
// NOTE(review): the extracted original ended on a trailing '\' with the
// closing "}" line missing, which would have spliced the following #define
// into this macro; the brace line is restored here.
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }
// Per-type generators: each expands HWY_NEON_DEF_FUNCTION once for the full
// 128-bit vector (prefix##q selects the "q" intrinsic) and once per partial
// vector size. u8/s8 support 16/8/4/2/1 lanes; u16/s16 support 8/4/2/1.

#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
// 32/64-bit integer generators: u32/s32 support 4/2/1 lanes,
// u64/s64 support 2/1 lanes; the 128-bit form uses the "q" intrinsic.

#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
// Float generators: f32 supports 4/2/1 lanes.
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

// f64 intrinsics exist only on AArch64; elsewhere the generator expands to
// nothing so op families that include doubles still compile.
// NOTE(review): the extracted original contained both definitions
// back-to-back (a redefinition); the lost #if HWY_ARCH_ARM_A64 guard that
// selects between them is restored here.
#if HWY_ARCH_ARM_A64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif
// Composite generators: expand a per-type generator for each lane type in
// the named group. (FLOAT_64 may expand to nothing - see its definition.)

#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)         \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// Unsigned 8/16/32-bit lanes.
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

// Signed 8/16/32-bit lanes.
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

// All unsigned integer lane types.
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

// All signed integer lane types.
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

// All integer lane types, signed and unsigned.
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

// Every supported lane type, integer and floating-point.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

// Unsigned, signed and float 8/16/32-bit lanes (no 64-bit types).
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
// Emulation of AArch64-only intrinsics: ARMv7 has no single-result
// vuzp1/vuzp2/vzip1/vzip2; emulate each via the two-result vuzp/vzip
// intrinsic, selecting .val[0] (lower/"1") or .val[1] (upper/"2").
// NOTE(review): presumably compiled only when targeting ARMv7 - the
// architecture guard lies outside this region; confirm against the full
// file.
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
264 template <
typename T,
size_t N>
266 template <
typename T,
size_t N>
268 template <
typename T,
size_t N>
574 template <
typename T,
size_t N>
635 struct Raw128<double, 2> {
636 using type = float64x2_t;
698 struct Raw128<double, 1> {
699 using type = float64x1_t;
759 template <
typename T,
size_t N = 16 /
sizeof(T)>
772 return *
this = (*
this * other);
775 return *
this = (*
this / other);
778 return *
this = (*
this + other);
781 return *
this = (*
this - other);
784 return *
this = (*
this & other);
787 return *
this = (*
this | other);
790 return *
this = (*
this ^ other);
796 template <
typename T>
799 template <
typename T>
803 template <
typename T,
size_t N = 16 /
sizeof(T)>
817 template <
typename T>
824 template <
typename T,
size_t N>
844 #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
845 #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
846 Vec128<uint8_t, size * sizeof(type##_t)>
847 #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
848 #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
873 #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
874 #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
875 #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
876 #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
886 template <
size_t N, HWY_IF_LE64(
int8_t, N)>
891 template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
896 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
901 template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
906 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
911 template <
size_t N, HWY_IF_LE64(
float, N)>
987 template <
typename T,
size_t N,
typename FromT>
989 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
996 #define HWY_NEON_BUILD_TPL_HWY_SET1
997 #define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
998 #define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
999 Simd<type##_t, size, 0> , const type##_t t
1000 #define HWY_NEON_BUILD_ARG_HWY_SET1 t
1004 #undef HWY_NEON_BUILD_TPL_HWY_SET1
1005 #undef HWY_NEON_BUILD_RET_HWY_SET1
1006 #undef HWY_NEON_BUILD_PARAM_HWY_SET1
1007 #undef HWY_NEON_BUILD_ARG_HWY_SET1
1010 template <
typename T,
size_t N>
1024 template <
typename T,
size_t N>
1034 template <
typename T,
size_t N,
typename T2>
1037 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
1038 lanes[i] =
static_cast<T
>(first +
static_cast<T2
>(i));
1040 return Load(
d, lanes);
1046 #define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
1047 #define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
1048 #define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
1049 #define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
1053 #undef HWY_NEON_BUILD_TPL_HWY_GET
1054 #undef HWY_NEON_BUILD_RET_HWY_GET
1055 #undef HWY_NEON_BUILD_PARAM_HWY_GET
1056 #undef HWY_NEON_BUILD_ARG_HWY_GET
1062 return detail::GetLane<0>(
v);
1069 template <
typename T>
1073 return detail::GetLane<0>(
v);
1076 template <
typename T>
1078 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1079 if (__builtin_constant_p(i)) {
1082 return detail::GetLane<0>(
v);
1084 return detail::GetLane<1>(
v);
1088 alignas(16) T lanes[2];
1093 template <
typename T>
1095 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1096 if (__builtin_constant_p(i)) {
1099 return detail::GetLane<0>(
v);
1101 return detail::GetLane<1>(
v);
1103 return detail::GetLane<2>(
v);
1105 return detail::GetLane<3>(
v);
1109 alignas(16) T lanes[4];
1114 template <
typename T>
1116 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1117 if (__builtin_constant_p(i)) {
1120 return detail::GetLane<0>(
v);
1122 return detail::GetLane<1>(
v);
1124 return detail::GetLane<2>(
v);
1126 return detail::GetLane<3>(
v);
1128 return detail::GetLane<4>(
v);
1130 return detail::GetLane<5>(
v);
1132 return detail::GetLane<6>(
v);
1134 return detail::GetLane<7>(
v);
1138 alignas(16) T lanes[8];
1143 template <
typename T>
1145 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1146 if (__builtin_constant_p(i)) {
1149 return detail::GetLane<0>(
v);
1151 return detail::GetLane<1>(
v);
1153 return detail::GetLane<2>(
v);
1155 return detail::GetLane<3>(
v);
1157 return detail::GetLane<4>(
v);
1159 return detail::GetLane<5>(
v);
1161 return detail::GetLane<6>(
v);
1163 return detail::GetLane<7>(
v);
1165 return detail::GetLane<8>(
v);
1167 return detail::GetLane<9>(
v);
1169 return detail::GetLane<10>(
v);
1171 return detail::GetLane<11>(
v);
1173 return detail::GetLane<12>(
v);
1175 return detail::GetLane<13>(
v);
1177 return detail::GetLane<14>(
v);
1179 return detail::GetLane<15>(
v);
1183 alignas(16) T lanes[16];
1191 #define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
1192 #define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
1193 #define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1194 Vec128<type##_t, size> v, type##_t t
1195 #define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
1199 #undef HWY_NEON_BUILD_TPL_HWY_INSERT
1200 #undef HWY_NEON_BUILD_RET_HWY_INSERT
1201 #undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1202 #undef HWY_NEON_BUILD_ARG_HWY_INSERT
1209 template <
typename T>
1216 template <
typename T>
1218 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1219 if (__builtin_constant_p(i)) {
1222 return detail::InsertLane<0>(
v, t);
1224 return detail::InsertLane<1>(
v, t);
1229 alignas(16) T lanes[2];
1232 return Load(
d, lanes);
1235 template <
typename T>
1237 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1238 if (__builtin_constant_p(i)) {
1241 return detail::InsertLane<0>(
v, t);
1243 return detail::InsertLane<1>(
v, t);
1245 return detail::InsertLane<2>(
v, t);
1247 return detail::InsertLane<3>(
v, t);
1252 alignas(16) T lanes[4];
1255 return Load(
d, lanes);
1258 template <
typename T>
1260 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1261 if (__builtin_constant_p(i)) {
1264 return detail::InsertLane<0>(
v, t);
1266 return detail::InsertLane<1>(
v, t);
1268 return detail::InsertLane<2>(
v, t);
1270 return detail::InsertLane<3>(
v, t);
1272 return detail::InsertLane<4>(
v, t);
1274 return detail::InsertLane<5>(
v, t);
1276 return detail::InsertLane<6>(
v, t);
1278 return detail::InsertLane<7>(
v, t);
1283 alignas(16) T lanes[8];
1286 return Load(
d, lanes);
1289 template <
typename T>
1291 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1292 if (__builtin_constant_p(i)) {
1295 return detail::InsertLane<0>(
v, t);
1297 return detail::InsertLane<1>(
v, t);
1299 return detail::InsertLane<2>(
v, t);
1301 return detail::InsertLane<3>(
v, t);
1303 return detail::InsertLane<4>(
v, t);
1305 return detail::InsertLane<5>(
v, t);
1307 return detail::InsertLane<6>(
v, t);
1309 return detail::InsertLane<7>(
v, t);
1311 return detail::InsertLane<8>(
v, t);
1313 return detail::InsertLane<9>(
v, t);
1315 return detail::InsertLane<10>(
v, t);
1317 return detail::InsertLane<11>(
v, t);
1319 return detail::InsertLane<12>(
v, t);
1321 return detail::InsertLane<13>(
v, t);
1323 return detail::InsertLane<14>(
v, t);
1325 return detail::InsertLane<15>(
v, t);
1330 alignas(16) T lanes[16];
1333 return Load(
d, lanes);
1391 #if HWY_ARCH_ARM_A64
1399 #if HWY_ARCH_ARM_A64
1409 #pragma push_macro("HWY_NEON_DEF_FUNCTION")
1410 #undef HWY_NEON_DEF_FUNCTION
1411 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
1412 template <int kBits> \
1413 HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
1414 return kBits == 0 ? v \
1415 : Vec128<type##_t, size>(HWY_NEON_EVAL( \
1416 prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
1424 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
1428 template <
int kBits,
size_t N>
1430 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
1431 if (kBits == 0)
return v;
1435 template <
int kBits,
size_t N>
1437 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
1438 if (kBits == 0)
return v;
1451 template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1461 template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1471 template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1490 template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1500 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1510 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1532 template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1544 template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1556 template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1578 template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1588 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1598 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1615 template <
typename T,
size_t N>
1617 return v << Set(Simd<T, N, 0>(),
static_cast<T
>(bits));
1619 template <
typename T,
size_t N>
1636 template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1641 template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1657 template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1662 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1671 int32x4_t rlo = vmull_s16(vget_low_s16(a.
raw), vget_low_s16(b.
raw));
1672 #if HWY_ARCH_ARM_A64
1673 int32x4_t rhi = vmull_high_s16(a.
raw, b.
raw);
1675 int32x4_t rhi = vmull_s16(vget_high_s16(a.
raw), vget_high_s16(b.
raw));
1678 vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1682 uint32x4_t rlo = vmull_u16(vget_low_u16(a.
raw), vget_low_u16(b.
raw));
1683 #if HWY_ARCH_ARM_A64
1684 uint32x4_t rhi = vmull_high_u16(a.
raw, b.
raw);
1686 uint32x4_t rhi = vmull_u16(vget_high_u16(a.
raw), vget_high_u16(b.
raw));
1689 vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1692 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1695 int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.
raw, b.
raw));
1698 template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1701 uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.
raw, b.
raw));
1708 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1727 #if HWY_ARCH_ARM_A64
1761 template <
size_t N, HWY_IF_LE64(
float, N)>
1770 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1771 template <
size_t N, HWY_IF_LE64(
float, N)>
1773 const Vec128<float, N> x,
1774 const Vec128<float, N> add) {
1775 return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
1777 HWY_API Vec128<float>
MulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1778 const Vec128<float> add) {
1779 return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1787 return mul * x + add;
1791 #if HWY_ARCH_ARM_A64
1792 HWY_API Vec64<double>
MulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1793 const Vec64<double> add) {
1794 return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
1796 HWY_API Vec128<double>
MulAdd(
const Vec128<double> mul,
const Vec128<double> x,
1797 const Vec128<double> add) {
1798 return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1803 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1804 template <
size_t N, HWY_IF_LE64(
float, N)>
1806 const Vec128<float, N> x,
1807 const Vec128<float, N> add) {
1808 return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
1810 HWY_API Vec128<float>
NegMulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1811 const Vec128<float> add) {
1812 return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1820 return add - mul * x;
1824 #if HWY_ARCH_ARM_A64
1825 HWY_API Vec64<double>
NegMulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1826 const Vec64<double> add) {
1827 return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
1830 const Vec128<double> x,
1831 const Vec128<double> add) {
1832 return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
1852 #if HWY_ARCH_ARM_A64
1854 HWY_API Vec128<double, N>
MulSub(
const Vec128<double, N> mul,
1855 const Vec128<double, N> x,
1856 const Vec128<double, N> sub) {
1861 const Vec128<double, N> x,
1862 const Vec128<double, N> sub) {
1879 #if HWY_ARCH_ARM_A64
1905 const auto root =
v * recip;
1915 template <
typename T>
1921 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1925 using V8 = decltype(
Zero(d8));
1948 template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
1951 return detail::reversed_andnot(mask, not_mask);
1955 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1957 const Vec128<T, N> mask) {
1958 const DFromV<decltype(mask)>
d;
1960 VFromD<decltype(du)> ret =
1961 detail::reversed_andnot(
BitCast(du, mask),
BitCast(du, not_mask));
1991 template <
typename T,
size_t N>
1993 return Or(o1,
Or(o2, o3));
1998 template <
typename T,
size_t N>
2000 return Or(o,
And(a1, a2));
2005 template <
typename T,
size_t N>
2013 template <
typename T,
size_t N>
2018 template <
typename T,
size_t N>
2023 template <
typename T,
size_t N>
2030 #ifdef HWY_NATIVE_POPCNT
2031 #undef HWY_NATIVE_POPCNT
2033 #define HWY_NATIVE_POPCNT
2038 template <
typename T>
2043 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2051 template <
typename T>
2054 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2057 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2061 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2065 template <
typename T>
2068 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2069 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2071 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2075 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2079 template <
typename T>
2082 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2083 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2085 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2089 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2090 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2095 template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
2119 template <
size_t N, HWY_IF_LE64(
int8_t, N)>
2123 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
2127 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2131 template <
size_t N, HWY_IF_LE64(
float, N)>
2136 #if HWY_ARCH_ARM_A64
2137 HWY_API Vec128<double>
Abs(
const Vec128<double>
v) {
2138 return Vec128<double>(vabsq_f64(
v.raw));
2141 HWY_API Vec64<double>
Abs(
const Vec64<double>
v) {
2142 return Vec64<double>(vabs_f64(
v.raw));
2148 template <
typename T,
size_t N>
2151 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2156 template <
typename T,
size_t N>
2159 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2165 template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
2175 template <
typename T,
size_t N>
2181 template <
typename T,
size_t N>
2188 template <
typename TFrom,
typename TTo,
size_t N>
2190 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
2196 #define HWY_NEON_BUILD_TPL_HWY_IF
2197 #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2198 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2199 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2200 const Vec128<type##_t, size> no
2201 #define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
2205 #undef HWY_NEON_BUILD_TPL_HWY_IF
2206 #undef HWY_NEON_BUILD_RET_HWY_IF
2207 #undef HWY_NEON_BUILD_PARAM_HWY_IF
2208 #undef HWY_NEON_BUILD_ARG_HWY_IF
2211 template <
typename T,
size_t N>
2218 template <
typename T,
size_t N>
2224 template <
typename T,
size_t N>
2227 static_assert(IsSigned<T>(),
"Only works for signed/float");
2235 template <
typename T,
size_t N>
2238 return Max(zero,
v);
2243 template <
typename T,
size_t N>
2248 template <
typename T,
size_t N>
2254 template <
typename T,
size_t N>
2260 template <
typename T,
size_t N>
2266 template <
typename T,
size_t N>
2298 #define HWY_NEON_BUILD_TPL_HWY_COMPARE
2299 #define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
2300 #define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
2301 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
2302 #define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
2306 #if HWY_ARCH_ARM_A64
2315 #if HWY_ARCH_ARM_A64
2326 #undef HWY_NEON_BUILD_TPL_HWY_COMPARE
2327 #undef HWY_NEON_BUILD_RET_HWY_COMPARE
2328 #undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
2329 #undef HWY_NEON_BUILD_ARG_HWY_COMPARE
2337 const Vec128<int64_t, N> b) {
2338 const Simd<int32_t, N * 2, 0> d32;
2339 const Simd<int64_t, N, 0> d64;
2347 const Vec128<uint64_t, N> b) {
2348 const Simd<uint32_t, N * 2, 0> d32;
2349 const Simd<uint64_t, N, 0> d64;
2356 const Vec128<int64_t> b) {
2357 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
2361 const Vec64<int64_t> b) {
2362 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
2368 const Vec128<uint64_t, N> b) {
2369 const DFromV<decltype(a)> du;
2371 const Vec128<uint64_t, N> msb =
AndNot(a, b) |
AndNot(a ^ b, a - b);
2380 #pragma push_macro("HWY_NEON_DEF_FUNCTION")
2381 #undef HWY_NEON_DEF_FUNCTION
2385 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2386 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
2387 Vec128<type##_t, size> b) { \
2388 return Not(a == b); \
2393 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2397 template <
typename T,
size_t N>
2401 template <
typename T,
size_t N>
2408 template <
typename T,
size_t N>
2416 #define HWY_NEON_BUILD_TPL_HWY_TESTBIT
2417 #define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
2418 #define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
2419 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
2420 #define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
2422 #if HWY_ARCH_ARM_A64
2432 return (
v & bit) == bit;
2437 return (
v & bit) == bit;
2441 #undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
2442 #undef HWY_NEON_BUILD_RET_HWY_TESTBIT
2443 #undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
2444 #undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
2448 #if HWY_ARCH_ARM_A64
2456 #if HWY_ARCH_ARM_A64
2471 const
Vec128<uint64_t,
N> b) {
2472 #if HWY_ARCH_ARM_A64
2475 const DFromV<decltype(a)> du;
2487 #if HWY_ARCH_ARM_A64
2496 #if HWY_ARCH_ARM_A64
2509 const
Vec128<uint64_t,
N> b) {
2510 #if HWY_ARCH_ARM_A64
2513 const DFromV<decltype(a)> du;
2525 #if HWY_ARCH_ARM_A64
2534 #if HWY_ARCH_ARM_A64
2580 #if HWY_ARCH_ARM_A64
2583 return Vec128<double>(vld1q_f64(unaligned));
2625 #if HWY_ARCH_ARM_A64
2628 return Vec64<double>(vld1_f64(p));
2647 template <
typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2651 CopyBytes<4>(p, &buf);
2668 template <
typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2672 CopyBytes<2>(p, &buf);
2693 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2700 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2705 template <
typename T,
size_t N>
2710 template <
typename T,
size_t N>
2717 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2727 vst1q_u8(unaligned,
v.raw);
2731 vst1q_u16(unaligned,
v.raw);
2735 vst1q_u32(unaligned,
v.raw);
2739 vst1q_u64(unaligned,
v.raw);
2743 vst1q_s8(unaligned,
v.raw);
2747 vst1q_s16(unaligned,
v.raw);
2751 vst1q_s32(unaligned,
v.raw);
2755 vst1q_s64(unaligned,
v.raw);
2759 vst1q_f32(unaligned,
v.raw);
2761 #if HWY_ARCH_ARM_A64
2764 vst1q_f64(unaligned,
v.raw);
2806 #if HWY_ARCH_ARM_A64
2817 vst1_lane_u32(p,
v.raw, 0);
2821 vst1_lane_s32(p,
v.raw, 0);
2825 vst1_lane_f32(p,
v.raw, 0);
2828 template <
typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2832 CopyBytes<4>(&buf, p);
2839 vst1_lane_u16(p,
v.raw, 0);
2843 vst1_lane_s16(p,
v.raw, 0);
2846 template <
typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2850 CopyBytes<2>(&buf, p);
2857 vst1_lane_u8(p,
v.raw, 0);
2861 vst1_lane_s8(p,
v.raw, 0);
2869 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2876 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2881 template <
typename T,
size_t N>
2886 template <
typename T,
size_t N>
2891 const auto blended =
2900 template <
typename T,
size_t N>
2917 uint16x8_t a = vmovl_u8(
v.raw);
2932 uint16x8_t a = vmovl_u8(
v.raw);
2940 template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
2945 template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
2948 uint16x8_t a = vmovl_u8(
v.raw);
2956 template <
size_t N, HWY_IF_LE64(u
int64_t, N)>
2961 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
2966 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2969 uint16x8_t a = vmovl_u8(
v.raw);
2970 uint32x4_t b = vmovl_u16(vget_low_u16(a));
2973 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2976 uint32x4_t a = vmovl_u16(
v.raw);
2987 int16x8_t a = vmovl_s8(
v.raw);
3008 int16x8_t a = vmovl_s8(
v.raw);
3009 int32x4_t b = vmovl_s16(vget_low_s16(a));
3026 const Vec128<float16_t, 4>
v) {
3027 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(
v.raw));
3028 return Vec128<float>(f32);
3032 const Vec128<float16_t, N>
v) {
3033 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(
v.raw));
3034 return Vec128<float, N>(vget_low_f32(f32));
3046 const auto sign = ShiftRight<15>(bits16);
3047 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
3048 const auto mantissa = bits16 &
Set(du32, 0x3FF);
3049 const auto subnormal =
3051 Set(df32, 1.0f / 16384 / 1024));
3053 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
3054 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3055 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3056 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
3057 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3062 #if HWY_ARCH_ARM_A64
3065 const Vec64<float>
v) {
3066 return Vec128<double>(vcvt_f64_f32(
v.raw));
3070 const Vec32<float>
v) {
3071 return Vec64<double>(vget_low_f64(vcvt_f64_f32(
v.raw)));
3075 const Vec64<int32_t>
v) {
3076 const int64x2_t i64 = vmovl_s32(
v.raw);
3077 return Vec128<double>(vcvtq_f64_s64(i64));
3081 const Vec32<int32_t>
v) {
3082 const int64x1_t i64 = vget_low_s64(vmovl_s32(
v.raw));
3083 return Vec64<double>(vcvt_f64_s64(i64));
3101 const uint16x4_t a = vqmovun_s32(
v.raw);
3110 const int16x4_t a = vqmovn_s32(
v.raw);
3119 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3124 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3129 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3132 const uint16x4_t a = vqmovun_s32(vcombine_s32(
v.raw,
v.raw));
3135 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3140 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3143 const int16x4_t a = vqmovn_s32(vcombine_s32(
v.raw,
v.raw));
3146 template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3155 const Vec128<float>
v) {
3156 return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(
v.raw))};
3160 const Vec128<float, N>
v) {
3161 const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(
v.raw,
v.raw));
3162 return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
3171 const Rebind<uint32_t, decltype(du16)> du;
3173 const auto bits32 =
BitCast(du,
v);
3174 const auto sign = ShiftRight<31>(bits32);
3175 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
3176 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
3178 const auto k15 =
Set(di, 15);
3179 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
3180 const auto is_tiny = exp <
Set(di, -24);
3182 const auto is_subnormal = exp <
Set(di, -14);
3183 const auto biased_exp16 =
3185 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
3186 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3187 (mantissa32 >> (
Set(du, 13) + sub_exp));
3189 ShiftRight<13>(mantissa32));
3191 const auto sign16 = ShiftLeft<15>(sign);
3192 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3202 const Rebind<int32_t, decltype(dbf16)> di32;
3203 const Rebind<uint32_t, decltype(dbf16)> du32;
3204 const Rebind<uint16_t, decltype(dbf16)> du16;
3205 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
3209 #if HWY_ARCH_ARM_A64
3212 return Vec64<float>(vcvt_f32_f64(
v.raw));
3215 return Vec32<float>(vcvt_f32_f64(vcombine_f64(
v.raw,
v.raw)));
3219 const Vec128<double>
v) {
3220 const int64x2_t i64 = vcvtq_s64_f64(
v.raw);
3221 return Vec64<int32_t>(vqmovn_s64(i64));
3224 const Vec64<double>
v) {
3225 const int64x1_t i64 = vcvt_s64_f64(
v.raw);
3227 const int64x2_t i64x2 = vcombine_s64(i64, i64);
3228 return Vec32<int32_t>(vqmovn_s64(i64x2));
3235 const uint8x16_t w = vuzp1q_u8(org_v, org_v);
3238 template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3241 const uint8x8_t w = vuzp1_u8(org_v, org_v);
3256 uint16x8_t c = vcombine_u16(a.
raw, b.
raw);
3265 int16x8_t c = vcombine_s16(a.
raw, b.
raw);
3277 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3288 template <
size_t N, HWY_IF_LE64(
float, N)>
3294 #if HWY_ARCH_ARM_A64
3297 const Vec128<int64_t>
v) {
3298 return Vec128<double>(vcvtq_f64_s64(
v.raw));
3301 const Vec64<int64_t>
v) {
3302 return Vec64<double>(vcvt_f64_s64(
v.raw));
3307 const Vec128<double>
v) {
3308 return Vec128<int64_t>(vcvtq_s64_f64(
v.raw));
3311 const Vec64<double>
v) {
3312 return Vec64<int64_t>(vcvt_s64_f64(
v.raw));
3319 #if HWY_ARCH_ARM_A64
3357 const auto int_f =
ConvertTo(df, integer);
3372 const auto added = large +
v;
3373 const auto rounded = added - large;
3385 const auto int_f =
ConvertTo(df, integer);
3399 const auto int_f =
ConvertTo(df, integer);
3411 #if HWY_ARCH_ARM_A64
3414 return Vec128<int32_t>(vcvtnq_s32_f32(
v.raw));
3416 template <
size_t N, HWY_IF_LE64(
float, N)>
3418 return Vec128<int32_t, N>(vcvtn_s32_f32(
v.raw));
3432 template <
typename T,
size_t N>
3437 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3447 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3456 const VFromD<decltype(di)> exp =
3466 template <
typename T,
size_t N, HWY_IF_LE64(u
int8_t, N)>
3498 #if HWY_ARCH_ARM_A64
3500 return Vec64<double>(vget_low_f64(
v.raw));
3504 template <
typename T,
size_t N>
3513 template <
int kBytes,
typename T,
class V128 = Vec128<T>>
3515 static_assert(0 < kBytes && kBytes < 16,
"kBytes must be in [1, 15]");
3517 uint8x16_t v8 = vextq_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3522 template <
int kBytes,
typename T>
3524 static_assert(0 < kBytes && kBytes < 8,
"kBytes must be in [1, 7]");
3526 uint8x8_t v8 = vext_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3538 template <
int kBytes>
3548 template <
class T,
size_t N, HWY_IF_LE64(T, N)>
3552 const auto zero64 =
Zero(d64);
3553 const decltype(zero64) v64(
v.raw);
3555 CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3560 template <
class T,
size_t N>
3567 template <
class T,
size_t N>
3573 template <
int kBytes>
3575 template <
class T,
size_t N>
3579 if (
N *
sizeof(T) < 8) {
3580 constexpr
size_t kReg =
N *
sizeof(T) == 16 ? 16 : 8;
3581 const Simd<T, kReg /
sizeof(T), 0> dreg;
3585 return CombineShiftRightBytes<kBytes>(
d,
Zero(
d),
v);
3590 template <
class T,
size_t N>
3597 template <
class T,
size_t N>
3605 template <
int kBytes,
typename T,
size_t N>
3611 template <
int kBytes,
typename T,
size_t N>
3616 template <
int kLanes,
typename T,
size_t N>
3622 template <
int kLanes,
typename T,
size_t N>
3628 template <
int kBytes,
typename T,
size_t N>
3634 template <
int kLanes,
typename T,
size_t N>
3641 template <
int kBytes,
typename T,
size_t N, HWY_IF_LE32(T, N)>
3644 constexpr
size_t kSize =
N *
sizeof(T);
3645 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
3649 using V64 =
VFromD<decltype(d_full8)>;
3650 const V64 hi64(
BitCast(d8, hi).raw);
3696 #if HWY_ARCH_ARM_A64
3698 const Vec128<double>
v) {
3699 return Vec64<double>(vget_high_f64(
v.raw));
3704 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3711 return Vec128<T, (
N + 1) / 2>(upper.raw);
3716 #if HWY_ARCH_ARM_A64
3718 template <
int kLane>
3720 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3721 return Vec128<uint16_t>(vdupq_laneq_u16(
v.raw, kLane));
3723 template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3725 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3726 return Vec128<uint16_t, N>(vdup_lane_u16(
v.raw, kLane));
3728 template <
int kLane>
3730 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3731 return Vec128<uint32_t>(vdupq_laneq_u32(
v.raw, kLane));
3733 template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3735 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3736 return Vec128<uint32_t, N>(vdup_lane_u32(
v.raw, kLane));
3738 template <
int kLane>
3740 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3741 return Vec128<uint64_t>(vdupq_laneq_u64(
v.raw, kLane));
3746 template <
int kLane>
3748 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3749 return Vec128<int16_t>(vdupq_laneq_s16(
v.raw, kLane));
3751 template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3753 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3754 return Vec128<int16_t, N>(vdup_lane_s16(
v.raw, kLane));
3756 template <
int kLane>
3758 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3759 return Vec128<int32_t>(vdupq_laneq_s32(
v.raw, kLane));
3761 template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3763 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3764 return Vec128<int32_t, N>(vdup_lane_s32(
v.raw, kLane));
3766 template <
int kLane>
3768 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3769 return Vec128<int64_t>(vdupq_laneq_s64(
v.raw, kLane));
3774 template <
int kLane>
3776 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3777 return Vec128<float>(vdupq_laneq_f32(
v.raw, kLane));
3779 template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3781 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3782 return Vec128<float, N>(vdup_lane_f32(
v.raw, kLane));
3784 template <
int kLane>
3786 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3787 return Vec128<double>(vdupq_laneq_f64(
v.raw, kLane));
3789 template <
int kLane>
3791 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3799 template <
int kLane>
3801 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3804 template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3806 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3809 template <
int kLane>
3811 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3814 template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3816 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3819 template <
int kLane>
3821 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3827 template <
int kLane>
3829 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3832 template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3834 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3837 template <
int kLane>
3839 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3842 template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3844 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3847 template <
int kLane>
3849 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3855 template <
int kLane>
3857 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3860 template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3862 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3868 template <
int kLane>
3870 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3873 template <
int kLane>
3875 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3882 template <
typename T,
size_t N>
3887 template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
3889 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3890 #if HWY_IS_DEBUG_BUILD
3891 const Rebind<TI, decltype(
d)> di;
3897 using V8 =
VFromD<decltype(d8)>;
3901 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
3902 if (
sizeof(T) == 4) {
3903 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3904 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3905 const V8 lane_indices =
3907 const V8 byte_indices =
3909 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3910 0, 1, 2, 3, 0, 1, 2, 3};
3911 const V8 sum =
Add(byte_indices,
Load(d8, kByteOffsets));
3914 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3915 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3916 const V8 lane_indices =
3918 const V8 byte_indices =
3920 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
3921 0, 1, 2, 3, 4, 5, 6, 7};
3922 const V8 sum =
Add(byte_indices,
Load(d8, kByteOffsets));
3927 template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
3929 const Rebind<TI, decltype(
d)> di;
3933 template <
typename T,
size_t N>
3944 template <
typename T>
3950 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3955 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3961 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3967 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3975 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
3980 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3986 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
3991 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3997 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4004 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4009 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4015 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4020 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4027 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4032 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4045 template <
typename T>
4049 template <
typename T>
4055 template <
typename T>
4061 template <
typename T>
4067 template <
typename T>
4080 #if HWY_ARCH_ARM_A64
4083 const Vec128<uint64_t> b) {
4084 return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
4087 const Vec128<int64_t> b) {
4088 return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
4091 const Vec128<double> b) {
4092 return Vec128<double>(vzip1q_f64(a.raw, b.raw));
4111 template <
size_t N, HWY_IF_LE64(
float, N)>
4118 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4124 template <
typename T,
size_t N,
class V = Vec128<T, N>>
4136 #if HWY_ARCH_ARM_A64
4139 const Vec128<uint64_t> b) {
4140 return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
4143 return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
4146 return Vec128<double>(vzip2q_f64(a.raw, b.raw));
4170 template <
typename T,
size_t N, HWY_IF_GE64(T, N),
class V = Vec128<T, N>>
4176 template <
typename T,
size_t N, HWY_IF_LE32(T, N),
class V = Vec128<T, N>>
4178 const Half<decltype(
d)> d2;
4186 template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4190 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4195 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4262 #if HWY_ARCH_ARM_A64
4263 HWY_API Vec128<double>
Combine(Full128<double> , Vec64<double> hi,
4265 return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
4270 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4283 template <
typename T,
size_t N>
4291 template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4300 #if HWY_ARCH_ARM_A64
4306 #define HWY_NEON_BUILD_TPL_HWY_TRN
4307 #define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
4310 #define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
4311 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
4312 #define HWY_NEON_BUILD_ARG_HWY_TRN a, b
4334 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4339 #if HWY_ARCH_ARM_A64
4342 using VU =
VFromD<decltype(du)>;
4344 d, VU(detail::InterleaveEvenOdd(
BitCast(du, lo).raw,
BitCast(du, hi).raw)
4352 template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4361 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4366 #if HWY_ARCH_ARM_A64
4369 using VU =
VFromD<decltype(du)>;
4371 d, VU(detail::InterleaveEvenOdd(
BitCast(du, lo).raw,
BitCast(du, hi).raw)
4379 template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4386 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4389 constexpr
size_t kSize =
N *
sizeof(T);
4391 const Full64<uint8_t> d8x8;
4392 const Full64<T> d64;
4393 using V8x8 =
VFromD<decltype(d8x8)>;
4394 const V8x8 hi8x8(
BitCast(d8, hi).raw);
4399 return Vec128<T, N>(
BitCast(d64, r).raw);
4405 template <
typename T,
size_t N>
4420 template <typename T,
size_t N,
4428 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4431 const Twice<decltype(
d)> d2;
4442 template <
typename T>
4451 template <
typename T,
size_t N,
4459 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4462 const Twice<decltype(
d)> d2;
4473 template <
typename T>
4481 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4483 #if HWY_ARCH_ARM_A64
4484 return detail::InterleaveEven(
v,
v);
4486 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[0]);
4490 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4497 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4499 #if HWY_ARCH_ARM_A64
4500 return detail::InterleaveOdd(
v,
v);
4502 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[1]);
4506 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4513 template <
typename T,
size_t N>
4517 alignas(16) constexpr uint8_t kBytes[16] = {
4518 ((0 /
sizeof(T)) & 1) ? 0 : 0xFF, ((1 /
sizeof(T)) & 1) ? 0 : 0xFF,
4519 ((2 /
sizeof(T)) & 1) ? 0 : 0xFF, ((3 /
sizeof(T)) & 1) ? 0 : 0xFF,
4520 ((4 /
sizeof(T)) & 1) ? 0 : 0xFF, ((5 /
sizeof(T)) & 1) ? 0 : 0xFF,
4521 ((6 /
sizeof(T)) & 1) ? 0 : 0xFF, ((7 /
sizeof(T)) & 1) ? 0 : 0xFF,
4522 ((8 /
sizeof(T)) & 1) ? 0 : 0xFF, ((9 /
sizeof(T)) & 1) ? 0 : 0xFF,
4523 ((10 /
sizeof(T)) & 1) ? 0 : 0xFF, ((11 /
sizeof(T)) & 1) ? 0 : 0xFF,
4524 ((12 /
sizeof(T)) & 1) ? 0 : 0xFF, ((13 /
sizeof(T)) & 1) ? 0 : 0xFF,
4525 ((14 /
sizeof(T)) & 1) ? 0 : 0xFF, ((15 /
sizeof(T)) & 1) ? 0 : 0xFF,
4532 template <
typename T,
size_t N>
4539 template <
typename T,
size_t N>
4547 template <
typename T>
4558 const Repartition<uint32_t, decltype(dbf16)> du32;
4565 #if defined(__ARM_FEATURE_AES)
4568 #ifdef HWY_NATIVE_AES
4569 #undef HWY_NATIVE_AES
4571 #define HWY_NATIVE_AES
4575 Vec128<uint8_t> round_key) {
4580 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4585 Vec128<uint8_t> round_key) {
4586 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4590 return Vec128<uint64_t>((uint64x2_t)vmull_p64(
GetLane(a),
GetLane(b)));
4594 return Vec128<uint64_t>(
4595 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4605 const Rebind<uint16_t, decltype(df32)> du16;
4619 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4626 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4635 return Vec128<int64_t, (
N + 1) / 2>(
4636 vget_low_s64(vmull_s32(a_packed, b_packed)));
4644 return Vec128<uint64_t, (
N + 1) / 2>(
4645 vget_low_u64(vmull_u32(a_packed, b_packed)));
4650 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 0), vgetq_lane_u64(b.
raw, 0), &hi);
4656 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 1), vgetq_lane_u64(b.
raw, 1), &hi);
4663 template <
typename T,
typename TI>
4668 #if HWY_ARCH_ARM_A64
4672 uint8x16_t table0 =
BitCast(d8, bytes).raw;
4674 table.val[0] = vget_low_u8(table0);
4675 table.val[1] = vget_high_u8(table0);
4676 uint8x16_t idx =
BitCast(d8, from).raw;
4677 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4678 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4684 template <
typename T,
typename TI,
size_t NI, HWY_IF_LE64(TI, NI)>
4689 const auto idx_full =
Combine(d_full, from64, from64);
4695 template <
typename T,
size_t N,
typename TI, HWY_IF_LE64(T, N)>
4703 template <
typename T,
size_t N,
typename TI,
size_t NI,
HWY_IF_LE64(T,
N),
4709 const Repartition<uint8_t, decltype(d_idx)> d_idx8;
4712 const auto from8 =
BitCast(d_idx8, from);
4713 const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4718 template <
class V,
class VI>
4725 template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
4729 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
4731 alignas(16) T lanes[
N];
4734 alignas(16) Offset offset_lanes[
N];
4735 Store(offset,
Rebind<Offset, decltype(
d)>(), offset_lanes);
4737 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
4738 for (
size_t i = 0; i <
N; ++i) {
4739 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
4743 template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
4746 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
4748 alignas(16) T lanes[
N];
4751 alignas(16) Index index_lanes[
N];
4752 Store(index,
Rebind<Index, decltype(
d)>(), index_lanes);
4754 for (
size_t i = 0; i <
N; ++i) {
4755 base[index_lanes[i]] = lanes[i];
4761 template <
typename T,
size_t N,
typename Offset>
4765 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
4767 alignas(16) Offset offset_lanes[
N];
4768 Store(offset,
Rebind<Offset, decltype(
d)>(), offset_lanes);
4770 alignas(16) T lanes[
N];
4771 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
4772 for (
size_t i = 0; i <
N; ++i) {
4773 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
4775 return Load(
d, lanes);
4778 template <
typename T,
size_t N,
typename Index>
4782 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
4784 alignas(16) Index index_lanes[
N];
4785 Store(index,
Rebind<Index, decltype(
d)>(), index_lanes);
4787 alignas(16) T lanes[
N];
4788 for (
size_t i = 0; i <
N; ++i) {
4789 lanes[i] = base[index_lanes[i]];
4791 return Load(
d, lanes);
4799 template <
typename T>
4803 template <
typename T>
4808 template <
typename T>
4815 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4819 template <
typename T>
4824 template <
typename T>
4831 #if HWY_ARCH_ARM_A64
4836 return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(
v.raw)));
4839 return Vec128<float>(vdupq_n_f32(vaddvq_f32(
v.raw)));
4842 return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(
v.raw)));
4845 return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(
v.raw)));
4848 return Vec128<double>(vdupq_n_f64(vaddvq_f64(
v.raw)));
4853 uint32x4x2_t v0 = vuzpq_u32(
v.raw,
v.raw);
4854 uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
4855 uint32x4x2_t v1 = vuzpq_u32(c0, c0);
4859 int32x4x2_t v0 = vuzpq_s32(
v.raw,
v.raw);
4860 int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
4861 int32x4x2_t v1 = vuzpq_s32(c0, c0);
4865 float32x4x2_t v0 = vuzpq_f32(
v.raw,
v.raw);
4866 float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
4867 float32x4x2_t v1 = vuzpq_f32(c0, c0);
4878 template <
typename T>
4884 return Min(v20_31_20_31, v31_20_31_20);
4886 template <
typename T>
4892 return Max(v20_31_20_31, v31_20_31_20);
4896 template <
typename T>
4900 return Min(v10, v01);
4902 template <
typename T>
4906 return Max(v10, v01);
4910 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4914 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4919 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4923 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4931 template <
typename T,
size_t N>
4935 template <
typename T,
size_t N>
4939 template <
typename T,
size_t N>
4951 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4956 template <
typename T>
4961 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
4966 const auto vmask_bits =
Set64(du, mask_bits);
4969 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4970 1, 1, 1, 1, 1, 1, 1, 1};
4973 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4974 1, 2, 4, 8, 16, 32, 64, 128};
4978 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4981 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4982 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
4986 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4989 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4990 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
4994 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4997 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
5004 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
5007 uint64_t mask_bits = 0;
5018 template <
typename T>
5026 template <
typename T>
5029 const Twice<decltype(
d)> d2;
5035 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5040 constexpr
size_t kBytes =
sizeof(T) *
N;
5041 return nib & ((1ull << (kBytes * 4)) - 1);
5044 template <
typename T>
5047 alignas(16) constexpr uint8_t kSliceLanes[16] = {
5048 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
5054 #if HWY_ARCH_ARM_A64
5056 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.
raw, values.
raw));
5057 const uint8x8_t x4 = vpadd_u8(x2, x2);
5058 const uint8x8_t x8 = vpadd_u8(x4, x4);
5059 return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
5062 const uint16x8_t x2 = vpaddlq_u8(values.
raw);
5063 const uint32x4_t x4 = vpaddlq_u16(x2);
5064 const uint64x2_t x8 = vpaddlq_u32(x4);
5065 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
5069 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5074 alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
5075 0x10, 0x20, 0x40, 0x80};
5081 #if HWY_ARCH_ARM_A64
5082 return vaddv_u8(values.
raw);
5084 const uint16x4_t x2 = vpaddl_u8(values.
raw);
5085 const uint32x2_t x4 = vpaddl_u16(x2);
5086 const uint64x1_t x8 = vpaddl_u32(x4);
5087 return vget_lane_u64(x8, 0);
5091 template <
typename T>
5094 alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
5095 0x10, 0x20, 0x40, 0x80};
5100 #if HWY_ARCH_ARM_A64
5101 return vaddvq_u16(values.
raw);
5103 const uint32x4_t x2 = vpaddlq_u16(values.
raw);
5104 const uint64x2_t x4 = vpaddlq_u32(x2);
5105 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
5109 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5114 alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
5119 #if HWY_ARCH_ARM_A64
5120 return vaddv_u16(values.
raw);
5122 const uint32x2_t x2 = vpaddl_u16(values.
raw);
5123 const uint64x1_t x4 = vpaddl_u32(x2);
5124 return vget_lane_u64(x4, 0);
5128 template <
typename T>
5131 alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
5136 #if HWY_ARCH_ARM_A64
5137 return vaddvq_u32(values.
raw);
5139 const uint64x2_t x2 = vpaddlq_u32(values.
raw);
5140 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
5144 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5149 alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
5154 #if HWY_ARCH_ARM_A64
5155 return vaddv_u32(values.
raw);
5157 const uint64x1_t x2 = vpaddl_u32(values.
raw);
5158 return vget_lane_u64(x2, 0);
5162 template <
typename T>
5164 alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
5169 #if HWY_ARCH_ARM_A64
5170 return vaddvq_u64(values.
raw);
5172 return vgetq_lane_u64(values.
raw, 0) + vgetq_lane_u64(values.
raw, 1);
5176 template <
typename T>
5182 return vget_lane_u64(values.
raw, 0);
5186 template <
typename T,
size_t N>
5188 return ((
N *
sizeof(T)) >= 8) ? bits : (bits & ((1ull <<
N) - 1));
5191 template <
typename T,
size_t N>
5206 template <
typename T>
5209 const int8x16_t ones =
5212 #if HWY_ARCH_ARM_A64
5213 return static_cast<size_t>(vaddvq_s8(ones));
5215 const int16x8_t x2 = vpaddlq_s8(ones);
5216 const int32x4_t x4 = vpaddlq_s16(x2);
5217 const int64x2_t x8 = vpaddlq_s32(x4);
5218 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
5221 template <
typename T>
5224 const int16x8_t ones =
5227 #if HWY_ARCH_ARM_A64
5228 return static_cast<size_t>(vaddvq_s16(ones));
5230 const int32x4_t x2 = vpaddlq_s16(ones);
5231 const int64x2_t x4 = vpaddlq_s32(x2);
5232 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
5236 template <
typename T>
5239 const int32x4_t ones =
5242 #if HWY_ARCH_ARM_A64
5243 return static_cast<size_t>(vaddvq_s32(ones));
5245 const int64x2_t x2 = vpaddlq_s32(ones);
5246 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
5250 template <
typename T>
5252 #if HWY_ARCH_ARM_A64
5254 const int64x2_t ones =
5256 return static_cast<size_t>(vaddvq_s64(ones));
5260 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
5261 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
5268 template <
typename T>
5274 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5276 constexpr
int kDiv = 4 *
sizeof(T);
5279 template <
typename T,
size_t N>
5283 if (nib == 0)
return -1;
5284 constexpr
int kDiv = 4 *
sizeof(T);
5289 template <
typename T,
size_t N>
5293 const size_t kNumBytes = (
N + 7) / 8;
5294 CopyBytes<kNumBytes>(&mask_bits, bits);
5298 template <
typename T,
size_t N>
5304 template <
typename T>
5309 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5311 constexpr
size_t kBytes =
sizeof(T) *
N;
5317 template <
typename T>
5326 const uint8_t* bytes) {
5328 vld1q_dup_u64(
reinterpret_cast<const uint64_t*
>(bytes))));
5332 template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
5334 const uint8_t* bytes) {
5335 return Load(
d, bytes);
5338 template <
typename T,
size_t N>
5340 const uint64_t mask_bits) {
5354 alignas(16) constexpr uint8_t table[256 * 8] = {
5356 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5357 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5358 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
5359 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5360 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
5361 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
5362 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
5363 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5364 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
5365 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
5366 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
5367 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
5368 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
5369 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
5370 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
5371 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5372 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
5373 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
5374 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
5375 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
5376 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
5377 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
5378 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
5379 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
5380 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
5381 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
5382 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
5383 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
5384 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
5385 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
5386 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
5387 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5388 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
5389 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
5390 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
5391 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
5392 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
5393 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
5394 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
5395 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
5396 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
5397 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
5398 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
5399 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
5400 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
5401 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
5402 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
5403 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
5404 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
5405 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
5406 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
5407 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
5408 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
5409 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
5410 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
5411 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
5412 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
5413 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
5414 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
5415 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
5416 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
5417 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
5418 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
5419 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5420 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
5421 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
5422 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
5423 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
5424 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
5425 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
5426 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
5427 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
5428 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
5429 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
5430 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
5431 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
5432 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
5433 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
5434 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
5435 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
5436 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
5437 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
5438 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
5439 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
5440 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
5441 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
5442 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
5443 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
5444 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
5445 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
5446 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
5447 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
5448 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
5449 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
5450 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
5451 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
5452 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
5453 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
5454 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
5455 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
5456 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
5457 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
5458 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
5459 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
5460 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
5461 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
5462 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
5463 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
5464 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
5465 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
5466 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
5467 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
5468 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
5469 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
5470 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
5471 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
5472 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
5473 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
5474 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
5475 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
5476 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
5477 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
5478 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
5479 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
5480 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
5481 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
5482 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
5483 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5490 template <
typename T,
size_t N>
5492 const uint64_t mask_bits) {
5506 alignas(16) constexpr uint8_t table[256 * 8] = {
5508 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
5509 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
5510 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
5511 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5512 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
5513 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
5514 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
5515 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
5516 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
5517 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
5518 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
5519 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
5520 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
5521 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
5522 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
5523 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
5524 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
5525 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
5526 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
5527 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
5528 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
5529 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
5530 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
5531 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
5532 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
5533 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
5534 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
5535 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
5536 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
5537 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
5538 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
5539 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
5540 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
5541 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
5542 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
5543 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
5544 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
5545 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
5546 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
5547 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
5548 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
5549 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
5550 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
5551 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
5552 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
5553 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
5554 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
5555 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
5556 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
5557 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
5558 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
5559 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
5560 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
5561 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
5562 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
5563 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
5564 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
5565 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
5566 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
5567 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
5568 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
5569 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
5570 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
5571 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
5572 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
5573 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
5574 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
5575 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
5576 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
5577 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
5578 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
5579 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
5580 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
5581 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
5582 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
5583 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
5584 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
5585 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
5586 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
5587 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
5588 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
5589 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
5590 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
5591 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
5592 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
5593 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
5594 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
5595 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
5596 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
5597 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
5598 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
5599 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
5600 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
5601 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
5602 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
5603 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
5604 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
5605 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
5606 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
5607 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
5608 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
5609 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
5610 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
5611 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
5612 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
5613 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
5614 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
5615 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
5616 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
5617 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
5618 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
5619 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
5620 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
5621 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
5622 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
5623 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
5624 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
5625 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
5626 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
5627 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
5628 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
5629 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
5630 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
5631 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
5632 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
5633 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
5634 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
5635 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
5642 template <
typename T,
size_t N>
5644 const uint64_t mask_bits) {
5648 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
5650 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5651 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5652 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
5653 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5654 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
5655 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
5656 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
5657 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5658 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5659 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
5660 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
5661 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5662 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5663 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
5664 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
5665 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5668 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5671 template <
typename T,
size_t N>
5673 const uint64_t mask_bits) {
5677 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
5679 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5680 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5681 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5682 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5683 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5684 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5685 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5686 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5687 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5688 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5689 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5690 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5691 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5692 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5696 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5699 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
5701 template <
typename T,
size_t N>
5703 const uint64_t mask_bits) {
5707 alignas(16) constexpr uint8_t u8_indices[64] = {
5709 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5710 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5711 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5712 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5714 const Simd<T, N, 0>
d;
5716 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5719 template <
typename T,
size_t N>
5721 const uint64_t mask_bits) {
5725 alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
5727 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5728 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5729 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5730 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5732 const Simd<T, N, 0>
d;
5734 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5741 template <
typename T,
size_t N>
5744 detail::IdxFromBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
5750 template <
typename T,
size_t N>
5753 detail::IdxFromNotBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
5762 template <
typename T>
5768 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
5780 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5786 template <
typename T>
5792 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
5804 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5808 if (
N < 16 /
sizeof(T)) {
5822 template <
typename T,
size_t N>
5825 uint64_t mask_bits = 0;
5826 constexpr
size_t kNumBytes = (
N + 7) / 8;
5827 CopyBytes<kNumBytes>(bits, &mask_bits);
5829 mask_bits &= (1ull <<
N) - 1;
5836 template <
typename T,
size_t N>
5845 template <
typename T,
size_t N>
5850 using TU =
TFromD<decltype(du)>;
5852 const size_t count =
PopCount(mask_bits);
5861 template <
typename T,
size_t N>
5865 uint64_t mask_bits = 0;
5866 constexpr
size_t kNumBytes = (
N + 7) / 8;
5867 CopyBytes<kNumBytes>(bits, &mask_bits);
5869 mask_bits &= (1ull <<
N) - 1;
5879 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5880 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5882 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
5886 #define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5887 #define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
5889 #if HWY_ARCH_ARM_A64
5890 #define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
5891 #define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
5894 #define HWY_IF_LOAD_INT(T, N) \
5895 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
5896 #define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
5897 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
5898 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
5899 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
5900 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
5901 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
5907 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5908 decltype(Tuple2<type##_t, size>().raw)
5910 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5911 const type##_t *from, Tuple2<type##_t, size>
5913 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5914 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5916 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5917 decltype(Tuple3<type##_t, size>().raw)
5918 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5919 const type##_t *from, Tuple3<type##_t, size>
5921 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5922 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5924 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5925 decltype(Tuple4<type##_t, size>().raw)
5926 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5927 const type##_t *from, Tuple4<type##_t, size>
5929 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5930 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5932 #undef HWY_NEON_DEF_FUNCTION_LOAD_INT
5933 #undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5934 #undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
5937 template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
5947 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5952 alignas(16) T buf[2 * 8 /
sizeof(T)] = {};
5953 CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
5955 v0 = Vec128<T, N>(raw.val[0]);
5956 v1 = Vec128<T, N>(raw.val[1]);
5961 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
5963 Vec128<T>& v0, Vec128<T>& v1) {
5964 const Half<decltype(
d)> dh;
5965 VFromD<decltype(dh)> v00, v10, v01, v11;
5975 template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
5986 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5989 Vec128<T, N>& v1, Vec128<T, N>& v2) {
5991 alignas(16) T buf[3 * 8 /
sizeof(T)] = {};
5992 CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
5994 v0 = Vec128<T, N>(raw.val[0]);
5995 v1 = Vec128<T, N>(raw.val[1]);
5996 v2 = Vec128<T, N>(raw.val[2]);
6001 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6003 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
6004 const Half<decltype(
d)> dh;
6005 VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
6016 template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6029 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6032 Vec128<T, N>& v1, Vec128<T, N>& v2,
6034 alignas(16) T buf[4 * 8 /
sizeof(T)] = {};
6035 CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
6037 v0 = Vec128<T, N>(raw.val[0]);
6038 v1 = Vec128<T, N>(raw.val[1]);
6039 v2 = Vec128<T, N>(raw.val[2]);
6040 v3 = Vec128<T, N>(raw.val[3]);
6045 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6047 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
6049 const Half<decltype(
d)> dh;
6050 VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
6060 #undef HWY_IF_LOAD_INT
6065 #define HWY_NEON_BUILD_TPL_HWY_STORE_INT
6066 #define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
6067 #define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
6069 #if HWY_ARCH_ARM_A64
6070 #define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
6071 #define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6074 #define HWY_IF_STORE_INT(T, N) \
6075 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6076 #define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
6077 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6078 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6079 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6080 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6081 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6084 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6085 Tuple2<type##_t, size> tup, type##_t *to
6087 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6089 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6090 Tuple3<type##_t, size> tup, type##_t *to
6092 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6094 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6095 Tuple4<type##_t, size> tup, type##_t *to
6097 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6099 #undef HWY_NEON_DEF_FUNCTION_STORE_INT
6100 #undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
6101 #undef HWY_NEON_BUILD_RET_HWY_STORE_INT
6102 #undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
6105 template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6114 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6118 alignas(16) T buf[2 * 8 /
sizeof(T)];
6119 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6121 CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
6126 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6129 const Half<decltype(
d)> dh;
6137 template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6146 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6148 const Vec128<T, N> v2, Simd<T, N, 0> ,
6150 alignas(16) T buf[3 * 8 /
sizeof(T)];
6151 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6153 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
6158 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6160 const Vec128<T> v2, Full128<T>
d,
6162 const Half<decltype(
d)> dh;
6172 template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6182 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6184 const Vec128<T, N> v2,
const Vec128<T, N> v3,
6187 alignas(16) T buf[4 * 8 /
sizeof(T)];
6188 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6190 CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
6195 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6197 const Vec128<T> v2,
const Vec128<T> v3,
6199 const Half<decltype(
d)> dh;
6207 #undef HWY_IF_STORE_INT
6211 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6214 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"Use u64");
6239 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6387 #undef HWY_NEON_BUILD_ARG_1
6388 #undef HWY_NEON_BUILD_ARG_2
6389 #undef HWY_NEON_BUILD_ARG_3
6390 #undef HWY_NEON_BUILD_PARAM_1
6391 #undef HWY_NEON_BUILD_PARAM_2
6392 #undef HWY_NEON_BUILD_PARAM_3
6393 #undef HWY_NEON_BUILD_RET_1
6394 #undef HWY_NEON_BUILD_RET_2
6395 #undef HWY_NEON_BUILD_RET_3
6396 #undef HWY_NEON_BUILD_TPL_1
6397 #undef HWY_NEON_BUILD_TPL_2
6398 #undef HWY_NEON_BUILD_TPL_3
6399 #undef HWY_NEON_DEF_FUNCTION
6400 #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
6401 #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
6402 #undef HWY_NEON_DEF_FUNCTION_FLOAT_64
6403 #undef HWY_NEON_DEF_FUNCTION_INTS
6404 #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
6405 #undef HWY_NEON_DEF_FUNCTION_INT_16
6406 #undef HWY_NEON_DEF_FUNCTION_INT_32
6407 #undef HWY_NEON_DEF_FUNCTION_INT_8
6408 #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
6409 #undef HWY_NEON_DEF_FUNCTION_TPL
6410 #undef HWY_NEON_DEF_FUNCTION_UIF81632
6411 #undef HWY_NEON_DEF_FUNCTION_UINTS
6412 #undef HWY_NEON_DEF_FUNCTION_UINT_16
6413 #undef HWY_NEON_DEF_FUNCTION_UINT_32
6414 #undef HWY_NEON_DEF_FUNCTION_UINT_8
6415 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
6416 #undef HWY_NEON_EVAL
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition: arm_neon-inl.h:159
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:182
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition: arm_neon-inl.h:192
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:138
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:133
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:6076
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:91
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:187
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:121
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:165
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)
Definition: arm_neon-inl.h:2385
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:107
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:114
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)
Definition: arm_neon-inl.h:196
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:177
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:99
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:127
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:5896
#define HWY_IF_FLOAT(T)
Definition: base.h:343
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
HWY_INLINE Mask128()
Definition: arm_neon-inl.h:809
Mask128(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition: arm_neon-inl.h:812
Raw raw
Definition: arm_neon-inl.h:814
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:806
Mask128 & operator=(const Mask128 &)=default
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: arm_neon-inl.h:783
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: arm_neon-inl.h:786
HWY_INLINE Vec128()
Definition: arm_neon-inl.h:764
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: arm_neon-inl.h:774
HWY_INLINE Vec128(const Raw raw)
Definition: arm_neon-inl.h:767
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: arm_neon-inl.h:789
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: arm_neon-inl.h:771
Vec128(const Vec128 &)=default
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: arm_neon-inl.h:777
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: arm_neon-inl.h:780
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:2039
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3345
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_INLINE Vec128< T, N > Set64(Simd< T, N, 0 >, uint64_t mask_bits)
Definition: arm_neon-inl.h:4952
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:3219
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition: arm_neon-inl.h:1884
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:3208
HWY_INLINE uint64_t NibblesFromMask(const Full128< T > d, Mask128< T > mask)
Definition: arm_neon-inl.h:5019
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition: x86_256-inl.h:4283
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition: arm_neon-inl.h:5325
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) template< size_t N > HWY_INLINE Vec128< uint8_t
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition: arm_neon-inl.h:1733
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:721
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1574
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2510
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2502
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:855
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition: arm_neon-inl.h:1388
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
constexpr HWY_API size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4189
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4164
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, 2 > ConcatEven(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4474
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API Vec128< T, 2 > ConcatOdd(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4443
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
constexpr float MantissaEnd< float >()
Definition: base.h:636
double float64_t
Definition: base.h:258
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
float float32_t
Definition: base.h:257
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: arm_neon-inl.h:5318
Definition: arm_neon-inl.h:3883
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3884
Definition: ops/shared-inl.h:40
Definition: arm_neon-inl.h:823
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: arm_neon-inl.h:825
uint16x4_t type
Definition: arm_neon-inl.h:688
uint16x8_t type
Definition: arm_neon-inl.h:625
uint16x4_t type
Definition: arm_neon-inl.h:683
uint16x8_t type
Definition: arm_neon-inl.h:620
float32x2_t type
Definition: arm_neon-inl.h:693
float32x4_t type
Definition: arm_neon-inl.h:630
int16x4_t type
Definition: arm_neon-inl.h:668
int16x8_t type
Definition: arm_neon-inl.h:605
int32x2_t type
Definition: arm_neon-inl.h:673
int32x4_t type
Definition: arm_neon-inl.h:610
int64x1_t type
Definition: arm_neon-inl.h:678
int64x2_t type
Definition: arm_neon-inl.h:615
int8x16_t type
Definition: arm_neon-inl.h:600
int8x8_t type
Definition: arm_neon-inl.h:663
uint16x4_t type
Definition: arm_neon-inl.h:648
uint16x8_t type
Definition: arm_neon-inl.h:585
uint32x2_t type
Definition: arm_neon-inl.h:653
uint32x4_t type
Definition: arm_neon-inl.h:590
uint64x1_t type
Definition: arm_neon-inl.h:658
uint64x2_t type
Definition: arm_neon-inl.h:595
uint8x16_t type
Definition: arm_neon-inl.h:580
uint8x8_t type
Definition: arm_neon-inl.h:643
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:56
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3561
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3568
Definition: arm_neon-inl.h:3539
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition: arm_neon-inl.h:3542
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3549
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3591
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3598
Definition: arm_neon-inl.h:3574
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition: arm_neon-inl.h:3576
uint16x8x2_t raw
Definition: arm_neon-inl.h:346
uint16x4x2_t raw
Definition: arm_neon-inl.h:350
uint16x8x2_t raw
Definition: arm_neon-inl.h:338
uint16x4x2_t raw
Definition: arm_neon-inl.h:342
float32x4x2_t raw
Definition: arm_neon-inl.h:355
float32x2x2_t raw
Definition: arm_neon-inl.h:359
int16x8x2_t raw
Definition: arm_neon-inl.h:297
int16x4x2_t raw
Definition: arm_neon-inl.h:301
int32x4x2_t raw
Definition: arm_neon-inl.h:313
int32x2x2_t raw
Definition: arm_neon-inl.h:317
int64x2x2_t raw
Definition: arm_neon-inl.h:329
int64x1x2_t raw
Definition: arm_neon-inl.h:333
int8x16x2_t raw
Definition: arm_neon-inl.h:281
int8x8x2_t raw
Definition: arm_neon-inl.h:285
uint16x8x2_t raw
Definition: arm_neon-inl.h:289
uint16x4x2_t raw
Definition: arm_neon-inl.h:293
uint32x4x2_t raw
Definition: arm_neon-inl.h:305
uint32x2x2_t raw
Definition: arm_neon-inl.h:309
uint64x2x2_t raw
Definition: arm_neon-inl.h:321
uint64x1x2_t raw
Definition: arm_neon-inl.h:325
uint8x16x2_t raw
Definition: arm_neon-inl.h:273
uint8x8x2_t raw
Definition: arm_neon-inl.h:277
Definition: arm_neon-inl.h:265
uint16x8x3_t raw
Definition: arm_neon-inl.h:447
uint16x4x3_t raw
Definition: arm_neon-inl.h:451
uint16x8x3_t raw
Definition: arm_neon-inl.h:439
uint16x4x3_t raw
Definition: arm_neon-inl.h:443
float32x4x3_t raw
Definition: arm_neon-inl.h:456
float32x2x3_t raw
Definition: arm_neon-inl.h:460
int16x8x3_t raw
Definition: arm_neon-inl.h:398
int16x4x3_t raw
Definition: arm_neon-inl.h:402
int32x4x3_t raw
Definition: arm_neon-inl.h:414
int32x2x3_t raw
Definition: arm_neon-inl.h:418
int64x2x3_t raw
Definition: arm_neon-inl.h:430
int64x1x3_t raw
Definition: arm_neon-inl.h:434
int8x16x3_t raw
Definition: arm_neon-inl.h:382
int8x8x3_t raw
Definition: arm_neon-inl.h:386
uint16x8x3_t raw
Definition: arm_neon-inl.h:390
uint16x4x3_t raw
Definition: arm_neon-inl.h:394
uint32x4x3_t raw
Definition: arm_neon-inl.h:406
uint32x2x3_t raw
Definition: arm_neon-inl.h:410
uint64x2x3_t raw
Definition: arm_neon-inl.h:422
uint64x1x3_t raw
Definition: arm_neon-inl.h:426
uint8x16x3_t raw
Definition: arm_neon-inl.h:374
uint8x8x3_t raw
Definition: arm_neon-inl.h:378
Definition: arm_neon-inl.h:267
uint16x8x4_t raw
Definition: arm_neon-inl.h:548
uint16x4x4_t raw
Definition: arm_neon-inl.h:552
uint16x8x4_t raw
Definition: arm_neon-inl.h:540
uint16x4x4_t raw
Definition: arm_neon-inl.h:544
float32x4x4_t raw
Definition: arm_neon-inl.h:557
float32x2x4_t raw
Definition: arm_neon-inl.h:561
int16x8x4_t raw
Definition: arm_neon-inl.h:499
int16x4x4_t raw
Definition: arm_neon-inl.h:503
int32x4x4_t raw
Definition: arm_neon-inl.h:515
int32x2x4_t raw
Definition: arm_neon-inl.h:519
int64x2x4_t raw
Definition: arm_neon-inl.h:531
int64x1x4_t raw
Definition: arm_neon-inl.h:535
int8x16x4_t raw
Definition: arm_neon-inl.h:483
int8x8x4_t raw
Definition: arm_neon-inl.h:487
uint16x8x4_t raw
Definition: arm_neon-inl.h:491
uint16x4x4_t raw
Definition: arm_neon-inl.h:495
uint32x4x4_t raw
Definition: arm_neon-inl.h:507
uint32x2x4_t raw
Definition: arm_neon-inl.h:511
uint64x2x4_t raw
Definition: arm_neon-inl.h:523
uint64x1x4_t raw
Definition: arm_neon-inl.h:527
uint8x16x4_t raw
Definition: arm_neon-inl.h:475
uint8x8x4_t raw
Definition: arm_neon-inl.h:479
Definition: arm_neon-inl.h:269