arm_neon-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 128-bit ARM64 NEON vectors and operations.
17 // External include guard in highway.h - see comment there.
18 
19 // ARM NEON intrinsics are documented at:
20 // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
21 
22 #include <arm_neon.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 
26 #include "hwy/base.h"
27 #include "hwy/ops/shared-inl.h"
28 
29 HWY_BEFORE_NAMESPACE();
30 namespace hwy {
31 namespace HWY_NAMESPACE {
32 
33 namespace detail { // for code folding and Raw128
34 
35 // Macros used to define single and double function calls for multiple types
36 // for full and half vectors. These macros are undefined at the end of the file.
37 
38 // HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
39 #define HWY_NEON_BUILD_TPL_1
40 #define HWY_NEON_BUILD_TPL_2
41 #define HWY_NEON_BUILD_TPL_3
42 
43 // HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
44 // extend it to int32x4x2_t packs.
45 #define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
46 #define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
47 #define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
48 
49 // HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
50 #define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
51 #define HWY_NEON_BUILD_PARAM_2(type, size) \
52  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
53 #define HWY_NEON_BUILD_PARAM_3(type, size) \
54  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
55  const Vec128<type##_t, size> c
56 
57 // HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
58 // function.
59 #define HWY_NEON_BUILD_ARG_1 a.raw
60 #define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
61 #define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
62 
63 // We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
64 // the __VA_ARGS__ have been expanded. This allows "func" itself to be a
65 // macro, as is the case for some of the library "functions" such as
66 // vshlq_u8. For example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS
67 // is defined as "a, b" (without the quotes), expands to "vshlq_u8(a, b)".
68 // Directly writing vshlq_u8(MY_PARAMS) would fail because the vshlq_u8()
69 // macro expects two arguments.
70 #define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
71 
72 // Main macro definition that defines a single function for the given type and
73 // size of vector, using the underlying (prefix##infix##suffix) function and
74 // the template, return type, parameters and arguments defined by the "args"
75 // parameters passed here (see HWY_NEON_BUILD_* macros defined before).
76 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
77  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
78  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
79  name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
80  return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
81  HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
82  }
83 
84 // The HWY_NEON_DEF_FUNCTION_* macros define all variants of a function
85 // called "name" in terms of the set of NEON intrinsics starting with the
86 // given "prefix", for the types and vector sizes listed next to each
87 // macro. For example, the prefix "vsub" with args=2 defines all overloads
88 // of operator-.
89 
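// Illustrative expansion (added for documentation, not part of the original
// header): with args=2, the first entry generated by
// HWY_NEON_DEF_FUNCTION_UINT_8(operator-, vsub, _, 2), namely
// HWY_NEON_DEF_FUNCTION(uint8, 16, operator-, vsubq, _, u8, 2), expands to
// roughly the following wrapper, which forwards the raw NEON registers to
// vsubq_u8:
//
//   HWY_API Vec128<uint8_t, 16> operator-(const Vec128<uint8_t, 16> a,
//                                         const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vsubq_u8(a.raw, b.raw));
//   }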
90 // uint8_t
91 #define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
92  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
93  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \
94  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \
95  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \
96  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
97 
98 // int8_t
99 #define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
100  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
101  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \
102  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \
103  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \
104  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
105 
106 // uint16_t
107 #define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
108  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
109  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \
110  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \
111  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
112 
113 // int16_t
114 #define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
115  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
116  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \
117  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \
118  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
119 
120 // uint32_t
121 #define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
122  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
123  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \
124  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
125 
126 // int32_t
127 #define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
128  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
129  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \
130  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
131 
132 // uint64_t
133 #define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
134  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
135  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
136 
137 // int64_t
138 #define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
139  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
140  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
141 
142 // float
143 #define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
144  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
145  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \
146  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
147 
148 // double
149 #if HWY_ARCH_ARM_A64
150 #define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \
151  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
152  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
153 #else
154 #define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
155 #endif
156 
157 // float and double
158 
159 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
160  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
161  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
162 
163 // Helper macros to define for more than one type.
164 // uint8_t, uint16_t and uint32_t
165 #define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
166  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
167  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
168  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
169 
170 // int8_t, int16_t and int32_t
171 #define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
172  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
173  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
174  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
175 
176 // uint8_t, uint16_t, uint32_t and uint64_t
177 #define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
178  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
179  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
180 
181 // int8_t, int16_t, int32_t and int64_t
182 #define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
183  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
184  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
185 
186 // All int*_t and uint*_t up to 64
187 #define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
188  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
189  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
190 
191 // All previous types.
192 #define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
193  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
194  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
195 
196 #define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
197  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
198  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
199  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
200 
201 // Emulation of some intrinsics on armv7.
202 #if HWY_ARCH_ARM_V7
203 #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
204 #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
205 #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
206 #define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
207 #define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
208 #define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
209 #define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
210 #define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
211 #define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
212 #define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
213 #define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
214 #define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
215 #define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
216 #define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
217 #define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
218 #define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
219 #define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
220 #define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
221 #define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
222 #define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
223 #define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
224 #define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
225 #define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
226 #define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
227 #define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
228 #define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
229 #define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
230 #define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
231 #define vzip1_s8(x, y) vzip_s8(x, y).val[0]
232 #define vzip1_u8(x, y) vzip_u8(x, y).val[0]
233 #define vzip1_s16(x, y) vzip_s16(x, y).val[0]
234 #define vzip1_u16(x, y) vzip_u16(x, y).val[0]
235 #define vzip1_f32(x, y) vzip_f32(x, y).val[0]
236 #define vzip1_u32(x, y) vzip_u32(x, y).val[0]
237 #define vzip1_s32(x, y) vzip_s32(x, y).val[0]
238 #define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
239 #define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
240 #define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
241 #define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
242 #define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
243 #define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
244 #define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
245 #define vzip2_s8(x, y) vzip_s8(x, y).val[1]
246 #define vzip2_u8(x, y) vzip_u8(x, y).val[1]
247 #define vzip2_s16(x, y) vzip_s16(x, y).val[1]
248 #define vzip2_u16(x, y) vzip_u16(x, y).val[1]
249 #define vzip2_s32(x, y) vzip_s32(x, y).val[1]
250 #define vzip2_u32(x, y) vzip_u32(x, y).val[1]
251 #define vzip2_f32(x, y) vzip_f32(x, y).val[1]
252 #define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
253 #define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
254 #define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
255 #define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
256 #define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
257 #define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
258 #define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
259 #endif
260 
261 // Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 overloads
262 // for all vector types, even those (bfloat16_t) where the underlying vector is
263 // the same as others (uint16_t).
264 template <typename T, size_t N>
265 struct Tuple2;
266 template <typename T, size_t N>
267 struct Tuple3;
268 template <typename T, size_t N>
269 struct Tuple4;
270 
271 template <>
272 struct Tuple2<uint8_t, 16> {
273  uint8x16x2_t raw;
274 };
275 template <size_t N>
276 struct Tuple2<uint8_t, N> {
277  uint8x8x2_t raw;
278 };
279 template <>
280 struct Tuple2<int8_t, 16> {
281  int8x16x2_t raw;
282 };
283 template <size_t N>
284 struct Tuple2<int8_t, N> {
285  int8x8x2_t raw;
286 };
287 template <>
288 struct Tuple2<uint16_t, 8> {
289  uint16x8x2_t raw;
290 };
291 template <size_t N>
292 struct Tuple2<uint16_t, N> {
293  uint16x4x2_t raw;
294 };
295 template <>
296 struct Tuple2<int16_t, 8> {
297  int16x8x2_t raw;
298 };
299 template <size_t N>
300 struct Tuple2<int16_t, N> {
301  int16x4x2_t raw;
302 };
303 template <>
304 struct Tuple2<uint32_t, 4> {
305  uint32x4x2_t raw;
306 };
307 template <size_t N>
308 struct Tuple2<uint32_t, N> {
309  uint32x2x2_t raw;
310 };
311 template <>
312 struct Tuple2<int32_t, 4> {
313  int32x4x2_t raw;
314 };
315 template <size_t N>
316 struct Tuple2<int32_t, N> {
317  int32x2x2_t raw;
318 };
319 template <>
320 struct Tuple2<uint64_t, 2> {
321  uint64x2x2_t raw;
322 };
323 template <size_t N>
324 struct Tuple2<uint64_t, N> {
325  uint64x1x2_t raw;
326 };
327 template <>
328 struct Tuple2<int64_t, 2> {
329  int64x2x2_t raw;
330 };
331 template <size_t N>
332 struct Tuple2<int64_t, N> {
333  int64x1x2_t raw;
334 };
335 
336 template <>
337 struct Tuple2<float16_t, 8> {
338  uint16x8x2_t raw;
339 };
340 template <size_t N>
341 struct Tuple2<float16_t, N> {
342  uint16x4x2_t raw;
343 };
344 template <>
345 struct Tuple2<bfloat16_t, 8> {
346  uint16x8x2_t raw;
347 };
348 template <size_t N>
349 struct Tuple2<bfloat16_t, N> {
350  uint16x4x2_t raw;
351 };
352 
353 template <>
354 struct Tuple2<float32_t, 4> {
355  float32x4x2_t raw;
356 };
357 template <size_t N>
358 struct Tuple2<float32_t, N> {
359  float32x2x2_t raw;
360 };
361 #if HWY_ARCH_ARM_A64
362 template <>
363 struct Tuple2<float64_t, 2> {
364  float64x2x2_t raw;
365 };
366 template <size_t N>
367 struct Tuple2<float64_t, N> {
368  float64x1x2_t raw;
369 };
370 #endif // HWY_ARCH_ARM_A64
371 
372 template <>
373 struct Tuple3<uint8_t, 16> {
374  uint8x16x3_t raw;
375 };
376 template <size_t N>
377 struct Tuple3<uint8_t, N> {
378  uint8x8x3_t raw;
379 };
380 template <>
381 struct Tuple3<int8_t, 16> {
382  int8x16x3_t raw;
383 };
384 template <size_t N>
385 struct Tuple3<int8_t, N> {
386  int8x8x3_t raw;
387 };
388 template <>
389 struct Tuple3<uint16_t, 8> {
390  uint16x8x3_t raw;
391 };
392 template <size_t N>
393 struct Tuple3<uint16_t, N> {
394  uint16x4x3_t raw;
395 };
396 template <>
397 struct Tuple3<int16_t, 8> {
398  int16x8x3_t raw;
399 };
400 template <size_t N>
401 struct Tuple3<int16_t, N> {
402  int16x4x3_t raw;
403 };
404 template <>
405 struct Tuple3<uint32_t, 4> {
406  uint32x4x3_t raw;
407 };
408 template <size_t N>
409 struct Tuple3<uint32_t, N> {
410  uint32x2x3_t raw;
411 };
412 template <>
413 struct Tuple3<int32_t, 4> {
414  int32x4x3_t raw;
415 };
416 template <size_t N>
417 struct Tuple3<int32_t, N> {
418  int32x2x3_t raw;
419 };
420 template <>
421 struct Tuple3<uint64_t, 2> {
422  uint64x2x3_t raw;
423 };
424 template <size_t N>
425 struct Tuple3<uint64_t, N> {
426  uint64x1x3_t raw;
427 };
428 template <>
429 struct Tuple3<int64_t, 2> {
430  int64x2x3_t raw;
431 };
432 template <size_t N>
433 struct Tuple3<int64_t, N> {
434  int64x1x3_t raw;
435 };
436 
437 template <>
438 struct Tuple3<float16_t, 8> {
439  uint16x8x3_t raw;
440 };
441 template <size_t N>
442 struct Tuple3<float16_t, N> {
443  uint16x4x3_t raw;
444 };
445 template <>
446 struct Tuple3<bfloat16_t, 8> {
447  uint16x8x3_t raw;
448 };
449 template <size_t N>
450 struct Tuple3<bfloat16_t, N> {
451  uint16x4x3_t raw;
452 };
453 
454 template <>
455 struct Tuple3<float32_t, 4> {
456  float32x4x3_t raw;
457 };
458 template <size_t N>
459 struct Tuple3<float32_t, N> {
460  float32x2x3_t raw;
461 };
462 #if HWY_ARCH_ARM_A64
463 template <>
464 struct Tuple3<float64_t, 2> {
465  float64x2x3_t raw;
466 };
467 template <size_t N>
468 struct Tuple3<float64_t, N> {
469  float64x1x3_t raw;
470 };
471 #endif // HWY_ARCH_ARM_A64
472 
473 template <>
474 struct Tuple4<uint8_t, 16> {
475  uint8x16x4_t raw;
476 };
477 template <size_t N>
478 struct Tuple4<uint8_t, N> {
479  uint8x8x4_t raw;
480 };
481 template <>
482 struct Tuple4<int8_t, 16> {
483  int8x16x4_t raw;
484 };
485 template <size_t N>
486 struct Tuple4<int8_t, N> {
487  int8x8x4_t raw;
488 };
489 template <>
490 struct Tuple4<uint16_t, 8> {
491  uint16x8x4_t raw;
492 };
493 template <size_t N>
494 struct Tuple4<uint16_t, N> {
495  uint16x4x4_t raw;
496 };
497 template <>
498 struct Tuple4<int16_t, 8> {
499  int16x8x4_t raw;
500 };
501 template <size_t N>
502 struct Tuple4<int16_t, N> {
503  int16x4x4_t raw;
504 };
505 template <>
506 struct Tuple4<uint32_t, 4> {
507  uint32x4x4_t raw;
508 };
509 template <size_t N>
510 struct Tuple4<uint32_t, N> {
511  uint32x2x4_t raw;
512 };
513 template <>
514 struct Tuple4<int32_t, 4> {
515  int32x4x4_t raw;
516 };
517 template <size_t N>
518 struct Tuple4<int32_t, N> {
519  int32x2x4_t raw;
520 };
521 template <>
522 struct Tuple4<uint64_t, 2> {
523  uint64x2x4_t raw;
524 };
525 template <size_t N>
526 struct Tuple4<uint64_t, N> {
527  uint64x1x4_t raw;
528 };
529 template <>
530 struct Tuple4<int64_t, 2> {
531  int64x2x4_t raw;
532 };
533 template <size_t N>
534 struct Tuple4<int64_t, N> {
535  int64x1x4_t raw;
536 };
537 
538 template <>
539 struct Tuple4<float16_t, 8> {
540  uint16x8x4_t raw;
541 };
542 template <size_t N>
543 struct Tuple4<float16_t, N> {
544  uint16x4x4_t raw;
545 };
546 template <>
547 struct Tuple4<bfloat16_t, 8> {
548  uint16x8x4_t raw;
549 };
550 template <size_t N>
551 struct Tuple4<bfloat16_t, N> {
552  uint16x4x4_t raw;
553 };
554 
555 template <>
556 struct Tuple4<float32_t, 4> {
557  float32x4x4_t raw;
558 };
559 template <size_t N>
560 struct Tuple4<float32_t, N> {
561  float32x2x4_t raw;
562 };
563 #if HWY_ARCH_ARM_A64
564 template <>
565 struct Tuple4<float64_t, 2> {
566  float64x2x4_t raw;
567 };
568 template <size_t N>
569 struct Tuple4<float64_t, N> {
570  float64x1x4_t raw;
571 };
572 #endif // HWY_ARCH_ARM_A64
573 
574 template <typename T, size_t N>
575 struct Raw128;
576 
577 // 128
578 template <>
579 struct Raw128<uint8_t, 16> {
580  using type = uint8x16_t;
581 };
582 
583 template <>
584 struct Raw128<uint16_t, 8> {
585  using type = uint16x8_t;
586 };
587 
588 template <>
589 struct Raw128<uint32_t, 4> {
590  using type = uint32x4_t;
591 };
592 
593 template <>
594 struct Raw128<uint64_t, 2> {
595  using type = uint64x2_t;
596 };
597 
598 template <>
599 struct Raw128<int8_t, 16> {
600  using type = int8x16_t;
601 };
602 
603 template <>
604 struct Raw128<int16_t, 8> {
605  using type = int16x8_t;
606 };
607 
608 template <>
609 struct Raw128<int32_t, 4> {
610  using type = int32x4_t;
611 };
612 
613 template <>
614 struct Raw128<int64_t, 2> {
615  using type = int64x2_t;
616 };
617 
618 template <>
619 struct Raw128<float16_t, 8> {
620  using type = uint16x8_t;
621 };
622 
623 template <>
624 struct Raw128<bfloat16_t, 8> {
625  using type = uint16x8_t;
626 };
627 
628 template <>
629 struct Raw128<float, 4> {
630  using type = float32x4_t;
631 };
632 
633 #if HWY_ARCH_ARM_A64
634 template <>
635 struct Raw128<double, 2> {
636  using type = float64x2_t;
637 };
638 #endif
639 
640 // 64
641 template <>
642 struct Raw128<uint8_t, 8> {
643  using type = uint8x8_t;
644 };
645 
646 template <>
647 struct Raw128<uint16_t, 4> {
648  using type = uint16x4_t;
649 };
650 
651 template <>
652 struct Raw128<uint32_t, 2> {
653  using type = uint32x2_t;
654 };
655 
656 template <>
657 struct Raw128<uint64_t, 1> {
658  using type = uint64x1_t;
659 };
660 
661 template <>
662 struct Raw128<int8_t, 8> {
663  using type = int8x8_t;
664 };
665 
666 template <>
667 struct Raw128<int16_t, 4> {
668  using type = int16x4_t;
669 };
670 
671 template <>
672 struct Raw128<int32_t, 2> {
673  using type = int32x2_t;
674 };
675 
676 template <>
677 struct Raw128<int64_t, 1> {
678  using type = int64x1_t;
679 };
680 
681 template <>
682 struct Raw128<float16_t, 4> {
683  using type = uint16x4_t;
684 };
685 
686 template <>
687 struct Raw128<bfloat16_t, 4> {
688  using type = uint16x4_t;
689 };
690 
691 template <>
692 struct Raw128<float, 2> {
693  using type = float32x2_t;
694 };
695 
696 #if HWY_ARCH_ARM_A64
697 template <>
698 struct Raw128<double, 1> {
699  using type = float64x1_t;
700 };
701 #endif
702 
703 // 32 (same as 64)
704 template <>
705 struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};
706 
707 template <>
708 struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};
709 
710 template <>
711 struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {};
712 
713 template <>
714 struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};
715 
716 template <>
717 struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {};
718 
719 template <>
720 struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};
721 
722 template <>
723 struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};
724 
725 template <>
726 struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};
727 
728 template <>
729 struct Raw128<float, 1> : public Raw128<float, 2> {};
730 
731 // 16 (same as 64)
732 template <>
733 struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};
734 
735 template <>
736 struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};
737 
738 template <>
739 struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};
740 
741 template <>
742 struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};
743 
744 template <>
745 struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};
746 
747 template <>
748 struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};
749 
750 // 8 (same as 64)
751 template <>
752 struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};
753 
754 template <>
755 struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};
756 
757 } // namespace detail
758 
759 template <typename T, size_t N = 16 / sizeof(T)>
760 class Vec128 {
761  using Raw = typename detail::Raw128<T, N>::type;
762 
763  public:
764  HWY_INLINE Vec128() {}
765  Vec128(const Vec128&) = default;
766  Vec128& operator=(const Vec128&) = default;
767  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
768 
769  // Compound assignment. Only usable if there is a corresponding non-member
770  // binary operator overload. For example, only f32 and f64 support division.
771  HWY_INLINE Vec128& operator*=(const Vec128 other) {
772  return *this = (*this * other);
773  }
774  HWY_INLINE Vec128& operator/=(const Vec128 other) {
775  return *this = (*this / other);
776  }
777  HWY_INLINE Vec128& operator+=(const Vec128 other) {
778  return *this = (*this + other);
779  }
780  HWY_INLINE Vec128& operator-=(const Vec128 other) {
781  return *this = (*this - other);
782  }
783  HWY_INLINE Vec128& operator&=(const Vec128 other) {
784  return *this = (*this & other);
785  }
786  HWY_INLINE Vec128& operator|=(const Vec128 other) {
787  return *this = (*this | other);
788  }
789  HWY_INLINE Vec128& operator^=(const Vec128 other) {
790  return *this = (*this ^ other);
791  }
792 
793  Raw raw;
794 };
795 
796 template <typename T>
797 using Vec64 = Vec128<T, 8 / sizeof(T)>;
798 
799 template <typename T>
800 using Vec32 = Vec128<T, 4 / sizeof(T)>;
801 
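// Illustrative note (added for documentation, not part of the original
// header): the second template argument is the number of lanes, so the
// aliases above map to:
//
//   Vec128<float>   // float32x4_t, 4 lanes (full 128-bit vector)
//   Vec64<float>    // Vec128<float, 2>, float32x2_t, 2 lanes
//   Vec32<float>    // Vec128<float, 1>, one lane held in a 64-bit register
//   Vec64<uint8_t>  // Vec128<uint8_t, 8>, uint8x8_t, 8 lanes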
802 // FF..FF or 0.
803 template <typename T, size_t N = 16 / sizeof(T)>
804 class Mask128 {
805  // ARM C Language Extensions return and expect unsigned type.
806  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
807 
808  public:
809  HWY_INLINE Mask128() {}
810  Mask128(const Mask128&) = default;
811  Mask128& operator=(const Mask128&) = default;
812  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}
813 
814  Raw raw;
815 };
816 
817 template <typename T>
818 using Mask64 = Mask128<T, 8 / sizeof(T)>;
819 
820 namespace detail {
821 
822 // Deduce Simd<T, N, 0> from Vec128<T, N>
823 struct DeduceD {
824  template <typename T, size_t N>
825  Simd<T, N, 0> operator()(Vec128<T, N>) const {
826  return Simd<T, N, 0>();
827  }
828 };
829 
830 } // namespace detail
831 
832 template <class V>
833 using DFromV = decltype(detail::DeduceD()(V()));
834 
835 template <class V>
836 using TFromV = TFromD<DFromV<V>>;
837 
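// Illustrative sketch (added for documentation, not part of the original
// header): DFromV/TFromV recover the descriptor (tag) and lane type from a
// vector type, e.g.:
//
//   using V = Vec128<int16_t, 4>;   // 4 lanes of int16_t
//   using D = DFromV<V>;            // Simd<int16_t, 4, 0>
//   using T = TFromV<V>;            // int16_t
//   const D d;                      // tag usable with Zero(d), Set(d, x), ...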
838 // ------------------------------ BitCast
839 
840 namespace detail {
841 
842 // Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
843 // vreinterpret*_u8_*() set of functions.
844 #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
845 #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
846  Vec128<uint8_t, size * sizeof(type##_t)>
847 #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
848 #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
849 
850 // Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
851 template <size_t N>
852 HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
853  return v;
854 }
855 
856 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
857  HWY_CAST_TO_U8)
858 HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
859 HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
860 HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
861 HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
862 
863 // Special cases for [b]float16_t, which have the same Raw as uint16_t.
864 template <size_t N>
865 HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
866  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
867 }
868 template <size_t N>
869 HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
870  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
871 }
872 
873 #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
874 #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
875 #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
876 #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
877 
878 template <size_t N>
879 HWY_INLINE Vec128<uint8_t, N> BitCastFromByte(Simd<uint8_t, N, 0> /* tag */,
880  Vec128<uint8_t, N> v) {
881  return v;
882 }
883 
884 // 64-bit or less:
885 
886 template <size_t N, HWY_IF_LE64(int8_t, N)>
889  return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw));
890 }
891 template <size_t N, HWY_IF_LE64(uint16_t, N)>
894  return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw));
895 }
896 template <size_t N, HWY_IF_LE64(int16_t, N)>
899  return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw));
900 }
901 template <size_t N, HWY_IF_LE64(uint32_t, N)>
904  return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw));
905 }
906 template <size_t N, HWY_IF_LE64(int32_t, N)>
909  return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw));
910 }
911 template <size_t N, HWY_IF_LE64(float, N)>
914  return Vec128<float, N>(vreinterpret_f32_u8(v.raw));
915 }
918  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
919 }
922  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
923 }
924 #if HWY_ARCH_ARM_A64
927  return Vec64<double>(vreinterpret_f64_u8(v.raw));
928 }
929 #endif
930 
931 // 128-bit full:
932 
934  Vec128<uint8_t> v) {
935  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
936 }
938  Vec128<uint8_t> v) {
939  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
940 }
942  Vec128<uint8_t> v) {
943  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
944 }
946  Vec128<uint8_t> v) {
947  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
948 }
950  Vec128<uint8_t> v) {
951  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
952 }
954  Vec128<uint8_t> v) {
955  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
956 }
958  Vec128<uint8_t> v) {
959  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
960 }
962  Vec128<uint8_t> v) {
963  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
964 }
965 
966 #if HWY_ARCH_ARM_A64
968  Vec128<uint8_t> v) {
969  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
970 }
971 #endif
972 
973 // Special cases for [b]float16_t, which have the same Raw as uint16_t.
974 template <size_t N>
978 }
979 template <size_t N>
983 }
984 
985 } // namespace detail
986 
987 template <typename T, size_t N, typename FromT>
988 HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
989  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
990  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
991 }
992 
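// Illustrative usage sketch (added for documentation; assumes this header is
// reached via hwy/highway.h and code runs inside HWY_NAMESPACE). BitCast
// reinterprets the bits without conversion; both vectors cover the same total
// byte width. For example, isolating the sign bits of f32 lanes:
//
//   const Full128<float> df;
//   const RebindToUnsigned<decltype(df)> du;   // Simd<uint32_t, 4, 0>
//   const auto sign = BitCast(du, v) & Set(du, 0x80000000u);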
993 // ------------------------------ Set
994 
995 // Returns a vector with all lanes set to "t".
996 #define HWY_NEON_BUILD_TPL_HWY_SET1
997 #define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
998 #define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
999  Simd<type##_t, size, 0> /* tag */, const type##_t t
1000 #define HWY_NEON_BUILD_ARG_HWY_SET1 t
1001 
1002 HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1)
1003 
1004 #undef HWY_NEON_BUILD_TPL_HWY_SET1
1005 #undef HWY_NEON_BUILD_RET_HWY_SET1
1006 #undef HWY_NEON_BUILD_PARAM_HWY_SET1
1007 #undef HWY_NEON_BUILD_ARG_HWY_SET1
1008 
1009 // Returns an all-zero vector.
1010 template <typename T, size_t N>
1011 HWY_API Vec128<T, N> Zero(Simd<T, N, 0> d) {
1012  return Set(d, 0);
1013 }
1014 
1015 template <size_t N>
1016 HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
1017  return Vec128<bfloat16_t, N>(Zero(Simd<uint16_t, N, 0>()).raw);
1018 }
1019 
1020 template <class D>
1021 using VFromD = decltype(Zero(D()));
1022 
1023 // Returns a vector with uninitialized elements.
1024 template <typename T, size_t N>
1025 HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
1026  HWY_DIAGNOSTICS(push)
1027  HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
1028  typename detail::Raw128<T, N>::type a;
1029  return Vec128<T, N>(a);
1030  HWY_DIAGNOSTICS(pop)
1031 }
1032 
1033 // Returns a vector with lane i=[0, N) set to "first" + i.
1034 template <typename T, size_t N, typename T2>
1035 Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
1036  HWY_ALIGN T lanes[16 / sizeof(T)];
1037  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
1038  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
1039  }
1040  return Load(d, lanes);
1041 }
1042 
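// Illustrative usage sketch (added for documentation, not part of the
// original header):
//
//   const Full128<int32_t> d;
//   const auto k1   = Set(d, 1);    // {1, 1, 1, 1}
//   const auto zero = Zero(d);      // {0, 0, 0, 0}
//   const auto ramp = Iota(d, 10);  // {10, 11, 12, 13}
//
// Note that Iota stores to an aligned buffer and reloads, so it is intended
// for initialization rather than for use in hot loops.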
1043 // ------------------------------ GetLane
1044 
1045 namespace detail {
1046 #define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
1047 #define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
1048 #define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
1049 #define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
1050 
1051 HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
1052 
1053 #undef HWY_NEON_BUILD_TPL_HWY_GET
1054 #undef HWY_NEON_BUILD_RET_HWY_GET
1055 #undef HWY_NEON_BUILD_PARAM_HWY_GET
1056 #undef HWY_NEON_BUILD_ARG_HWY_GET
1057 
1058 } // namespace detail
1059 
1060 template <class V>
1061 HWY_API TFromV<V> GetLane(const V v) {
1062  return detail::GetLane<0>(v);
1063 }
1064 
1065 // ------------------------------ ExtractLane
1066 
1067 // Requires one overload per vector length because GetLane<3> is a compile error
1068 // if v is a uint32x2_t.
1069 template <typename T>
1070 HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
1071  HWY_DASSERT(i == 0);
1072  (void)i;
1073  return detail::GetLane<0>(v);
1074 }
1075 
1076 template <typename T>
1077 HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
1078 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1079  if (__builtin_constant_p(i)) {
1080  switch (i) {
1081  case 0:
1082  return detail::GetLane<0>(v);
1083  case 1:
1084  return detail::GetLane<1>(v);
1085  }
1086  }
1087 #endif
1088  alignas(16) T lanes[2];
1089  Store(v, DFromV<decltype(v)>(), lanes);
1090  return lanes[i];
1091 }
1092 
1093 template <typename T>
1094 HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
1095 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1096  if (__builtin_constant_p(i)) {
1097  switch (i) {
1098  case 0:
1099  return detail::GetLane<0>(v);
1100  case 1:
1101  return detail::GetLane<1>(v);
1102  case 2:
1103  return detail::GetLane<2>(v);
1104  case 3:
1105  return detail::GetLane<3>(v);
1106  }
1107  }
1108 #endif
1109  alignas(16) T lanes[4];
1110  Store(v, DFromV<decltype(v)>(), lanes);
1111  return lanes[i];
1112 }
1113 
1114 template <typename T>
1115 HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
1116 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1117  if (__builtin_constant_p(i)) {
1118  switch (i) {
1119  case 0:
1120  return detail::GetLane<0>(v);
1121  case 1:
1122  return detail::GetLane<1>(v);
1123  case 2:
1124  return detail::GetLane<2>(v);
1125  case 3:
1126  return detail::GetLane<3>(v);
1127  case 4:
1128  return detail::GetLane<4>(v);
1129  case 5:
1130  return detail::GetLane<5>(v);
1131  case 6:
1132  return detail::GetLane<6>(v);
1133  case 7:
1134  return detail::GetLane<7>(v);
1135  }
1136  }
1137 #endif
1138  alignas(16) T lanes[8];
1139  Store(v, DFromV<decltype(v)>(), lanes);
1140  return lanes[i];
1141 }
1142 
1143 template <typename T>
1144 HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
1145 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1146  if (__builtin_constant_p(i)) {
1147  switch (i) {
1148  case 0:
1149  return detail::GetLane<0>(v);
1150  case 1:
1151  return detail::GetLane<1>(v);
1152  case 2:
1153  return detail::GetLane<2>(v);
1154  case 3:
1155  return detail::GetLane<3>(v);
1156  case 4:
1157  return detail::GetLane<4>(v);
1158  case 5:
1159  return detail::GetLane<5>(v);
1160  case 6:
1161  return detail::GetLane<6>(v);
1162  case 7:
1163  return detail::GetLane<7>(v);
1164  case 8:
1165  return detail::GetLane<8>(v);
1166  case 9:
1167  return detail::GetLane<9>(v);
1168  case 10:
1169  return detail::GetLane<10>(v);
1170  case 11:
1171  return detail::GetLane<11>(v);
1172  case 12:
1173  return detail::GetLane<12>(v);
1174  case 13:
1175  return detail::GetLane<13>(v);
1176  case 14:
1177  return detail::GetLane<14>(v);
1178  case 15:
1179  return detail::GetLane<15>(v);
1180  }
1181  }
1182 #endif
1183  alignas(16) T lanes[16];
1184  Store(v, DFromV<decltype(v)>(), lanes);
1185  return lanes[i];
1186 }
1187 
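// Illustrative usage sketch (added for documentation, not part of the
// original header): when the index is a compile-time constant, the switches
// above let the compiler emit a single lane-extract instruction; otherwise
// the vector is spilled to an aligned stack buffer and indexed.
//
//   const Full128<uint16_t> d;
//   const auto v = Iota(d, 0);               // {0, 1, 2, 3, 4, 5, 6, 7}
//   const uint16_t x3 = ExtractLane(v, 3);   // 3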
1188 // ------------------------------ InsertLane
1189 
1190 namespace detail {
1191 #define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
1192 #define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
1193 #define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1194  Vec128<type##_t, size> v, type##_t t
1195 #define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
1196 
1197 HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
1198 
1199 #undef HWY_NEON_BUILD_TPL_HWY_INSERT
1200 #undef HWY_NEON_BUILD_RET_HWY_INSERT
1201 #undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1202 #undef HWY_NEON_BUILD_ARG_HWY_INSERT
1203 
1204 } // namespace detail
1205 
1206 // Requires one overload per vector length because InsertLane<3> may be a
1207 // compile error.
1208 
1209 template <typename T>
1210 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
1211  HWY_DASSERT(i == 0);
1212  (void)i;
1213  return Set(DFromV<decltype(v)>(), t);
1214 }
1215 
1216 template <typename T>
1217 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
1218 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1219  if (__builtin_constant_p(i)) {
1220  switch (i) {
1221  case 0:
1222  return detail::InsertLane<0>(v, t);
1223  case 1:
1224  return detail::InsertLane<1>(v, t);
1225  }
1226  }
1227 #endif
1228  const DFromV<decltype(v)> d;
1229  alignas(16) T lanes[2];
1230  Store(v, d, lanes);
1231  lanes[i] = t;
1232  return Load(d, lanes);
1233 }
1234 
1235 template <typename T>
1236 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
1237 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1238  if (__builtin_constant_p(i)) {
1239  switch (i) {
1240  case 0:
1241  return detail::InsertLane<0>(v, t);
1242  case 1:
1243  return detail::InsertLane<1>(v, t);
1244  case 2:
1245  return detail::InsertLane<2>(v, t);
1246  case 3:
1247  return detail::InsertLane<3>(v, t);
1248  }
1249  }
1250 #endif
1251  const DFromV<decltype(v)> d;
1252  alignas(16) T lanes[4];
1253  Store(v, d, lanes);
1254  lanes[i] = t;
1255  return Load(d, lanes);
1256 }
1257 
1258 template <typename T>
1259 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
1260 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1261  if (__builtin_constant_p(i)) {
1262  switch (i) {
1263  case 0:
1264  return detail::InsertLane<0>(v, t);
1265  case 1:
1266  return detail::InsertLane<1>(v, t);
1267  case 2:
1268  return detail::InsertLane<2>(v, t);
1269  case 3:
1270  return detail::InsertLane<3>(v, t);
1271  case 4:
1272  return detail::InsertLane<4>(v, t);
1273  case 5:
1274  return detail::InsertLane<5>(v, t);
1275  case 6:
1276  return detail::InsertLane<6>(v, t);
1277  case 7:
1278  return detail::InsertLane<7>(v, t);
1279  }
1280  }
1281 #endif
1282  const DFromV<decltype(v)> d;
1283  alignas(16) T lanes[8];
1284  Store(v, d, lanes);
1285  lanes[i] = t;
1286  return Load(d, lanes);
1287 }
1288 
1289 template <typename T>
1290 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
1291 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1292  if (__builtin_constant_p(i)) {
1293  switch (i) {
1294  case 0:
1295  return detail::InsertLane<0>(v, t);
1296  case 1:
1297  return detail::InsertLane<1>(v, t);
1298  case 2:
1299  return detail::InsertLane<2>(v, t);
1300  case 3:
1301  return detail::InsertLane<3>(v, t);
1302  case 4:
1303  return detail::InsertLane<4>(v, t);
1304  case 5:
1305  return detail::InsertLane<5>(v, t);
1306  case 6:
1307  return detail::InsertLane<6>(v, t);
1308  case 7:
1309  return detail::InsertLane<7>(v, t);
1310  case 8:
1311  return detail::InsertLane<8>(v, t);
1312  case 9:
1313  return detail::InsertLane<9>(v, t);
1314  case 10:
1315  return detail::InsertLane<10>(v, t);
1316  case 11:
1317  return detail::InsertLane<11>(v, t);
1318  case 12:
1319  return detail::InsertLane<12>(v, t);
1320  case 13:
1321  return detail::InsertLane<13>(v, t);
1322  case 14:
1323  return detail::InsertLane<14>(v, t);
1324  case 15:
1325  return detail::InsertLane<15>(v, t);
1326  }
1327  }
1328 #endif
1329  const DFromV<decltype(v)> d;
1330  alignas(16) T lanes[16];
1331  Store(v, d, lanes);
1332  lanes[i] = t;
1333  return Load(d, lanes);
1334 }
1335 
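// Illustrative usage sketch (added for documentation, not part of the
// original header):
//
//   const Full128<float> d;
//   auto v = Zero(d);            // {0, 0, 0, 0}
//   v = InsertLane(v, 2, 5.0f);  // {0, 0, 5, 0}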
1336 // ================================================== ARITHMETIC
1337 
1338 // ------------------------------ Addition
1339 HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
1340 
1341 // ------------------------------ Subtraction
1342 HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
1343 
1344 // ------------------------------ SumsOf8
1345 
1346 HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
1347  return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
1348 }
1349 HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
1350  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
1351 }
1352 
1353 // ------------------------------ SaturatedAdd
1354 // Only defined for uint8_t, uint16_t and their signed versions, as in other
1355 // architectures.
1356 
1357 // Returns a + b clamped to the destination range.
1362 
1363 // ------------------------------ SaturatedSub
1364 
1365 // Returns a - b clamped to the destination range.
1366 HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2)
1367 HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2)
1368 HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2)
1369 HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2)
1370 
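// Illustrative usage sketch (added for documentation, not part of the
// original header): saturating ops clamp instead of wrapping.
//
//   const Full128<uint8_t> d;
//   SaturatedAdd(Set(d, 200), Set(d, 100));  // all lanes 255, not 44
//   SaturatedSub(Set(d, 10), Set(d, 20));    // all lanes 0, not 246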
1371 // Not part of API, used in implementation.
1372 namespace detail {
1377 } // namespace detail
1378 
1379 // ------------------------------ Average
1380 
1381 // Returns (a + b + 1) / 2
1382 HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
1383 HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
1384 
1385 // ------------------------------ Neg
1386 
1387 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
1388 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below
1389 
1390 HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) {
1391 #if HWY_ARCH_ARM_A64
1392  return Vec64<int64_t>(vneg_s64(v.raw));
1393 #else
1394  return Zero(Full64<int64_t>()) - v;
1395 #endif
1396 }
1397 
1398 HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
1399 #if HWY_ARCH_ARM_A64
1400  return Vec128<int64_t>(vnegq_s64(v.raw));
1401 #else
1402  return Zero(Full128<int64_t>()) - v;
1403 #endif
1404 }
1405 
1406 // ------------------------------ ShiftLeft
1407 
1408 // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
1409 #pragma push_macro("HWY_NEON_DEF_FUNCTION")
1410 #undef HWY_NEON_DEF_FUNCTION
1411 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
1412  template <int kBits> \
1413  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
1414  return kBits == 0 ? v \
1415  : Vec128<type##_t, size>(HWY_NEON_EVAL( \
1416  prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
1417  }
1418 
1419 HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)
1420 
1421 HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
1422 HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
1423 
1424 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
1425 
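// Illustrative usage sketch (added for documentation, not part of the
// original header): these take the shift count as a template parameter;
// ShiftRight is arithmetic for signed and logical for unsigned lanes.
//
//   const Full128<int32_t> d;
//   ShiftLeft<3>(Set(d, 1));    // all lanes 8
//   ShiftRight<1>(Set(d, -4));  // all lanes -2 (arithmetic shift)
//   ShiftLeft<0>(Set(d, 7));    // returns the input unchanged; vshlq_n
//                               // requires count >= 1, hence the special case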
1426 // ------------------------------ RotateRight (ShiftRight, Or)
1427 
1428 template <int kBits, size_t N>
1430  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1431  if (kBits == 0) return v;
1432  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
1433 }
1434 
1435 template <int kBits, size_t N>
1437  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1438  if (kBits == 0) return v;
1439  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
1440 }
1441 
1442 // NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
1443 // mechanism for checking for extensions to ARMv8.
1444 
1445 // ------------------------------ Shl
1446 
1448  const Vec128<uint8_t> bits) {
1449  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
1450 }
1451 template <size_t N, HWY_IF_LE64(uint8_t, N)>
1453  const Vec128<uint8_t, N> bits) {
1454  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
1455 }
1456 
1458  const Vec128<uint16_t> bits) {
1459  return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
1460 }
1461 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1463  const Vec128<uint16_t, N> bits) {
1464  return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
1465 }
1466 
1468  const Vec128<uint32_t> bits) {
1469  return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
1470 }
1471 template <size_t N, HWY_IF_LE64(uint32_t, N)>
1473  const Vec128<uint32_t, N> bits) {
1474  return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
1475 }
1476 
1478  const Vec128<uint64_t> bits) {
1479  return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
1480 }
1482  const Vec64<uint64_t> bits) {
1483  return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
1484 }
1485 
1487  const Vec128<int8_t> bits) {
1488  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
1489 }
1490 template <size_t N, HWY_IF_LE64(int8_t, N)>
1492  const Vec128<int8_t, N> bits) {
1493  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
1494 }
1495 
1497  const Vec128<int16_t> bits) {
1498  return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
1499 }
1500 template <size_t N, HWY_IF_LE64(int16_t, N)>
1502  const Vec128<int16_t, N> bits) {
1503  return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
1504 }
1505 
1507  const Vec128<int32_t> bits) {
1508  return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
1509 }
1510 template <size_t N, HWY_IF_LE64(int32_t, N)>
1512  const Vec128<int32_t, N> bits) {
1513  return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
1514 }
1515 
1517  const Vec128<int64_t> bits) {
1518  return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
1519 }
1521  const Vec64<int64_t> bits) {
1522  return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
1523 }
1524 
1525 // ------------------------------ Shr (Neg)
1526 
1528  const Vec128<uint8_t> bits) {
1529  const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
1530  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
1531 }
1532 template <size_t N, HWY_IF_LE64(uint8_t, N)>
1534  const Vec128<uint8_t, N> bits) {
1535  const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N, 0>(), bits)).raw;
1536  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
1537 }
1538 
1540  const Vec128<uint16_t> bits) {
1541  const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
1542  return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
1543 }
1544 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1546  const Vec128<uint16_t, N> bits) {
1547  const int16x4_t neg_bits = Neg(BitCast(Simd<int16_t, N, 0>(), bits)).raw;
1548  return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
1549 }
1550 
1552  const Vec128<uint32_t> bits) {
1553  const int32x4_t neg_bits = Neg(BitCast(Full128<int32_t>(), bits)).raw;
1554  return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
1555 }
1556 template <size_t N, HWY_IF_LE64(uint32_t, N)>
1558  const Vec128<uint32_t, N> bits) {
1559  const int32x2_t neg_bits = Neg(BitCast(Simd<int32_t, N, 0>(), bits)).raw;
1560  return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
1561 }
1562 
1564  const Vec128<uint64_t> bits) {
1565  const int64x2_t neg_bits = Neg(BitCast(Full128<int64_t>(), bits)).raw;
1566  return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
1567 }
1569  const Vec64<uint64_t> bits) {
1570  const int64x1_t neg_bits = Neg(BitCast(Full64<int64_t>(), bits)).raw;
1571  return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
1572 }
1573 
1575  const Vec128<int8_t> bits) {
1576  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
1577 }
1578 template <size_t N, HWY_IF_LE64(int8_t, N)>
1580  const Vec128<int8_t, N> bits) {
1581  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
1582 }
1583 
1585  const Vec128<int16_t> bits) {
1586  return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
1587 }
1588 template <size_t N, HWY_IF_LE64(int16_t, N)>
1590  const Vec128<int16_t, N> bits) {
1591  return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
1592 }
1593 
1595  const Vec128<int32_t> bits) {
1596  return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
1597 }
1598 template <size_t N, HWY_IF_LE64(int32_t, N)>
1600  const Vec128<int32_t, N> bits) {
1601  return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
1602 }
1603 
1605  const Vec128<int64_t> bits) {
1606  return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
1607 }
1609  const Vec64<int64_t> bits) {
1610  return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
1611 }
1612 
1613 // ------------------------------ ShiftLeftSame (Shl)
1614 
1615 template <typename T, size_t N>
1617  return v << Set(Simd<T, N, 0>(), static_cast<T>(bits));
1618 }
1619 template <typename T, size_t N>
1621  return v >> Set(Simd<T, N, 0>(), static_cast<T>(bits));
1622 }
1623 
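// Illustrative usage sketch (added for documentation, not part of the
// original header): operator<< / operator>> take per-lane counts (NEON vshl
// shifts right when the count is negative, which is how Shr is implemented
// above), while ShiftLeftSame/ShiftRightSame broadcast one runtime count.
//
//   const Full128<uint32_t> d;
//   const auto v = Set(d, 0x10u);
//   v << Iota(d, 0);        // {0x10, 0x20, 0x40, 0x80}
//   ShiftRightSame(v, 4);   // all lanes 1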
1624 // ------------------------------ Integer multiplication
1625 
1626 // Unsigned
1628  const Vec128<uint16_t> b) {
1629  return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw));
1630 }
1632  const Vec128<uint32_t> b) {
1633  return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw));
1634 }
1635 
1636 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1638  const Vec128<uint16_t, N> b) {
1639  return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw));
1640 }
1641 template <size_t N, HWY_IF_LE64(uint32_t, N)>
1643  const Vec128<uint32_t, N> b) {
1644  return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw));
1645 }
1646 
1647 // Signed
1649  const Vec128<int16_t> b) {
1650  return Vec128<int16_t>(vmulq_s16(a.raw, b.raw));
1651 }
1653  const Vec128<int32_t> b) {
1654  return Vec128<int32_t>(vmulq_s32(a.raw, b.raw));
1655 }
1656 
1657 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1659  const Vec128<int16_t, N> b) {
1660  return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw));
1661 }
1662 template <size_t N, HWY_IF_LE64(int32_t, N)>
1664  const Vec128<int32_t, N> b) {
1665  return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw));
1666 }
1667 
1668 // Returns the upper 16 bits of a * b in each lane.
1670  const Vec128<int16_t> b) {
1671  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
1672 #if HWY_ARCH_ARM_A64
1673  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
1674 #else
1675  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
1676 #endif
1677  return Vec128<int16_t>(
1678  vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1679 }
1681  const Vec128<uint16_t> b) {
1682  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
1683 #if HWY_ARCH_ARM_A64
1684  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
1685 #else
1686  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
1687 #endif
1688  return Vec128<uint16_t>(
1689  vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1690 }
1691 
1692 template <size_t N, HWY_IF_LE64(int16_t, N)>
1694  const Vec128<int16_t, N> b) {
1695  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
1696  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
1697 }
1698 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1700  const Vec128<uint16_t, N> b) {
1701  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
1702  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
1703 }
1704 
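// Illustrative usage sketch (added for documentation, not part of the
// original header): MulHigh keeps the upper half of the 32-bit product of
// 16-bit lanes.
//
//   const Full128<int16_t> d;
//   MulHigh(Set(d, 0x4000), Set(d, 8));  // (0x4000 * 8) >> 16 = 2 per lane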
1706  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
1707 }
1708 template <size_t N, HWY_IF_LE64(int16_t, N)>
1710  Vec128<int16_t, N> b) {
1711  return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
1712 }
1713 
1714 // ------------------------------ Floating-point mul / div
1715 
1716 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
1717 
1718 // Approximate reciprocal
1720  return Vec128<float>(vrecpeq_f32(v.raw));
1721 }
1722 template <size_t N>
1724  return Vec128<float, N>(vrecpe_f32(v.raw));
1725 }
1726 
1727 #if HWY_ARCH_ARM_A64
1728 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
1729 #else
1730 // Not defined on armv7: approximate
1731 namespace detail {
1732 
1733 HWY_API Vec128<float> ReciprocalNewtonRaphsonStep(
1734  const Vec128<float> recip, const Vec128<float> divisor) {
1735  return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
1736 }
1737 template <size_t N>
1738 HWY_API Vec128<float, N> ReciprocalNewtonRaphsonStep(
1739  const Vec128<float, N> recip, Vec128<float, N> divisor) {
1740  return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
1741 }
1742 
1743 } // namespace detail
1744 
1745 template <size_t N>
1746 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
1747  const Vec128<float, N> b) {
1748  auto x = ApproximateReciprocal(b);
1749  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1750  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1751  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1752  return a * x;
1753 }
1754 #endif
1755 
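// Illustrative note (added for documentation, not part of the original
// header): on ARMv7 the division above refines the vrecpe estimate with
// Newton-Raphson iterations. vrecps computes (2 - x * b), so each
// "x *= ReciprocalNewtonRaphsonStep(x, b)" performs
//   x_{n+1} = x_n * (2 - x_n * b),
// roughly doubling the number of correct bits per step; three steps bring the
// low-precision vrecpe estimate close to full f32 precision before a * x.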
1756 // ------------------------------ Absolute value of difference.
1757 
1759  return Vec128<float>(vabdq_f32(a.raw, b.raw));
1760 }
1761 template <size_t N, HWY_IF_LE64(float, N)>
1763  const Vec128<float, N> b) {
1764  return Vec128<float, N>(vabd_f32(a.raw, b.raw));
1765 }
1766 
1767 // ------------------------------ Floating-point multiply-add variants
1768 
1769 // Returns add + mul * x
1770 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1771 template <size_t N, HWY_IF_LE64(float, N)>
1772 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
1773  const Vec128<float, N> x,
1774  const Vec128<float, N> add) {
1775  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
1776 }
1777 HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
1778  const Vec128<float> add) {
1779  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1780 }
1781 #else
1782 // Emulate FMA for floats.
1783 template <size_t N>
1785  const Vec128<float, N> x,
1786  const Vec128<float, N> add) {
1787  return mul * x + add;
1788 }
1789 #endif
1790 
1791 #if HWY_ARCH_ARM_A64
1792 HWY_API Vec64<double> MulAdd(const Vec64<double> mul, const Vec64<double> x,
1793  const Vec64<double> add) {
1794  return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
1795 }
1796 HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
1797  const Vec128<double> add) {
1798  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1799 }
1800 #endif
1801 
1802 // Returns add - mul * x
1803 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1804 template <size_t N, HWY_IF_LE64(float, N)>
1805 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
1806  const Vec128<float, N> x,
1807  const Vec128<float, N> add) {
1808  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
1809 }
1810 HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
1811  const Vec128<float> add) {
1812  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1813 }
1814 #else
1815 // Emulate FMA for floats.
1816 template <size_t N>
1818  const Vec128<float, N> x,
1819  const Vec128<float, N> add) {
1820  return add - mul * x;
1821 }
1822 #endif
1823 
1824 #if HWY_ARCH_ARM_A64
1825 HWY_API Vec64<double> NegMulAdd(const Vec64<double> mul, const Vec64<double> x,
1826  const Vec64<double> add) {
1827  return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
1828 }
1829 HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
1830  const Vec128<double> x,
1831  const Vec128<double> add) {
1832  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
1833 }
1834 #endif
1835 
1836 // Returns mul * x - sub
1837 template <size_t N>
1839  const Vec128<float, N> x,
1840  const Vec128<float, N> sub) {
1841  return MulAdd(mul, x, Neg(sub));
1842 }
1843 
1844 // Returns -mul * x - sub
1845 template <size_t N>
1847  const Vec128<float, N> x,
1848  const Vec128<float, N> sub) {
1849  return Neg(MulAdd(mul, x, sub));
1850 }
1851 
1852 #if HWY_ARCH_ARM_A64
1853 template <size_t N>
1854 HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
1855  const Vec128<double, N> x,
1856  const Vec128<double, N> sub) {
1857  return MulAdd(mul, x, Neg(sub));
1858 }
1859 template <size_t N>
1860 HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
1861  const Vec128<double, N> x,
1862  const Vec128<double, N> sub) {
1863  return Neg(MulAdd(mul, x, sub));
1864 }
1865 #endif
1866 
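// Illustrative note (added for documentation, not part of the original
// header): summary of the four variants defined above. MulAdd/NegMulAdd map
// to fused vfma/vfms where available; MulSub/NegMulSub are composed from them.
//
//   MulAdd(m, x, a)     ==  m * x + a
//   NegMulAdd(m, x, a)  ==  a - m * x
//   MulSub(m, x, s)     ==  m * x - s
//   NegMulSub(m, x, s)  == -m * x - s
//
// A typical use is a Horner step: acc = MulAdd(acc, x, coeff).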
1867 // ------------------------------ Floating-point square root (IfThenZeroElse)
1868 
1869 // Approximate reciprocal square root
1871  return Vec128<float>(vrsqrteq_f32(v.raw));
1872 }
1873 template <size_t N>
1875  return Vec128<float, N>(vrsqrte_f32(v.raw));
1876 }
1877 
1878 // Full precision square root
1879 #if HWY_ARCH_ARM_A64
1880 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
1881 #else
1882 namespace detail {
1883 
1885  const Vec128<float> recip) {
1886  return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
1887 }
1888 template <size_t N>
1890  Vec128<float, N> recip) {
1891  return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
1892 }
1893 
1894 } // namespace detail
1895 
1896 // Not defined on armv7: approximate
1897 template <size_t N>
1899  auto recip = ApproximateReciprocalSqrt(v);
1900 
1901  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1902  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1903  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1904 
1905  const auto root = v * recip;
1906  return IfThenZeroElse(v == Zero(Simd<float, N, 0>()), root);
1907 }
1908 #endif
1909 
1910 // ================================================== LOGICAL
1911 
1912 // ------------------------------ Not
1913 
1914 // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
1915 template <typename T>
1917  const Full128<T> d;
1918  const Repartition<uint8_t, decltype(d)> d8;
1919  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
1920 }
1921 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1923  const Simd<T, N, 0> d;
1924  const Repartition<uint8_t, decltype(d)> d8;
1925  using V8 = decltype(Zero(d8));
1926  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
1927 }
1928 
1929 // ------------------------------ And
1930 HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)
1931 
1932 // Uses the u32/64 defined above.
1933 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1934 HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
1935  const DFromV<decltype(a)> d;
1936  const RebindToUnsigned<decltype(d)> du;
1937  return BitCast(d, BitCast(du, a) & BitCast(du, b));
1938 }
1939 
1940 // ------------------------------ AndNot
1941 
1942 namespace detail {
1943 // reversed_andnot returns a & ~b.
1944 HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
1945 } // namespace detail
1946 
1947 // Returns ~not_mask & mask.
1948 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1950  const Vec128<T, N> mask) {
1951  return detail::reversed_andnot(mask, not_mask);
1952 }
1953 
1954 // Uses the u32/64 defined above.
1955 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1956 HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
1957  const Vec128<T, N> mask) {
1958  const DFromV<decltype(mask)> d;
1959  const RebindToUnsigned<decltype(d)> du;
1960  VFromD<decltype(du)> ret =
1961  detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
1962  return BitCast(d, ret);
1963 }
1964 
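// Illustrative usage sketch (added for documentation, not part of the
// original header): note the argument order. AndNot(not_mask, mask) clears
// the bits of `mask` that are set in `not_mask`, i.e. computes
// ~not_mask & mask (vbic with swapped operands).
//
//   const Full128<uint32_t> d;
//   AndNot(Set(d, 0xFF00FF00u), Set(d, 0xFFFFFFFFu));  // all lanes 0x00FF00FF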
1965 // ------------------------------ Or
1966 
1967 HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)
1968 
1969 // Uses the u32/64 defined above.
1970 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1971 HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
1972  const DFromV<decltype(a)> d;
1973  const RebindToUnsigned<decltype(d)> du;
1974  return BitCast(d, BitCast(du, a) | BitCast(du, b));
1975 }
1976 
1977 // ------------------------------ Xor
1978 
1979 HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)
1980 
1981 // Uses the u32/64 defined above.
1982 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1983 HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
1984  const DFromV<decltype(a)> d;
1985  const RebindToUnsigned<decltype(d)> du;
1986  return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
1987 }
1988 
1989 // ------------------------------ Or3
1990 
1991 template <typename T, size_t N>
1992 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
1993  return Or(o1, Or(o2, o3));
1994 }
1995 
1996 // ------------------------------ OrAnd
1997 
1998 template <typename T, size_t N>
1999 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
2000  return Or(o, And(a1, a2));
2001 }
2002 
2003 // ------------------------------ IfVecThenElse
2004 
2005 template <typename T, size_t N>
2006 HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
2007  Vec128<T, N> no) {
2008  return IfThenElse(MaskFromVec(mask), yes, no);
2009 }
2010 
2011 // ------------------------------ Operator overloads (internal-only if float)
2012 
2013 template <typename T, size_t N>
2014 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
2015  return And(a, b);
2016 }
2017 
2018 template <typename T, size_t N>
2019 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
2020  return Or(a, b);
2021 }
2022 
2023 template <typename T, size_t N>
2024 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
2025  return Xor(a, b);
2026 }
2027 
2028 // ------------------------------ PopulationCount
2029 
2030 #ifdef HWY_NATIVE_POPCNT
2031 #undef HWY_NATIVE_POPCNT
2032 #else
2033 #define HWY_NATIVE_POPCNT
2034 #endif
2035 
2036 namespace detail {
2037 
2038 template <typename T>
2039 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) {
2040  const Full128<uint8_t> d8;
2041  return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
2042 }
2043 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2044 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
2045  Vec128<T, N> v) {
2046  const Simd<uint8_t, N, 0> d8;
2047  return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
2048 }
2049 
2050 // ARM lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
2051 template <typename T>
2052 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) {
2053  const Full128<uint8_t> d8;
2054  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2055  return Vec128<T>(vpaddlq_u8(bytes));
2056 }
2057 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2058 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
2059  Vec128<T, N> v) {
2060  const Repartition<uint8_t, Simd<T, N, 0>> d8;
2061  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2062  return Vec128<T, N>(vpaddl_u8(bytes));
2063 }
2064 
2065 template <typename T>
2066 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) {
2067  const Full128<uint8_t> d8;
2068  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2069  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2070 }
2071 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2072 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
2073  Vec128<T, N> v) {
2074  const Repartition<uint8_t, Simd<T, N, 0>> d8;
2075  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2076  return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
2077 }
2078 
2079 template <typename T>
2080 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) {
2081  const Full128<uint8_t> d8;
2082  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2083  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2084 }
2085 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2086 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
2087  Vec128<T, N> v) {
2088  const Repartition<uint8_t, Simd<T, N, 0>> d8;
2089  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2090  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2091 }
2092 
2093 } // namespace detail
2094 
2095 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
2096 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
2097  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
2098 }
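// Illustrative scalar model (not part of this header) of the strategy above:
// count bits within each byte (what vcnt does), then pairwise-widen the byte
// counts (the vpaddl chain) until the sums match the requested lane size.
#include <cstdint>

static uint64_t PopCount64ViaByteSums(uint64_t x) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    uint8_t byte = static_cast<uint8_t>(x >> (8 * i));
    uint8_t bits = 0;  // per-byte popcount, as vcnt produces for each byte lane
    for (; byte != 0; byte = static_cast<uint8_t>(byte & (byte - 1))) ++bits;
    sum += bits;  // the widening pairwise adds accumulate these into the lane
  }
  return sum;
}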
2099 
2100 // ================================================== SIGN
2101 
2102 // ------------------------------ Abs
2103 
2104 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
2105 HWY_API Vec128<int8_t> Abs(const Vec128<int8_t> v) {
2106  return Vec128<int8_t>(vabsq_s8(v.raw));
2107 }
2108 HWY_API Vec128<int16_t> Abs(const Vec128<int16_t> v) {
2109  return Vec128<int16_t>(vabsq_s16(v.raw));
2110 }
2111 HWY_API Vec128<int32_t> Abs(const Vec128<int32_t> v) {
2112  return Vec128<int32_t>(vabsq_s32(v.raw));
2113 }
2114 // i64 is implemented after BroadcastSignBit.
2115 HWY_API Vec128<float> Abs(const Vec128<float> v) {
2116  return Vec128<float>(vabsq_f32(v.raw));
2117 }
2118 
2119 template <size_t N, HWY_IF_LE64(int8_t, N)>
2120 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
2121  return Vec128<int8_t, N>(vabs_s8(v.raw));
2122 }
2123 template <size_t N, HWY_IF_LE64(int16_t, N)>
2124 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
2125  return Vec128<int16_t, N>(vabs_s16(v.raw));
2126 }
2127 template <size_t N, HWY_IF_LE64(int32_t, N)>
2128 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
2129  return Vec128<int32_t, N>(vabs_s32(v.raw));
2130 }
2131 template <size_t N, HWY_IF_LE64(float, N)>
2132 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
2133  return Vec128<float, N>(vabs_f32(v.raw));
2134 }
2135 
2136 #if HWY_ARCH_ARM_A64
2137 HWY_API Vec128<double> Abs(const Vec128<double> v) {
2138  return Vec128<double>(vabsq_f64(v.raw));
2139 }
2140 
2141 HWY_API Vec64<double> Abs(const Vec64<double> v) {
2142  return Vec64<double>(vabs_f64(v.raw));
2143 }
2144 #endif
2145 
2146 // ------------------------------ CopySign
2147 
2148 template <typename T, size_t N>
2149 HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
2150  const Vec128<T, N> sign) {
2151  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2152  const auto msb = SignBit(Simd<T, N, 0>());
2153  return Or(AndNot(msb, magn), And(msb, sign));
2154 }
2155 
2156 template <typename T, size_t N>
2157 HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
2158  const Vec128<T, N> sign) {
2159  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2160  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
2161 }
2162 
2163 // ------------------------------ BroadcastSignBit
2164 
2165 template <typename T, size_t N, HWY_IF_SIGNED(T)>
2166 HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
2167  return ShiftRight<sizeof(T) * 8 - 1>(v);
2168 }
2169 
2170 // ================================================== MASK
2171 
2172 // ------------------------------ To/from vector
2173 
2174 // Mask and Vec have the same representation (true = FF..FF).
2175 template <typename T, size_t N>
2176 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
2177  const Simd<MakeUnsigned<T>, N, 0> du;
2178  return Mask128<T, N>(BitCast(du, v).raw);
2179 }
2180 
2181 template <typename T, size_t N>
2182 HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> d, const Mask128<T, N> v) {
2183  return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
2184 }
2185 
2186 // ------------------------------ RebindMask
2187 
2188 template <typename TFrom, typename TTo, size_t N>
2189 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> dto, Mask128<TFrom, N> m) {
2190  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
2191  return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N, 0>(), m)));
2192 }
2193 
2194 // ------------------------------ IfThenElse(mask, yes, no) = mask ? yes : no.
2195 
2196 #define HWY_NEON_BUILD_TPL_HWY_IF
2197 #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2198 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2199  const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2200  const Vec128<type##_t, size> no
2201 #define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
2202 
2203 HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
2204 
2205 #undef HWY_NEON_BUILD_TPL_HWY_IF
2206 #undef HWY_NEON_BUILD_RET_HWY_IF
2207 #undef HWY_NEON_BUILD_PARAM_HWY_IF
2208 #undef HWY_NEON_BUILD_ARG_HWY_IF
2209 
2210 // mask ? yes : 0
2211 template <typename T, size_t N>
2212 HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
2213  const Vec128<T, N> yes) {
2214  return yes & VecFromMask(Simd<T, N, 0>(), mask);
2215 }
2216 
2217 // mask ? 0 : no
2218 template <typename T, size_t N>
2219 HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
2220  const Vec128<T, N> no) {
2221  return AndNot(VecFromMask(Simd<T, N, 0>(), mask), no);
2222 }
2223 
2224 template <typename T, size_t N>
2225 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2226  Vec128<T, N> no) {
2227  static_assert(IsSigned<T>(), "Only works for signed/float");
2228  const Simd<T, N, 0> d;
2229  const RebindToSigned<decltype(d)> di;
2230 
2231  Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2232  return IfThenElse(m, yes, no);
2233 }
2234 
2235 template <typename T, size_t N>
2236 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2237  const auto zero = Zero(Simd<T, N, 0>());
2238  return Max(zero, v);
2239 }
2240 
2241 // ------------------------------ Mask logical
2242 
2243 template <typename T, size_t N>
2245  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
2246 }
2247 
2248 template <typename T, size_t N>
2250  const Simd<T, N, 0> d;
2251  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
2252 }
2253 
2254 template <typename T, size_t N>
2256  const Simd<T, N, 0> d;
2257  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
2258 }
2259 
2260 template <typename T, size_t N>
2262  const Simd<T, N, 0> d;
2263  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
2264 }
2265 
2266 template <typename T, size_t N>
2268  const Simd<T, N, 0> d;
2269  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
2270 }
2271 
2272 // ================================================== COMPARE
2273 
2274 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
2275 
2276 // ------------------------------ Shuffle2301 (for i64 compares)
2277 
2278 // Swap 32-bit halves in 64-bits
2280  return Vec64<uint32_t>(vrev64_u32(v.raw));
2281 }
2283  return Vec64<int32_t>(vrev64_s32(v.raw));
2284 }
2286  return Vec64<float>(vrev64_f32(v.raw));
2287 }
2289  return Vec128<uint32_t>(vrev64q_u32(v.raw));
2290 }
2292  return Vec128<int32_t>(vrev64q_s32(v.raw));
2293 }
2295  return Vec128<float>(vrev64q_f32(v.raw));
2296 }
2297 
2298 #define HWY_NEON_BUILD_TPL_HWY_COMPARE
2299 #define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
2300 #define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
2301  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
2302 #define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
2303 
2304 // ------------------------------ Equality
2305 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
2306 #if HWY_ARCH_ARM_A64
2307 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
2308 #else
2309 // No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
2310 HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
2311 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
2312 #endif
2313 
2314 // ------------------------------ Strict inequality (signed, float)
2315 #if HWY_ARCH_ARM_A64
2316 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
2317 #else
2318 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
2319 HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
2320 #endif
2321 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
2322 
2323 // ------------------------------ Weak inequality (float)
2324 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
2325 
2326 #undef HWY_NEON_BUILD_TPL_HWY_COMPARE
2327 #undef HWY_NEON_BUILD_RET_HWY_COMPARE
2328 #undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
2329 #undef HWY_NEON_BUILD_ARG_HWY_COMPARE
2330 
2331 // ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
2332 
2333 #if HWY_ARCH_ARM_V7
2334 
2335 template <size_t N>
2336 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
2337  const Vec128<int64_t, N> b) {
2338  const Simd<int32_t, N * 2, 0> d32;
2339  const Simd<int64_t, N, 0> d64;
2340  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2341  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2342  return MaskFromVec(BitCast(d64, cmp64));
2343 }
2344 
2345 template <size_t N>
2346 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
2347  const Vec128<uint64_t, N> b) {
2348  const Simd<uint32_t, N * 2, 0> d32;
2349  const Simd<uint64_t, N, 0> d64;
2350  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2351  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2352  return MaskFromVec(BitCast(d64, cmp64));
2353 }
2354 
2355 HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
2356  const Vec128<int64_t> b) {
2357  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
2358  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
2359 }
2360 HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
2361  const Vec64<int64_t> b) {
2362  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
2363  return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
2364 }
2365 
2366 template <size_t N>
2367 HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
2368  const Vec128<uint64_t, N> b) {
2369  const DFromV<decltype(a)> du;
2370  const RebindToSigned<decltype(du)> di;
2371  const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
2372  return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
2373 }
2374 
2375 #endif
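// Scalar check (illustrative, not part of this header) of the armv7 u64 "<"
// trick above: the sign bit of (~a & b) | (~(a ^ b) & (a - b)) equals a < b
// for unsigned a, b. Either the top bits already decide the comparison, or
// they are equal and the borrow out of the lower bits shows up in the MSB of
// a - b.
#include <cassert>
#include <cstdint>

static bool U64LessViaMsb(uint64_t a, uint64_t b) {
  const uint64_t msb = (~a & b) | (~(a ^ b) & (a - b));
  return (msb >> 63) != 0;
}

int main() {
  const uint64_t cases[] = {0u, 1u, 2u, 42u, 0x8000000000000000ull,
                            0xFFFFFFFFFFFFFFFFull};
  for (uint64_t a : cases) {
    for (uint64_t b : cases) {
      assert(U64LessViaMsb(a, b) == (a < b));
    }
  }
  return 0;
}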
2376 
2377 // ------------------------------ operator!= (operator==)
2378 
2379 // Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
2380 #pragma push_macro("HWY_NEON_DEF_FUNCTION")
2381 #undef HWY_NEON_DEF_FUNCTION
2382 // This cannot have _any_ template argument (in x86_128 we can at least have N
2383 // as an argument), otherwise it is not more specialized than rewritten
2384 // operator== in C++20, leading to compile errors.
2385 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2386  HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
2387  Vec128<type##_t, size> b) { \
2388  return Not(a == b); \
2389  }
2390 
2391 HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)
2392 
2393 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2394 
2395 // ------------------------------ Reversed comparisons
2396 
2397 template <typename T, size_t N>
2398 HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
2399  return operator<(b, a);
2400 }
2401 template <typename T, size_t N>
2402 HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
2403  return operator<=(b, a);
2404 }
2405 
2406 // ------------------------------ FirstN (Iota, Lt)
2407 
2408 template <typename T, size_t N>
2409 HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
2410  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
2411  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
2412 }
2413 
2414 // ------------------------------ TestBit (Eq)
2415 
2416 #define HWY_NEON_BUILD_TPL_HWY_TESTBIT
2417 #define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
2418 #define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
2419  Vec128<type##_t, size> v, Vec128<type##_t, size> bit
2420 #define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
2421 
2422 #if HWY_ARCH_ARM_A64
2423 HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
2424 #else
2425 // No 64-bit versions on armv7
2426 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
2427 HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
2428 
2429 template <size_t N>
2430 HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
2431  Vec128<uint64_t, N> bit) {
2432  return (v & bit) == bit;
2433 }
2434 template <size_t N>
2435 HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
2436  Vec128<int64_t, N> bit) {
2437  return (v & bit) == bit;
2438 }
2439 
2440 #endif
2441 #undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
2442 #undef HWY_NEON_BUILD_RET_HWY_TESTBIT
2443 #undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
2444 #undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
2445 
2446 // ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
2447 HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
2448 #if HWY_ARCH_ARM_A64
2449  return Vec128<int64_t>(vabsq_s64(v.raw));
2450 #else
2451  const auto zero = Zero(Full128<int64_t>());
2452  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2453 #endif
2454 }
2455 HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
2456 #if HWY_ARCH_ARM_A64
2457  return Vec64<int64_t>(vabs_s64(v.raw));
2458 #else
2459  const auto zero = Zero(Full64<int64_t>());
2460  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2461 #endif
2462 }
2463 
2464 // ------------------------------ Min (IfThenElse, BroadcastSignBit)
2465 
2466 // Unsigned
2467 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
2468 
2469 template <size_t N>
2470 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
2471  const Vec128<uint64_t, N> b) {
2472 #if HWY_ARCH_ARM_A64
2473  return IfThenElse(b < a, b, a);
2474 #else
2475  const DFromV<decltype(a)> du;
2476  const RebindToSigned<decltype(du)> di;
2477  return BitCast(du, BitCast(di, a) - BitCast(di, detail::SaturatedSub(a, b)));
2478 #endif
2479 }
2480 
2481 // Signed
2482 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)
2483 
2484 template <size_t N>
2485 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
2486  const Vec128<int64_t, N> b) {
2487 #if HWY_ARCH_ARM_A64
2488  return IfThenElse(b < a, b, a);
2489 #else
2490  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
2491  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
2492 #endif
2493 }
2494 
2495 // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
2496 #if HWY_ARCH_ARM_A64
2497 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
2498 #else
2499 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
2500 #endif
2501 
2502 // ------------------------------ Max (IfThenElse, BroadcastSignBit)
2503 
2504 // Unsigned (no u64)
2505 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)
2506 
2507 template <size_t N>
2508 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
2509  const Vec128<uint64_t, N> b) {
2510 #if HWY_ARCH_ARM_A64
2511  return IfThenElse(b < a, a, b);
2512 #else
2513  const DFromV<decltype(a)> du;
2514  const RebindToSigned<decltype(du)> di;
2515  return BitCast(du, BitCast(di, b) + BitCast(di, detail::SaturatedSub(a, b)));
2516 #endif
2517 }
2518 
2519 // Signed (no i64)
2520 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)
2521 
2522 template <size_t N>
2523 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
2524  const Vec128<int64_t, N> b) {
2525 #if HWY_ARCH_ARM_A64
2526  return IfThenElse(b < a, a, b);
2527 #else
2528  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
2529  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
2530 #endif
2531 }
2532 
2533 // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
2534 #if HWY_ARCH_ARM_A64
2535 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
2536 #else
2537 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
2538 #endif
2539 
2540 // ================================================== MEMORY
2541 
2542 // ------------------------------ Load 128
2543 
2545  const uint8_t* HWY_RESTRICT unaligned) {
2546  return Vec128<uint8_t>(vld1q_u8(unaligned));
2547 }
2549  const uint16_t* HWY_RESTRICT unaligned) {
2550  return Vec128<uint16_t>(vld1q_u16(unaligned));
2551 }
2553  const uint32_t* HWY_RESTRICT unaligned) {
2554  return Vec128<uint32_t>(vld1q_u32(unaligned));
2555 }
2557  const uint64_t* HWY_RESTRICT unaligned) {
2558  return Vec128<uint64_t>(vld1q_u64(unaligned));
2559 }
2561  const int8_t* HWY_RESTRICT unaligned) {
2562  return Vec128<int8_t>(vld1q_s8(unaligned));
2563 }
2565  const int16_t* HWY_RESTRICT unaligned) {
2566  return Vec128<int16_t>(vld1q_s16(unaligned));
2567 }
2569  const int32_t* HWY_RESTRICT unaligned) {
2570  return Vec128<int32_t>(vld1q_s32(unaligned));
2571 }
2573  const int64_t* HWY_RESTRICT unaligned) {
2574  return Vec128<int64_t>(vld1q_s64(unaligned));
2575 }
2577  const float* HWY_RESTRICT unaligned) {
2578  return Vec128<float>(vld1q_f32(unaligned));
2579 }
2580 #if HWY_ARCH_ARM_A64
2581 HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
2582  const double* HWY_RESTRICT unaligned) {
2583  return Vec128<double>(vld1q_f64(unaligned));
2584 }
2585 #endif
2586 
2587 // ------------------------------ Load 64
2588 
2590  const uint8_t* HWY_RESTRICT p) {
2591  return Vec64<uint8_t>(vld1_u8(p));
2592 }
2594  const uint16_t* HWY_RESTRICT p) {
2595  return Vec64<uint16_t>(vld1_u16(p));
2596 }
2598  const uint32_t* HWY_RESTRICT p) {
2599  return Vec64<uint32_t>(vld1_u32(p));
2600 }
2602  const uint64_t* HWY_RESTRICT p) {
2603  return Vec64<uint64_t>(vld1_u64(p));
2604 }
2606  const int8_t* HWY_RESTRICT p) {
2607  return Vec64<int8_t>(vld1_s8(p));
2608 }
2610  const int16_t* HWY_RESTRICT p) {
2611  return Vec64<int16_t>(vld1_s16(p));
2612 }
2614  const int32_t* HWY_RESTRICT p) {
2615  return Vec64<int32_t>(vld1_s32(p));
2616 }
2618  const int64_t* HWY_RESTRICT p) {
2619  return Vec64<int64_t>(vld1_s64(p));
2620 }
2622  const float* HWY_RESTRICT p) {
2623  return Vec64<float>(vld1_f32(p));
2624 }
2625 #if HWY_ARCH_ARM_A64
2626 HWY_API Vec64<double> LoadU(Full64<double> /* tag */,
2627  const double* HWY_RESTRICT p) {
2628  return Vec64<double>(vld1_f64(p));
2629 }
2630 #endif
2631 // ------------------------------ Load 32
2632 
2633 // Actual 32-bit broadcast load - used to implement the other lane types
2634 // because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
2636  const uint32_t* HWY_RESTRICT p) {
2637  return Vec32<uint32_t>(vld1_dup_u32(p));
2638 }
2640  const int32_t* HWY_RESTRICT p) {
2641  return Vec32<int32_t>(vld1_dup_s32(p));
2642 }
2644  return Vec32<float>(vld1_dup_f32(p));
2645 }
2646 
2647 template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2649  const Repartition<uint32_t, decltype(d)> d32;
2650  uint32_t buf;
2651  CopyBytes<4>(p, &buf);
2652  return BitCast(d, LoadU(d32, &buf));
2653 }
2654 
2655 // ------------------------------ Load 16
2656 
2657 // Actual 16-bit broadcast load - used to implement the other lane types
2658 // because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
2660  const uint16_t* HWY_RESTRICT p) {
2661  return Vec128<uint16_t, 1>(vld1_dup_u16(p));
2662 }
2664  const int16_t* HWY_RESTRICT p) {
2665  return Vec128<int16_t, 1>(vld1_dup_s16(p));
2666 }
2667 
2668 template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2670  const Repartition<uint16_t, decltype(d)> d16;
2671  uint16_t buf;
2672  CopyBytes<2>(p, &buf);
2673  return BitCast(d, LoadU(d16, &buf));
2674 }
2675 
2676 // ------------------------------ Load 8
2677 
2679  const uint8_t* HWY_RESTRICT p) {
2680  return Vec128<uint8_t, 1>(vld1_dup_u8(p));
2681 }
2682 
2684  const int8_t* HWY_RESTRICT p) {
2685  return Vec128<int8_t, 1>(vld1_dup_s8(p));
2686 }
2687 
2688 // [b]float16_t use the same Raw as uint16_t, so forward to that.
2689 template <size_t N>
2691  const float16_t* HWY_RESTRICT p) {
2692  const RebindToUnsigned<decltype(d)> du16;
2693  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2694  return Vec128<float16_t, N>(LoadU(du16, pu16).raw);
2695 }
2696 template <size_t N>
2698  const bfloat16_t* HWY_RESTRICT p) {
2699  const RebindToUnsigned<decltype(d)> du16;
2700  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2701  return Vec128<bfloat16_t, N>(LoadU(du16, pu16).raw);
2702 }
2703 
2704 // On ARM, Load is the same as LoadU.
2705 template <typename T, size_t N>
2707  return LoadU(d, p);
2708 }
2709 
2710 template <typename T, size_t N>
2712  const T* HWY_RESTRICT aligned) {
2713  return IfThenElseZero(m, Load(d, aligned));
2714 }
2715 
2716 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
2717 template <typename T, size_t N, HWY_IF_LE128(T, N)>
2719  const T* const HWY_RESTRICT p) {
2720  return LoadU(d, p);
2721 }
2722 
2723 // ------------------------------ Store 128
2724 
2726  uint8_t* HWY_RESTRICT unaligned) {
2727  vst1q_u8(unaligned, v.raw);
2728 }
2730  uint16_t* HWY_RESTRICT unaligned) {
2731  vst1q_u16(unaligned, v.raw);
2732 }
2734  uint32_t* HWY_RESTRICT unaligned) {
2735  vst1q_u32(unaligned, v.raw);
2736 }
2738  uint64_t* HWY_RESTRICT unaligned) {
2739  vst1q_u64(unaligned, v.raw);
2740 }
2742  int8_t* HWY_RESTRICT unaligned) {
2743  vst1q_s8(unaligned, v.raw);
2744 }
2746  int16_t* HWY_RESTRICT unaligned) {
2747  vst1q_s16(unaligned, v.raw);
2748 }
2750  int32_t* HWY_RESTRICT unaligned) {
2751  vst1q_s32(unaligned, v.raw);
2752 }
2754  int64_t* HWY_RESTRICT unaligned) {
2755  vst1q_s64(unaligned, v.raw);
2756 }
2758  float* HWY_RESTRICT unaligned) {
2759  vst1q_f32(unaligned, v.raw);
2760 }
2761 #if HWY_ARCH_ARM_A64
2762 HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2763  double* HWY_RESTRICT unaligned) {
2764  vst1q_f64(unaligned, v.raw);
2765 }
2766 #endif
2767 
2768 // ------------------------------ Store 64
2769 
2771  uint8_t* HWY_RESTRICT p) {
2772  vst1_u8(p, v.raw);
2773 }
2775  uint16_t* HWY_RESTRICT p) {
2776  vst1_u16(p, v.raw);
2777 }
2779  uint32_t* HWY_RESTRICT p) {
2780  vst1_u32(p, v.raw);
2781 }
2783  uint64_t* HWY_RESTRICT p) {
2784  vst1_u64(p, v.raw);
2785 }
2787  int8_t* HWY_RESTRICT p) {
2788  vst1_s8(p, v.raw);
2789 }
2791  int16_t* HWY_RESTRICT p) {
2792  vst1_s16(p, v.raw);
2793 }
2795  int32_t* HWY_RESTRICT p) {
2796  vst1_s32(p, v.raw);
2797 }
2799  int64_t* HWY_RESTRICT p) {
2800  vst1_s64(p, v.raw);
2801 }
2803  float* HWY_RESTRICT p) {
2804  vst1_f32(p, v.raw);
2805 }
2806 #if HWY_ARCH_ARM_A64
2807 HWY_API void StoreU(const Vec64<double> v, Full64<double> /* tag */,
2808  double* HWY_RESTRICT p) {
2809  vst1_f64(p, v.raw);
2810 }
2811 #endif
2812 
2813 // ------------------------------ Store 32
2814 
2816  uint32_t* HWY_RESTRICT p) {
2817  vst1_lane_u32(p, v.raw, 0);
2818 }
2820  int32_t* HWY_RESTRICT p) {
2821  vst1_lane_s32(p, v.raw, 0);
2822 }
2824  float* HWY_RESTRICT p) {
2825  vst1_lane_f32(p, v.raw, 0);
2826 }
2827 
2828 template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2830  const Repartition<uint32_t, decltype(d)> d32;
2831  const uint32_t buf = GetLane(BitCast(d32, v));
2832  CopyBytes<4>(&buf, p);
2833 }
2834 
2835 // ------------------------------ Store 16
2836 
2838  uint16_t* HWY_RESTRICT p) {
2839  vst1_lane_u16(p, v.raw, 0);
2840 }
2842  int16_t* HWY_RESTRICT p) {
2843  vst1_lane_s16(p, v.raw, 0);
2844 }
2845 
2846 template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2848  const Repartition<uint16_t, decltype(d)> d16;
2849  const uint16_t buf = GetLane(BitCast(d16, v));
2850  CopyBytes<2>(&buf, p);
2851 }
2852 
2853 // ------------------------------ Store 8
2854 
2856  uint8_t* HWY_RESTRICT p) {
2857  vst1_lane_u8(p, v.raw, 0);
2858 }
2860  int8_t* HWY_RESTRICT p) {
2861  vst1_lane_s8(p, v.raw, 0);
2862 }
2863 
2864 // [b]float16_t use the same Raw as uint16_t, so forward to that.
2865 template <size_t N>
2867  float16_t* HWY_RESTRICT p) {
2868  const RebindToUnsigned<decltype(d)> du16;
2869  const auto pu16 = reinterpret_cast<uint16_t*>(p);
2870  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2871 }
2872 template <size_t N>
2874  bfloat16_t* HWY_RESTRICT p) {
2875  const RebindToUnsigned<decltype(d)> du16;
2876  const auto pu16 = reinterpret_cast<uint16_t*>(p);
2877  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2878 }
2879 
2880 // On ARM, Store is the same as StoreU.
2881 template <typename T, size_t N>
2883  StoreU(v, d, aligned);
2884 }
2885 
2886 template <typename T, size_t N>
2888  T* HWY_RESTRICT p) {
2889  // Treat as unsigned so that we correctly support float16.
2890  const RebindToUnsigned<decltype(d)> du;
2891  const auto blended =
2892  IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
2893  StoreU(BitCast(d, blended), d, p);
2894 }
2895 
2896 // ------------------------------ Non-temporal stores
2897 
2898 // Same as aligned stores on non-x86.
2899 
2900 template <typename T, size_t N>
2902  T* HWY_RESTRICT aligned) {
2903  Store(v, d, aligned);
2904 }
2905 
2906 // ================================================== CONVERT
2907 
2908 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2909 
2910 // Unsigned: zero-extend to full vector.
2912  const Vec64<uint8_t> v) {
2913  return Vec128<uint16_t>(vmovl_u8(v.raw));
2914 }
2916  const Vec32<uint8_t> v) {
2917  uint16x8_t a = vmovl_u8(v.raw);
2918  return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
2919 }
2921  const Vec64<uint16_t> v) {
2922  return Vec128<uint32_t>(vmovl_u16(v.raw));
2923 }
2925  const Vec64<uint32_t> v) {
2926  return Vec128<uint64_t>(vmovl_u32(v.raw));
2927 }
2929  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
2930 }
2932  uint16x8_t a = vmovl_u8(v.raw);
2933  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
2934 }
2936  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
2937 }
2938 
2939 // Unsigned: zero-extend to half vector.
2940 template <size_t N, HWY_IF_LE64(uint16_t, N)>
2942  const Vec128<uint8_t, N> v) {
2943  return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw)));
2944 }
2945 template <size_t N, HWY_IF_LE64(uint32_t, N)>
2947  const Vec128<uint8_t, N> v) {
2948  uint16x8_t a = vmovl_u8(v.raw);
2949  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a))));
2950 }
2951 template <size_t N>
2953  const Vec128<uint16_t, N> v) {
2954  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw)));
2955 }
2956 template <size_t N, HWY_IF_LE64(uint64_t, N)>
2958  const Vec128<uint32_t, N> v) {
2959  return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
2960 }
2961 template <size_t N, HWY_IF_LE64(int16_t, N)>
2963  const Vec128<uint8_t, N> v) {
2964  return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
2965 }
2966 template <size_t N, HWY_IF_LE64(int32_t, N)>
2968  const Vec128<uint8_t, N> v) {
2969  uint16x8_t a = vmovl_u8(v.raw);
2970  uint32x4_t b = vmovl_u16(vget_low_u16(a));
2971  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b)));
2972 }
2973 template <size_t N, HWY_IF_LE64(int32_t, N)>
2975  const Vec128<uint16_t, N> v) {
2976  uint32x4_t a = vmovl_u16(v.raw);
2977  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a)));
2978 }
2979 
2980 // Signed: replicate sign bit to full vector.
2982  const Vec64<int8_t> v) {
2983  return Vec128<int16_t>(vmovl_s8(v.raw));
2984 }
2986  const Vec32<int8_t> v) {
2987  int16x8_t a = vmovl_s8(v.raw);
2988  return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
2989 }
2991  const Vec64<int16_t> v) {
2992  return Vec128<int32_t>(vmovl_s16(v.raw));
2993 }
2995  const Vec64<int32_t> v) {
2996  return Vec128<int64_t>(vmovl_s32(v.raw));
2997 }
2998 
2999 // Signed: replicate sign bit to half vector.
3000 template <size_t N>
3002  const Vec128<int8_t, N> v) {
3003  return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw)));
3004 }
3005 template <size_t N>
3007  const Vec128<int8_t, N> v) {
3008  int16x8_t a = vmovl_s8(v.raw);
3009  int32x4_t b = vmovl_s16(vget_low_s16(a));
3010  return Vec128<int32_t, N>(vget_low_s32(b));
3011 }
3012 template <size_t N>
3014  const Vec128<int16_t, N> v) {
3015  return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw)));
3016 }
3017 template <size_t N>
3019  const Vec128<int32_t, N> v) {
3020  return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw)));
3021 }
3022 
3023 #if __ARM_FP & 2
3024 
3025 HWY_API Vec128<float> PromoteTo(Full128<float> /* tag */,
3026  const Vec128<float16_t, 4> v) {
3027  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3028  return Vec128<float>(f32);
3029 }
3030 template <size_t N>
3031 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
3032  const Vec128<float16_t, N> v) {
3033  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3034  return Vec128<float, N>(vget_low_f32(f32));
3035 }
3036 
3037 #else
3038 
3039 template <size_t N>
3041  const Vec128<float16_t, N> v) {
3042  const RebindToSigned<decltype(df32)> di32;
3043  const RebindToUnsigned<decltype(df32)> du32;
3044  // Expand to u32 so we can shift.
3045  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
3046  const auto sign = ShiftRight<15>(bits16);
3047  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
3048  const auto mantissa = bits16 & Set(du32, 0x3FF);
3049  const auto subnormal =
3050  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
3051  Set(df32, 1.0f / 16384 / 1024));
3052 
3053  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
3054  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
3055  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3056  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
3057  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3058 }
3059 
3060 #endif
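// Scalar reference (illustrative only) of the bit manipulation used when
// native f16 conversions are unavailable: split sign/exponent/mantissa,
// rebias the exponent from 15 to 127, and scale subnormal mantissas by 2^-24.
// Like the vector code above, f16 Inf/NaN inputs are not treated specially.
#include <cstdint>
#include <cstring>

static float F16BitsToF32(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  uint32_t bits32;
  if (biased_exp == 0) {  // zero or subnormal: value is mantissa * 2^-24
    const float subnormal =
        static_cast<float>(mantissa) * (1.0f / 16384 / 1024);
    std::memcpy(&bits32, &subnormal, sizeof(bits32));
  } else {
    bits32 = ((biased_exp + (127 - 15)) << 23) | (mantissa << (23 - 10));
  }
  bits32 |= sign << 31;
  float result;
  std::memcpy(&result, &bits32, sizeof(result));
  return result;
}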
3061 
3062 #if HWY_ARCH_ARM_A64
3063 
3064 HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
3065  const Vec64<float> v) {
3066  return Vec128<double>(vcvt_f64_f32(v.raw));
3067 }
3068 
3069 HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
3070  const Vec32<float> v) {
3071  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
3072 }
3073 
3074 HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
3075  const Vec64<int32_t> v) {
3076  const int64x2_t i64 = vmovl_s32(v.raw);
3077  return Vec128<double>(vcvtq_f64_s64(i64));
3078 }
3079 
3080 HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
3081  const Vec32<int32_t> v) {
3082  const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
3083  return Vec64<double>(vcvt_f64_s64(i64));
3084 }
3085 
3086 #endif
3087 
3088 // ------------------------------ Demotions (full -> part w/ narrow lanes)
3089 
3090 // From full vector to half or quarter
3092  const Vec128<int32_t> v) {
3093  return Vec64<uint16_t>(vqmovun_s32(v.raw));
3094 }
3096  const Vec128<int32_t> v) {
3097  return Vec64<int16_t>(vqmovn_s32(v.raw));
3098 }
3100  const Vec128<int32_t> v) {
3101  const uint16x4_t a = vqmovun_s32(v.raw);
3102  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
3103 }
3105  const Vec128<int16_t> v) {
3106  return Vec64<uint8_t>(vqmovun_s16(v.raw));
3107 }
3109  const Vec128<int32_t> v) {
3110  const int16x4_t a = vqmovn_s32(v.raw);
3111  return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
3112 }
3114  const Vec128<int16_t> v) {
3115  return Vec64<int8_t>(vqmovn_s16(v.raw));
3116 }
3117 
3118 // From half vector to partial half
3119 template <size_t N, HWY_IF_LE64(int32_t, N)>
3121  const Vec128<int32_t, N> v) {
3122  return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
3123 }
3124 template <size_t N, HWY_IF_LE64(int32_t, N)>
3126  const Vec128<int32_t, N> v) {
3127  return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
3128 }
3129 template <size_t N, HWY_IF_LE64(int32_t, N)>
3131  const Vec128<int32_t, N> v) {
3132  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
3133  return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a)));
3134 }
3135 template <size_t N, HWY_IF_LE64(int16_t, N)>
3137  const Vec128<int16_t, N> v) {
3138  return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
3139 }
3140 template <size_t N, HWY_IF_LE64(int32_t, N)>
3142  const Vec128<int32_t, N> v) {
3143  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
3144  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a)));
3145 }
3146 template <size_t N, HWY_IF_LE64(int16_t, N)>
3148  const Vec128<int16_t, N> v) {
3149  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
3150 }
3151 
3152 #if __ARM_FP & 2
3153 
3154 HWY_API Vec128<float16_t, 4> DemoteTo(Full64<float16_t> /* tag */,
3155  const Vec128<float> v) {
3156  return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
3157 }
3158 template <size_t N>
3159 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
3160  const Vec128<float, N> v) {
3161  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
3162  return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
3163 }
3164 
3165 #else
3166 
3167 template <size_t N>
3169  const Vec128<float, N> v) {
3170  const RebindToUnsigned<decltype(df16)> du16;
3171  const Rebind<uint32_t, decltype(du16)> du;
3172  const RebindToSigned<decltype(du)> di;
3173  const auto bits32 = BitCast(du, v);
3174  const auto sign = ShiftRight<31>(bits32);
3175  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3176  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3177 
3178  const auto k15 = Set(di, 15);
3179  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3180  const auto is_tiny = exp < Set(di, -24);
3181 
3182  const auto is_subnormal = exp < Set(di, -14);
3183  const auto biased_exp16 =
3184  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3185  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3186  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3187  (mantissa32 >> (Set(du, 13) + sub_exp));
3188  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3189  ShiftRight<13>(mantissa32)); // <1024
3190 
3191  const auto sign16 = ShiftLeft<15>(sign);
3192  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3193  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3194  return Vec128<float16_t, N>(DemoteTo(du16, bits16).raw);
3195 }
3196 
3197 #endif
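// Scalar reference (illustrative only) of the truncating f32 -> f16 path
// above: inputs below 2^-24 flush to zero, inputs in [2^-24, 2^-14) become
// f16 subnormals (the implicit leading 1 folded into the shifted mantissa),
// and everything else is rebias-and-truncate with the exponent clamped to 15.
#include <algorithm>
#include <cstdint>
#include <cstring>

static uint16_t F32ToF16Bits(float f) {
  uint32_t bits32;
  std::memcpy(&bits32, &f, sizeof(bits32));
  const uint32_t sign = bits32 >> 31;
  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
  const int32_t exp = std::min<int32_t>(
      static_cast<int32_t>((bits32 >> 23) & 0xFF) - 127, 15);
  if (exp < -24) return 0;  // too small for f16: flush to zero
  uint32_t biased_exp16, mantissa16;
  if (exp < -14) {  // subnormal in f16
    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);  // in [1, 11)
    biased_exp16 = 0;
    mantissa16 = (1u << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
  } else {
    biased_exp16 = static_cast<uint32_t>(exp + 15);
    mantissa16 = mantissa32 >> 13;
  }
  return static_cast<uint16_t>((sign << 15) | (biased_exp16 << 10) |
                               mantissa16);
}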
3198 
3199 template <size_t N>
3201  const Vec128<float, N> v) {
3202  const Rebind<int32_t, decltype(dbf16)> di32;
3203  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3204  const Rebind<uint16_t, decltype(dbf16)> du16;
3205  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3206  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3207 }
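// Scalar equivalent (illustrative): bfloat16 is the upper 16 bits of a
// binary32, so the demotion above reduces to a truncating right-shift by 16
// (no rounding), done here via the same u32 reinterpretation.
#include <cstdint>
#include <cstring>

static uint16_t F32ToBF16Bits(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return static_cast<uint16_t>(u >> 16);  // truncate, as the DemoteTo above
}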
3208 
3209 #if HWY_ARCH_ARM_A64
3210 
3211 HWY_API Vec64<float> DemoteTo(Full64<float> /* tag */, const Vec128<double> v) {
3212  return Vec64<float>(vcvt_f32_f64(v.raw));
3213 }
3214 HWY_API Vec32<float> DemoteTo(Full32<float> /* tag */, const Vec64<double> v) {
3215  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
3216 }
3217 
3218 HWY_API Vec64<int32_t> DemoteTo(Full64<int32_t> /* tag */,
3219  const Vec128<double> v) {
3220  const int64x2_t i64 = vcvtq_s64_f64(v.raw);
3221  return Vec64<int32_t>(vqmovn_s64(i64));
3222 }
3223 HWY_API Vec32<int32_t> DemoteTo(Full32<int32_t> /* tag */,
3224  const Vec64<double> v) {
3225  const int64x1_t i64 = vcvt_s64_f64(v.raw);
3226  // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first.
3227  const int64x2_t i64x2 = vcombine_s64(i64, i64);
3228  return Vec32<int32_t>(vqmovn_s64(i64x2));
3229 }
3230 
3231 #endif
3232 
3234  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
3235  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
3236  return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
3237 }
3238 template <size_t N, HWY_IF_LE64(uint32_t, N)>
3240  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
3241  const uint8x8_t w = vuzp1_u8(org_v, org_v);
3242  return Vec128<uint8_t, N>(vuzp1_u8(w, w));
3243 }
3244 
3245 // In the following DemoteTo functions, |b| is purposely left undefined.
3246 // The value |a| must be widened to 128 bits so that vqmovn can be used, and
3247 // leaving |b| undefined (rather than zeroing it) avoids extra instructions.
3248 HWY_DIAGNOSTICS(push)
3249 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
3250 
3251 template <size_t N>
3253  const Vec128<int32_t> v) {
3256  uint16x8_t c = vcombine_u16(a.raw, b.raw);
3257  return Vec128<uint8_t, N>(vqmovn_u16(c));
3258 }
3259 
3260 template <size_t N>
3262  const Vec128<int32_t> v) {
3265  int16x8_t c = vcombine_s16(a.raw, b.raw);
3266  return Vec128<int8_t, N>(vqmovn_s16(c));
3267 }
3268 
3269 HWY_DIAGNOSTICS(pop)
3270 
3271 // ------------------------------ Convert integer <=> floating-point
3272 
3273 HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
3274  const Vec128<int32_t> v) {
3275  return Vec128<float>(vcvtq_f32_s32(v.raw));
3276 }
3277 template <size_t N, HWY_IF_LE64(int32_t, N)>
3279  const Vec128<int32_t, N> v) {
3280  return Vec128<float, N>(vcvt_f32_s32(v.raw));
3281 }
3282 
3283 // Truncates (rounds toward zero).
3285  const Vec128<float> v) {
3286  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
3287 }
3288 template <size_t N, HWY_IF_LE64(float, N)>
3290  const Vec128<float, N> v) {
3291  return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
3292 }
3293 
3294 #if HWY_ARCH_ARM_A64
3295 
3296 HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
3297  const Vec128<int64_t> v) {
3298  return Vec128<double>(vcvtq_f64_s64(v.raw));
3299 }
3300 HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
3301  const Vec64<int64_t> v) {
3302  return Vec64<double>(vcvt_f64_s64(v.raw));
3303 }
3304 
3305 // Truncates (rounds toward zero).
3306 HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
3307  const Vec128<double> v) {
3308  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
3309 }
3310 HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> /* tag */,
3311  const Vec64<double> v) {
3312  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
3313 }
3314 
3315 #endif
3316 
3317 // ------------------------------ Round (IfThenElse, mask, logical)
3318 
3319 #if HWY_ARCH_ARM_A64
3320 // Toward nearest integer
3322 
3323 // Toward zero, aka truncate
3325 
3326 // Toward +infinity, aka ceiling
3328 
3329 // Toward -infinity, aka floor
3331 #else
3332 
3333 // ------------------------------ Trunc
3334 
3335 // ARMv7 only supports truncation to integer. We can either convert back to
3336 // float (3 floating-point and 2 logic operations) or manipulate the binary32
3337 // representation, clearing the lowest (23 - exponent) mantissa bits. The latter
3338 // requires 9 integer operations and 3 constants, so it is likely more expensive.
3339 
3340 namespace detail {
3341 
3342 // The original value is already the desired result if NaN or the magnitude is
3343 // large (i.e. the value is already an integer).
3344 template <size_t N>
3346  return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
3347 }
3348 
3349 } // namespace detail
3350 
3351 template <size_t N>
3353  const DFromV<decltype(v)> df;
3354  const RebindToSigned<decltype(df)> di;
3355 
3356  const auto integer = ConvertTo(di, v); // round toward 0
3357  const auto int_f = ConvertTo(df, integer);
3358 
3359  return IfThenElse(detail::UseInt(v), int_f, v);
3360 }
3361 
3362 template <size_t N>
3364  const DFromV<decltype(v)> df;
3365 
3366  // ARMv7 also lacks a native NearestInt, but we can instead rely on rounding
3367  // (we assume the current mode is nearest-even) after addition with a large
3368  // value such that no mantissa bits remain. We may need a compiler flag for
3369  // precise floating-point to prevent this from being "optimized" out.
3370  const auto max = Set(df, MantissaEnd<float>());
3371  const auto large = CopySignToAbs(max, v);
3372  const auto added = large + v;
3373  const auto rounded = added - large;
3374 
3375  // Keep original if NaN or the magnitude is large (already an int).
3376  return IfThenElse(Abs(v) < max, rounded, v);
3377 }
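// Scalar illustration (not part of this header) of the trick above: adding
// and then subtracting a value whose ulp is 1 (2^23 for float) forces the FPU
// to round the sum in the current mode, assumed to be round-to-nearest-even.
#include <cassert>
#include <cmath>

static float RoundViaLargeAdd(float v) {
  const float max = 8388608.0f;  // 2^23, i.e. MantissaEnd<float>()
  const float large = std::copysign(max, v);
  const float rounded = (large + v) - large;
  // Keep the original if NaN or already large enough to be an integer.
  return (std::fabs(v) < max) ? rounded : v;
}

int main() {
  assert(RoundViaLargeAdd(2.5f) == 2.0f);  // ties go to even
  assert(RoundViaLargeAdd(-1.5f) == -2.0f);
  assert(RoundViaLargeAdd(3.25f) == 3.0f);
  return 0;
}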
3378 
3379 template <size_t N>
3381  const DFromV<decltype(v)> df;
3382  const RebindToSigned<decltype(df)> di;
3383 
3384  const auto integer = ConvertTo(di, v); // round toward 0
3385  const auto int_f = ConvertTo(df, integer);
3386 
3387  // Truncating a positive non-integer ends up smaller; if so, add 1.
3388  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
3389 
3390  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
3391 }
3392 
3393 template <size_t N>
3395  const DFromV<decltype(v)> df;
3396  const RebindToSigned<decltype(df)> di;
3397 
3398  const auto integer = ConvertTo(di, v); // round toward 0
3399  const auto int_f = ConvertTo(df, integer);
3400 
3401  // Truncating a negative non-integer ends up larger; if so, subtract 1.
3402  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
3403 
3404  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
3405 }
3406 
3407 #endif
3408 
3409 // ------------------------------ NearestInt (Round)
3410 
3411 #if HWY_ARCH_ARM_A64
3412 
3413 HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
3414  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
3415 }
3416 template <size_t N, HWY_IF_LE64(float, N)>
3417 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
3418  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
3419 }
3420 
3421 #else
3422 
3423 template <size_t N>
3425  const RebindToSigned<DFromV<decltype(v)>> di;
3426  return ConvertTo(di, Round(v));
3427 }
3428 
3429 #endif
3430 
3431 // ------------------------------ Floating-point classification
3432 template <typename T, size_t N>
3434  return v != v;
3435 }
3436 
3437 template <typename T, size_t N, HWY_IF_FLOAT(T)>
3439  const Simd<T, N, 0> d;
3440  const RebindToSigned<decltype(d)> di;
3441  const VFromD<decltype(di)> vi = BitCast(di, v);
3442  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
3443  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
3444 }
3445 
3446 // Returns whether normal/subnormal/zero.
3447 template <typename T, size_t N, HWY_IF_FLOAT(T)>
3449  const Simd<T, N, 0> d;
3450  const RebindToUnsigned<decltype(d)> du;
3451  const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
3452  const VFromD<decltype(du)> vu = BitCast(du, v);
3453  // 'Shift left' to clear the sign bit, then right so we can compare with the
3454  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
3455  // negative and non-negative floats would be greater).
3456  const VFromD<decltype(di)> exp =
3457  BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
3458  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
3459 }
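// Scalar equivalent (illustrative) of IsFinite above: shift out the sign bit,
// then compare the 8-bit exponent field against the all-ones pattern that
// encodes Inf and NaN.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static bool IsFiniteScalar(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  const uint32_t exp = (u << 1) >> 24;  // drop sign, keep exponent bits
  return exp != 0xFF;
}

int main() {
  assert(IsFiniteScalar(0.0f) && IsFiniteScalar(1e-45f) && IsFiniteScalar(1.0f));
  assert(!IsFiniteScalar(INFINITY) && !IsFiniteScalar(NAN));
  return 0;
}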
3460 
3461 // ================================================== SWIZZLE
3462 
3463 // ------------------------------ LowerHalf
3464 
3465 // <= 64 bit: just return different type
3466 template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>
3468  return Vec128<T, N / 2>(v.raw);
3469 }
3470 
3472  return Vec64<uint8_t>(vget_low_u8(v.raw));
3473 }
3475  return Vec64<uint16_t>(vget_low_u16(v.raw));
3476 }
3478  return Vec64<uint32_t>(vget_low_u32(v.raw));
3479 }
3481  return Vec64<uint64_t>(vget_low_u64(v.raw));
3482 }
3484  return Vec64<int8_t>(vget_low_s8(v.raw));
3485 }
3487  return Vec64<int16_t>(vget_low_s16(v.raw));
3488 }
3490  return Vec64<int32_t>(vget_low_s32(v.raw));
3491 }
3493  return Vec64<int64_t>(vget_low_s64(v.raw));
3494 }
3496  return Vec64<float>(vget_low_f32(v.raw));
3497 }
3498 #if HWY_ARCH_ARM_A64
3499 HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
3500  return Vec64<double>(vget_low_f64(v.raw));
3501 }
3502 #endif
3503 
3504 template <typename T, size_t N>
3506  Vec128<T, N> v) {
3507  return LowerHalf(v);
3508 }
3509 
3510 // ------------------------------ CombineShiftRightBytes
3511 
3512 // 128-bit
3513 template <int kBytes, typename T, class V128 = Vec128<T>>
3514 HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
3515  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
3516  const Repartition<uint8_t, decltype(d)> d8;
3517  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3518  return BitCast(d, Vec128<uint8_t>(v8));
3519 }
3520 
3521 // 64-bit
3522 template <int kBytes, typename T>
3524  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
3525  const Repartition<uint8_t, decltype(d)> d8;
3526  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3527  return BitCast(d, VFromD<decltype(d8)>(v8));
3528 }
3529 
3530 // <= 32-bit defined after ShiftLeftBytes.
3531 
3532 // ------------------------------ Shift vector by constant #bytes
3533 
3534 namespace detail {
3535 
3536 // Partially specialize because kBytes = 0 and >= size are compile errors;
3537 // callers replace the latter with 0xFF for easier specialization.
3538 template <int kBytes>
3540  // Full
3541  template <class T>
3543  const Full128<T> d;
3544  return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
3545  }
3546 
3547  // Partial
3548  template <class T, size_t N, HWY_IF_LE64(T, N)>
3550  // Expand to 64-bit so we only use the native EXT instruction.
3551  const Full64<T> d64;
3552  const auto zero64 = Zero(d64);
3553  const decltype(zero64) v64(v.raw);
3554  return Vec128<T, N>(
3555  CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3556  }
3557 };
3558 template <>
3559 struct ShiftLeftBytesT<0> {
3560  template <class T, size_t N>
3562  return v;
3563  }
3564 };
3565 template <>
3566 struct ShiftLeftBytesT<0xFF> {
3567  template <class T, size_t N>
3569  return Zero(Simd<T, N, 0>());
3570  }
3571 };
3572 
3573 template <int kBytes>
3575  template <class T, size_t N>
3577  const Simd<T, N, 0> d;
3578  // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
3579  if (N * sizeof(T) < 8) {
3580  constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
3581  const Simd<T, kReg / sizeof(T), 0> dreg;
3582  v = Vec128<T, N>(
3583  IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
3584  }
3585  return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
3586  }
3587 };
3588 template <>
3589 struct ShiftRightBytesT<0> {
3590  template <class T, size_t N>
3592  return v;
3593  }
3594 };
3595 template <>
3596 struct ShiftRightBytesT<0xFF> {
3597  template <class T, size_t N>
3599  return Zero(Simd<T, N, 0>());
3600  }
3601 };
3602 
3603 } // namespace detail
3604 
3605 template <int kBytes, typename T, size_t N>
3607  return detail::ShiftLeftBytesT < kBytes >= N * sizeof(T) ? 0xFF
3608  : kBytes > ()(v);
3609 }
3610 
3611 template <int kBytes, typename T, size_t N>
3613  return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
3614 }
3615 
3616 template <int kLanes, typename T, size_t N>
3618  const Repartition<uint8_t, decltype(d)> d8;
3619  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3620 }
3621 
3622 template <int kLanes, typename T, size_t N>
3624  return ShiftLeftLanes<kLanes>(Simd<T, N, 0>(), v);
3625 }
3626 
3627 // 0x01..0F, kBytes = 1 => 0x0001..0E
3628 template <int kBytes, typename T, size_t N>
3630  return detail::ShiftRightBytesT < kBytes >= N * sizeof(T) ? 0xFF
3631  : kBytes > ()(v);
3632 }
3633 
3634 template <int kLanes, typename T, size_t N>
3636  const Repartition<uint8_t, decltype(d)> d8;
3637  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3638 }
3639 
3640 // Calls ShiftLeftBytes
3641 template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
3643  Vec128<T, N> lo) {
3644  constexpr size_t kSize = N * sizeof(T);
3645  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3646  const Repartition<uint8_t, decltype(d)> d8;
3647  const Full64<uint8_t> d_full8;
3648  const Repartition<T, decltype(d_full8)> d_full;
3649  using V64 = VFromD<decltype(d_full8)>;
3650  const V64 hi64(BitCast(d8, hi).raw);
3651  // Move into most-significant bytes
3652  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
3653  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
3654  // After casting to full 64-bit vector of correct type, shrink to 32-bit
3655  return Vec128<T, N>(BitCast(d_full, r).raw);
3656 }
3657 
3658 // ------------------------------ UpperHalf (ShiftRightBytes)
3659 
3660 // Full input
3662  const Vec128<uint8_t> v) {
3663  return Vec64<uint8_t>(vget_high_u8(v.raw));
3664 }
3666  const Vec128<uint16_t> v) {
3667  return Vec64<uint16_t>(vget_high_u16(v.raw));
3668 }
3670  const Vec128<uint32_t> v) {
3671  return Vec64<uint32_t>(vget_high_u32(v.raw));
3672 }
3674  const Vec128<uint64_t> v) {
3675  return Vec64<uint64_t>(vget_high_u64(v.raw));
3676 }
3678  const Vec128<int8_t> v) {
3679  return Vec64<int8_t>(vget_high_s8(v.raw));
3680 }
3682  const Vec128<int16_t> v) {
3683  return Vec64<int16_t>(vget_high_s16(v.raw));
3684 }
3686  const Vec128<int32_t> v) {
3687  return Vec64<int32_t>(vget_high_s32(v.raw));
3688 }
3690  const Vec128<int64_t> v) {
3691  return Vec64<int64_t>(vget_high_s64(v.raw));
3692 }
3694  return Vec64<float>(vget_high_f32(v.raw));
3695 }
3696 #if HWY_ARCH_ARM_A64
3697 HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
3698  const Vec128<double> v) {
3699  return Vec64<double>(vget_high_f64(v.raw));
3700 }
3701 #endif
3702 
3703 // Partial
3704 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3705 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3706  Vec128<T, N> v) {
3707  const DFromV<decltype(v)> d;
3708  const RebindToUnsigned<decltype(d)> du;
3709  const auto vu = BitCast(du, v);
3710  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3711  return Vec128<T, (N + 1) / 2>(upper.raw);
3712 }
3713 
3714 // ------------------------------ Broadcast/splat any lane
3715 
3716 #if HWY_ARCH_ARM_A64
3717 // Unsigned
3718 template <int kLane>
3719 HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
3720  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3721  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
3722 }
3723 template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3724 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3725  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3726  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3727 }
3728 template <int kLane>
3729 HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
3730  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3731  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
3732 }
3733 template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3734 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3735  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3736  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3737 }
3738 template <int kLane>
3739 HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
3740  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3741  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
3742 }
3743 // Vec64<uint64_t> is defined below.
3744 
3745 // Signed
3746 template <int kLane>
3747 HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
3748  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3749  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
3750 }
3751 template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3752 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3753  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3754  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3755 }
3756 template <int kLane>
3757 HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
3758  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3759  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
3760 }
3761 template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3762 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3763  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3764  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3765 }
3766 template <int kLane>
3767 HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
3768  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3769  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
3770 }
3771 // Vec64<int64_t> is defined below.
3772 
3773 // Float
3774 template <int kLane>
3775 HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
3776  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3777  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
3778 }
3779 template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3780 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3781  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3782  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3783 }
3784 template <int kLane>
3785 HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
3786  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3787  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
3788 }
3789 template <int kLane>
3790 HWY_API Vec64<double> Broadcast(const Vec64<double> v) {
3791  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3792  return v;
3793 }
3794 
3795 #else
3796 // No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
3797 
3798 // Unsigned
3799 template <int kLane>
3801  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3802  return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
3803 }
3804 template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3806  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3807  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3808 }
3809 template <int kLane>
3811  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3812  return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
3813 }
3814 template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3816  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3817  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3818 }
3819 template <int kLane>
3821  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3822  return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
3823 }
3824 // Vec64<uint64_t> is defined below.
3825 
3826 // Signed
3827 template <int kLane>
3829  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3830  return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
3831 }
3832 template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3834  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3835  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3836 }
3837 template <int kLane>
3839  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3840  return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
3841 }
3842 template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3844  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3845  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3846 }
3847 template <int kLane>
3849  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3850  return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
3851 }
3852 // Vec64<int64_t> is defined below.
3853 
3854 // Float
3855 template <int kLane>
3857  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3858  return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
3859 }
3860 template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3862  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3863  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3864 }
3865 
3866 #endif
3867 
3868 template <int kLane>
3870  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3871  return v;
3872 }
3873 template <int kLane>
3875  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3876  return v;
3877 }
3878 
3879 // ------------------------------ TableLookupLanes
3880 
3881 // Returned by SetTableIndices for use by TableLookupLanes.
3882 template <typename T, size_t N>
3883 struct Indices128 {
3885 };
3886 
3887 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3889  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3890 #if HWY_IS_DEBUG_BUILD
3891  const Rebind<TI, decltype(d)> di;
3892  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3893  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
3894 #endif
3895 
3896  const Repartition<uint8_t, decltype(d)> d8;
3897  using V8 = VFromD<decltype(d8)>;
3898  const Repartition<uint16_t, decltype(d)> d16;
3899 
3900  // Broadcast each lane index to all bytes of T and shift to bytes
3901  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
3902  if (sizeof(T) == 4) {
3903  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3904  0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3905  const V8 lane_indices =
3906  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
3907  const V8 byte_indices =
3908  BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
3909  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3910  0, 1, 2, 3, 0, 1, 2, 3};
3911  const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
3912  return Indices128<T, N>{BitCast(d, sum).raw};
3913  } else {
3914  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3915  0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3916  const V8 lane_indices =
3917  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
3918  const V8 byte_indices =
3919  BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
3920  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
3921  0, 1, 2, 3, 4, 5, 6, 7};
3922  const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
3923  return Indices128<T, N>{BitCast(d, sum).raw};
3924  }
3925 }
3926 
3927 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3928 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
3929  const Rebind<TI, decltype(d)> di;
3930  return IndicesFromVec(d, LoadU(di, idx));
3931 }
3932 
3933 template <typename T, size_t N>
3934 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
3935  const DFromV<decltype(v)> d;
3936  const RebindToSigned<decltype(d)> di;
3937  return BitCast(
3938  d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
3939 }
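// Usage sketch (illustrative): permute lanes via a runtime index table; kIdx is
// a hypothetical index array whose entries must each be < Lanes(d).
//   const Full128<int32_t> d;
//   alignas(16) constexpr int32_t kIdx[4] = {3, 2, 1, 0};
//   const auto permuted = TableLookupLanes(v, SetTableIndices(d, kIdx));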
3940 
3941 // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
3942 
3943 // Single lane: no change
3944 template <typename T>
3945 HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
3946  return v;
3947 }
3948 
3949 // Two lanes: shuffle
3950 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3951 HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
3952  return Vec128<T, 2>(Shuffle2301(v));
3953 }
3954 
3955 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3956 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3957  return Shuffle01(v);
3958 }
3959 
3960 // Four lanes: shuffle
3961 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3962 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3963  return Shuffle0123(v);
3964 }
3965 
3966 // 16-bit
3967 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3968 HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
3969  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3970  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
3971 }
3972 
3973 // ------------------------------ Reverse2
3974 
3975 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
3976 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
3977  const RebindToUnsigned<decltype(d)> du;
3978  return BitCast(d, Vec128<uint16_t, N>(vrev32_u16(BitCast(du, v).raw)));
3979 }
3980 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3981 HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
3982  const RebindToUnsigned<decltype(d)> du;
3983  return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
3984 }
3985 
3986 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
3987 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
3988  const RebindToUnsigned<decltype(d)> du;
3989  return BitCast(d, Vec128<uint32_t, N>(vrev64_u32(BitCast(du, v).raw)));
3990 }
3991 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3992 HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
3993  const RebindToUnsigned<decltype(d)> du;
3994  return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
3995 }
3996 
3997 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3998 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
3999  return Shuffle01(v);
4000 }
4001 
4002 // ------------------------------ Reverse4
4003 
4004 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4005 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
4006  const RebindToUnsigned<decltype(d)> du;
4007  return BitCast(d, Vec128<uint16_t, N>(vrev64_u16(BitCast(du, v).raw)));
4008 }
4009 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4010 HWY_API Vec128<T> Reverse4(Full128<T> d, const Vec128<T> v) {
4011  const RebindToUnsigned<decltype(d)> du;
4012  return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
4013 }
4014 
4015 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4016 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4017  return Shuffle0123(v);
4018 }
4019 
4020 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4021 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
4022  HWY_ASSERT(0); // cannot have 4 lanes of 64-bit in 128 bits
4023 }
4024 
4025 // ------------------------------ Reverse8
4026 
4027 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4028 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
4029  return Reverse(d, v);
4030 }
4031 
4032 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4033 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
4034  HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
4035 }
4036 
4037 // ------------------------------ Other shuffles (TableLookupBytes)
4038 
4039 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
4040 // Shuffle0321 rotates one lane to the right (the previous least-significant
4041 // lane is now most-significant). These could also be implemented via
4042 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
4043 
4044 // Swap 64-bit halves
4045 template <typename T>
4046 HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
4047  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
4048 }
4049 template <typename T>
4050 HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
4051  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
4052 }
4053 
4054 // Rotate right 32 bits
4055 template <typename T>
4056 HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
4057  return CombineShiftRightBytes<4>(Full128<T>(), v, v);
4058 }
4059 
4060 // Rotate left 32 bits
4061 template <typename T>
4062 HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
4063  return CombineShiftRightBytes<12>(Full128<T>(), v, v);
4064 }
4065 
4066 // Reverse
4067 template <typename T>
4068 HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
4069  return Shuffle2301(Shuffle1032(v));
4070 }
4071 
4072 // ------------------------------ InterleaveLower
4073 
4074 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
4075 // the least-significant lane) and "b". To concatenate two half-width integers
4076 // into one, use ZipLower/Upper instead (also works with scalar).
4077 HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2)
4078 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2)
4079 
4080 #if HWY_ARCH_ARM_A64
4081 // N=1 makes no sense (in that case, there would be no upper/lower).
4082 HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
4083  const Vec128<uint64_t> b) {
4084  return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
4085 }
4086 HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
4087  const Vec128<int64_t> b) {
4088  return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
4089 }
4090 HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
4091  const Vec128<double> b) {
4092  return Vec128<double>(vzip1q_f64(a.raw, b.raw));
4093 }
4094 #else
4095 // ARMv7 emulation.
4096 HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
4097  const Vec128<uint64_t> b) {
4098  return CombineShiftRightBytes<8>(Full128<uint64_t>(), b, Shuffle01(a));
4099 }
4100 HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
4101  const Vec128<int64_t> b) {
4102  return CombineShiftRightBytes<8>(Full128<int64_t>(), b, Shuffle01(a));
4103 }
4104 #endif
4105 
4106 // Floats
4107 HWY_API Vec128<float> InterleaveLower(const Vec128<float> a,
4108  const Vec128<float> b) {
4109  return Vec128<float>(vzip1q_f32(a.raw, b.raw));
4110 }
4111 template <size_t N, HWY_IF_LE64(float, N)>
4112 HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
4113  const Vec128<float, N> b) {
4114  return Vec128<float, N>(vzip1_f32(a.raw, b.raw));
4115 }
4116 
4117 // < 64 bit parts
4118 template <typename T, size_t N, HWY_IF_LE32(T, N)>
4119 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
4120  return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw);
4121 }
4122 
4123 // Additional overload for the optional Simd<> tag.
4124 template <typename T, size_t N, class V = Vec128<T, N>>
4125 HWY_API V InterleaveLower(Simd<T, N, 0> /* tag */, V a, V b) {
4126  return InterleaveLower(a, b);
4127 }
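// Example (illustrative): for u32 vectors a = {a3,a2,a1,a0} and
// b = {b3,b2,b1,b0} (lane 0 least-significant), InterleaveLower(a, b)
// yields {b1,a1,b0,a0}.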
4128 
4129 // ------------------------------ InterleaveUpper (UpperHalf)
4130 
4131 // All functions inside detail lack the required D parameter.
4132 namespace detail {
4133 HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
4134 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
4135 
4136 #if HWY_ARCH_ARM_A64
4137 // N=1 makes no sense (in that case, there would be no upper/lower).
4138 HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4139  const Vec128<uint64_t> b) {
4140  return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
4141 }
4142 HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
4143  return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
4144 }
4145 HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) {
4146  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
4147 }
4148 #else
4149 // ARMv7 emulation.
4150 HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4151  const Vec128<uint64_t> b) {
4152  return CombineShiftRightBytes<8>(Full128<uint64_t>(), Shuffle01(b), a);
4153 }
4154 HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
4155  return CombineShiftRightBytes<8>(Full128<int64_t>(), Shuffle01(b), a);
4156 }
4157 #endif
4158 
4159 HWY_API Vec128<float> InterleaveUpper(Vec128<float> a, Vec128<float> b) {
4160  return Vec128<float>(vzip2q_f32(a.raw, b.raw));
4161 }
4162 HWY_API Vec64<float> InterleaveUpper(const Vec64<float> a,
4163  const Vec64<float> b) {
4164  return Vec64<float>(vzip2_f32(a.raw, b.raw));
4165 }
4166 
4167 } // namespace detail
4168 
4169 // Full register
4170 template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>
4171 HWY_API V InterleaveUpper(Simd<T, N, 0> /* tag */, V a, V b) {
4172  return detail::InterleaveUpper(a, b);
4173 }
4174 
4175 // Partial
4176 template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
4177 HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
4178  const Half<decltype(d)> d2;
4179  return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw));
4180 }
4181 
4182 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
4183 
4184 // Same as Interleave*, except that the return lanes are double-width integers;
4185 // this is necessary because the single-lane scalar cannot return two values.
4186 template <class V, class DW = RepartitionToWide<DFromV<V>>>
4187 HWY_API VFromD<DW> ZipLower(V a, V b) {
4188  return BitCast(DW(), InterleaveLower(a, b));
4189 }
4190 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4191 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4192  return BitCast(dw, InterleaveLower(D(), a, b));
4193 }
4194 
4195 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4196 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4197  return BitCast(dw, InterleaveUpper(D(), a, b));
4198 }
4199 
4200 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4201 
4202 template <size_t N>
4203 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
4204  Vec128<bfloat16_t, 2 * N> a,
4205  Vec128<bfloat16_t, 2 * N> b,
4206  const Vec128<float, N> sum0,
4207  Vec128<float, N>& sum1) {
4208  const Repartition<uint16_t, decltype(df32)> du16;
4209  const RebindToUnsigned<decltype(df32)> du32;
4210  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
4211  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
4212  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4213  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
4214  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4215  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
4216  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
4217 }
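// Usage sketch (illustrative): accumulate a bf16 dot product; a and b are
// assumed to be Vec128<bfloat16_t, 2 * N>. Lane order within the sums is
// unspecified, so reduce both sums at the end.
//   Vec128<float, N> sum0 = Zero(df32), sum1 = Zero(df32);
//   sum0 = ReorderWidenMulAccumulate(df32, a, b, sum0, sum1);
//   const float dot = GetLane(SumOfLanes(df32, Add(sum0, sum1)));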
4218 
4219 // ================================================== COMBINE
4220 
4221 // ------------------------------ Combine (InterleaveLower)
4222 
4223 // Full result
4224 HWY_API Vec128<uint8_t> Combine(Full128<uint8_t> /* tag */, Vec64<uint8_t> hi,
4225  Vec64<uint8_t> lo) {
4226  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
4227 }
4228 HWY_API Vec128<uint16_t> Combine(Full128<uint16_t> /* tag */,
4229  Vec64<uint16_t> hi, Vec64<uint16_t> lo) {
4230  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
4231 }
4232 HWY_API Vec128<uint32_t> Combine(Full128<uint32_t> /* tag */,
4233  Vec64<uint32_t> hi, Vec64<uint32_t> lo) {
4234  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
4235 }
4236 HWY_API Vec128<uint64_t> Combine(Full128<uint64_t> /* tag */,
4237  Vec64<uint64_t> hi, Vec64<uint64_t> lo) {
4238  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
4239 }
4240 
4241 HWY_API Vec128<int8_t> Combine(Full128<int8_t> /* tag */, Vec64<int8_t> hi,
4242  Vec64<int8_t> lo) {
4243  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
4244 }
4245 HWY_API Vec128<int16_t> Combine(Full128<int16_t> /* tag */, Vec64<int16_t> hi,
4246  Vec64<int16_t> lo) {
4247  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
4248 }
4249 HWY_API Vec128<int32_t> Combine(Full128<int32_t> /* tag */, Vec64<int32_t> hi,
4250  Vec64<int32_t> lo) {
4251  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
4252 }
4253 HWY_API Vec128<int64_t> Combine(Full128<int64_t> /* tag */, Vec64<int64_t> hi,
4254  Vec64<int64_t> lo) {
4255  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
4256 }
4257 
4258 HWY_API Vec128<float> Combine(Full128<float> /* tag */, Vec64<float> hi,
4259  Vec64<float> lo) {
4260  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
4261 }
4262 #if HWY_ARCH_ARM_A64
4263 HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec64<double> hi,
4264  Vec64<double> lo) {
4265  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
4266 }
4267 #endif
4268 
4269 // < 64bit input, <= 64 bit result
4270 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4271 HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi,
4272  Vec128<T, N / 2> lo) {
4273  // First double N (only lower halves will be used).
4274  const Vec128<T, N> hi2(hi.raw);
4275  const Vec128<T, N> lo2(lo.raw);
4276  // Repartition to two unsigned lanes (each the size of the valid input).
4277  const Simd<UnsignedFromSize<N * sizeof(T) / 2>, 2, 0> du;
4278  return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
4279 }
4280 
4281 // ------------------------------ ZeroExtendVector (Combine)
4282 
4283 template <typename T, size_t N>
4284 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
4285  return Combine(d, Zero(Half<decltype(d)>()), lo);
4286 }
4287 
4288 // ------------------------------ ConcatLowerLower
4289 
4290 // 64 or 128-bit input: just interleave
4291 template <typename T, size_t N, HWY_IF_GE64(T, N)>
4292 HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
4293  Vec128<T, N> lo) {
4294  // Treat half-width input as a single lane and interleave them.
4295  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4296  return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
4297 }
4298 
4299 namespace detail {
4300 #if HWY_ARCH_ARM_A64
4301 HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveEven, vtrn1, _, 2)
4302 HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveOdd, vtrn2, _, 2)
4303 #else
4304 
4305 // vtrn returns a struct with even and odd result.
4306 #define HWY_NEON_BUILD_TPL_HWY_TRN
4307 #define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
4308 // Pass raw args so we can accept uint16x2 args, for which there is no
4309 // corresponding uint16x2x2 return type.
4310 #define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
4311  Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
4312 #define HWY_NEON_BUILD_ARG_HWY_TRN a, b
4313 
4314 // Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
4315 // for full and half vectors.
4316 HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
4317 HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
4318 HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
4319 HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
4320 HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
4321 HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
4322 HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
4323 HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
4324 HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
4325 HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
4326 HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
4327 HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
4328 HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
4329 HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)
4330 #endif
4331 } // namespace detail
4332 
4333 // <= 32-bit input/output
4334 template <typename T, size_t N, HWY_IF_LE32(T, N)>
4335 HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
4336  Vec128<T, N> lo) {
4337  // Treat half-width input as two lanes and take every second one.
4338  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4339 #if HWY_ARCH_ARM_A64
4340  return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
4341 #else
4342  using VU = VFromD<decltype(du)>;
4343  return BitCast(
4344  d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
4345  .val[0]));
4346 #endif
4347 }
4348 
4349 // ------------------------------ ConcatUpperUpper
4350 
4351 // 64 or 128-bit input: just interleave
4352 template <typename T, size_t N, HWY_IF_GE64(T, N)>
4353 HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4354  Vec128<T, N> lo) {
4355  // Treat half-width input as a single lane and interleave them.
4356  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4357  return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
4358 }
4359 
4360 // <= 32-bit input/output
4361 template <typename T, size_t N, HWY_IF_LE32(T, N)>
4362 HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4363  Vec128<T, N> lo) {
4364  // Treat half-width input as two lanes and take every second one.
4365  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4366 #if HWY_ARCH_ARM_A64
4367  return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
4368 #else
4369  using VU = VFromD<decltype(du)>;
4370  return BitCast(
4371  d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
4372  .val[1]));
4373 #endif
4374 }
4375 
4376 // ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
4377 
4378 // 64 or 128-bit input: extract from concatenated
4379 template <typename T, size_t N, HWY_IF_GE64(T, N)>
4380 HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4381  Vec128<T, N> lo) {
4382  return CombineShiftRightBytes<N * sizeof(T) / 2>(d, hi, lo);
4383 }
4384 
4385 // <= 32-bit input/output
4386 template <typename T, size_t N, HWY_IF_LE32(T, N)>
4387 HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4388  Vec128<T, N> lo) {
4389  constexpr size_t kSize = N * sizeof(T);
4390  const Repartition<uint8_t, decltype(d)> d8;
4391  const Full64<uint8_t> d8x8;
4392  const Full64<T> d64;
4393  using V8x8 = VFromD<decltype(d8x8)>;
4394  const V8x8 hi8x8(BitCast(d8, hi).raw);
4395  // Move into most-significant bytes
4396  const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
4397  const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
4398  // Back to original lane type, then shrink N.
4399  return Vec128<T, N>(BitCast(d64, r).raw);
4400 }
4401 
4402 // ------------------------------ ConcatUpperLower
4403 
4404 // Works for all N.
4405 template <typename T, size_t N>
4406 HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4407  Vec128<T, N> lo) {
4408  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
4409 }
4410 
4411 // ------------------------------ ConcatOdd (InterleaveUpper)
4412 
4413 namespace detail {
4414 // There is no vuzpq_u64.
4415 HWY_NEON_DEF_FUNCTION_UIF81632(ConcatEven, vuzp1, _, 2)
4416 HWY_NEON_DEF_FUNCTION_UIF81632(ConcatOdd, vuzp2, _, 2)
4417 } // namespace detail
4418 
4419 // Full/half vector
4420 template <typename T, size_t N,
4421  hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
4422 HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
4423  Vec128<T, N> lo) {
4424  return detail::ConcatOdd(lo, hi);
4425 }
4426 
4427 // 8-bit x4
4428 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4429 HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> d, Vec128<T, 4> hi,
4430  Vec128<T, 4> lo) {
4431  const Twice<decltype(d)> d2;
4432  const Repartition<uint16_t, decltype(d2)> dw2;
4433  const VFromD<decltype(d2)> hi2(hi.raw);
4434  const VFromD<decltype(d2)> lo2(lo.raw);
4435  const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
4436  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
4437  // vcopy_lane_u16, but that's A64-only.
4438  return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
4439 }
4440 
4441 // Any type x2
4442 template <typename T>
4443 HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4444  Vec128<T, 2> lo) {
4445  return InterleaveUpper(d, lo, hi);
4446 }
4447 
4448 // ------------------------------ ConcatEven (InterleaveLower)
4449 
4450 // Full/half vector
4451 template <typename T, size_t N,
4452  hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
4453 HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
4454  Vec128<T, N> lo) {
4455  return detail::ConcatEven(lo, hi);
4456 }
4457 
4458 // 8-bit x4
4459 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4460 HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> d, Vec128<T, 4> hi,
4461  Vec128<T, 4> lo) {
4462  const Twice<decltype(d)> d2;
4463  const Repartition<uint16_t, decltype(d2)> dw2;
4464  const VFromD<decltype(d2)> hi2(hi.raw);
4465  const VFromD<decltype(d2)> lo2(lo.raw);
4466  const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
4467  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
4468  // vcopy_lane_u16, but that's A64-only.
4469  return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
4470 }
4471 
4472 // Any type x2
4473 template <typename T>
4474 HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4475  Vec128<T, 2> lo) {
4476  return InterleaveLower(d, lo, hi);
4477 }
4478 
4479 // ------------------------------ DupEven (InterleaveLower)
4480 
4481 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4482 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
4483 #if HWY_ARCH_ARM_A64
4484  return detail::InterleaveEven(v, v);
4485 #else
4486  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
4487 #endif
4488 }
4489 
4490 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4491 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4492  return InterleaveLower(Simd<T, N, 0>(), v, v);
4493 }
4494 
4495 // ------------------------------ DupOdd (InterleaveUpper)
4496 
4497 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4498 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
4499 #if HWY_ARCH_ARM_A64
4500  return detail::InterleaveOdd(v, v);
4501 #else
4502  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
4503 #endif
4504 }
4505 
4506 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4507 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4508  return InterleaveUpper(Simd<T, N, 0>(), v, v);
4509 }
4510 
4511 // ------------------------------ OddEven (IfThenElse)
4512 
4513 template <typename T, size_t N>
4514 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4515  const Simd<T, N, 0> d;
4516  const Repartition<uint8_t, decltype(d)> d8;
4517  alignas(16) constexpr uint8_t kBytes[16] = {
4518  ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
4519  ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
4520  ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
4521  ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
4522  ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
4523  ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
4524  ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
4525  ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
4526  };
4527  const auto vec = BitCast(d, Load(d8, kBytes));
4528  return IfThenElse(MaskFromVec(vec), b, a);
4529 }
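// Example (illustrative): OddEven(a, b) keeps a in odd lanes and b in even
// lanes, e.g. for u32: a = {a3,a2,a1,a0}, b = {b3,b2,b1,b0} -> {a3,b2,a1,b0}.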
4530 
4531 // ------------------------------ OddEvenBlocks
4532 template <typename T, size_t N>
4533 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4534  return even;
4535 }
4536 
4537 // ------------------------------ SwapAdjacentBlocks
4538 
4539 template <typename T, size_t N>
4540 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4541  return v;
4542 }
4543 
4544 // ------------------------------ ReverseBlocks
4545 
4546 // Single block: no change
4547 template <typename T>
4548 HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
4549  return v;
4550 }
4551 
4552 // ------------------------------ ReorderDemote2To (OddEven)
4553 
4554 template <size_t N>
4555 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
4556  Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4557  const RebindToUnsigned<decltype(dbf16)> du16;
4558  const Repartition<uint32_t, decltype(dbf16)> du32;
4559  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4560  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4561 }
4562 
4563 // ================================================== CRYPTO
4564 
4565 #if defined(__ARM_FEATURE_AES)
4566 
4567 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4568 #ifdef HWY_NATIVE_AES
4569 #undef HWY_NATIVE_AES
4570 #else
4571 #define HWY_NATIVE_AES
4572 #endif
4573 
4574 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4575  Vec128<uint8_t> round_key) {
4576  // NOTE: it is important that AESE and AESMC be consecutive instructions so
4577  // they can be fused. AESE includes AddRoundKey, which is a different ordering
4578  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
4579  // round key (the compiler will hopefully optimize this for multiple rounds).
4580  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4581  round_key;
4582 }
4583 
4584 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
4585  Vec128<uint8_t> round_key) {
4586  return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4587 }
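// Usage sketch (illustrative): chaining rounds over hypothetical expanded keys
// round_keys[] (Vec128<uint8_t>); the initial AddRoundKey is omitted here.
//   Vec128<uint8_t> state = block;
//   for (size_t i = 0; i + 1 < kNumRounds; ++i) {
//     state = AESRound(state, round_keys[i]);
//   }
//   state = AESLastRound(state, round_keys[kNumRounds - 1]);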
4588 
4589 HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4590  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
4591 }
4592 
4593 HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4594  return Vec128<uint64_t>(
4595  (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4596 }
4597 
4598 #endif // __ARM_FEATURE_AES
4599 
4600 // ================================================== MISC
4601 
4602 template <size_t N>
4603 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
4604  const Vec128<bfloat16_t, N> v) {
4605  const Rebind<uint16_t, decltype(df32)> du16;
4606  const RebindToSigned<decltype(df32)> di32;
4607  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4608 }
4609 
4610 // ------------------------------ MulEven (ConcatEven)
4611 
4612 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
4613 // even and the upper half into its odd neighbor lane.
4614 HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
4615  const Full128<int32_t> d;
4616  int32x4_t a_packed = ConcatEven(d, a, a).raw;
4617  int32x4_t b_packed = ConcatEven(d, b, b).raw;
4618  return Vec128<int64_t>(
4619  vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4620 }
4621 HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
4622  const Full128<uint32_t> d;
4623  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
4624  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
4625  return Vec128<uint64_t>(
4626  vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4627 }
4628 
4629 template <size_t N>
4630 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
4631  const Vec128<int32_t, N> b) {
4632  const DFromV<decltype(a)> d;
4633  int32x2_t a_packed = ConcatEven(d, a, a).raw;
4634  int32x2_t b_packed = ConcatEven(d, b, b).raw;
4635  return Vec128<int64_t, (N + 1) / 2>(
4636  vget_low_s64(vmull_s32(a_packed, b_packed)));
4637 }
4638 template <size_t N>
4639 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
4640  const Vec128<uint32_t, N> b) {
4641  const DFromV<decltype(a)> d;
4642  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
4643  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
4644  return Vec128<uint64_t, (N + 1) / 2>(
4645  vget_low_u64(vmull_u32(a_packed, b_packed)));
4646 }
4647 
4648 HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4649  uint64_t hi;
4650  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
4651  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4652 }
4653 
4654 HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4655  uint64_t hi;
4656  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
4657  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4658 }
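// Example (illustrative): for u64 inputs, MulEven(a, b) returns the full
// 128-bit product of lane 0 (low half in lane 0, high half in lane 1), and
// MulOdd(a, b) does the same for lane 1.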
4659 
4660 // ------------------------------ TableLookupBytes (Combine, LowerHalf)
4661 
4662 // Both full
4663 template <typename T, typename TI>
4664 HWY_API Vec128<TI> TableLookupBytes(const Vec128<T> bytes,
4665  const Vec128<TI> from) {
4666  const Full128<TI> d;
4667  const Repartition<uint8_t, decltype(d)> d8;
4668 #if HWY_ARCH_ARM_A64
4669  return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
4670  BitCast(d8, from).raw)));
4671 #else
4672  uint8x16_t table0 = BitCast(d8, bytes).raw;
4673  uint8x8x2_t table;
4674  table.val[0] = vget_low_u8(table0);
4675  table.val[1] = vget_high_u8(table0);
4676  uint8x16_t idx = BitCast(d8, from).raw;
4677  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4678  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4679  return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
4680 #endif
4681 }
4682 
4683 // Partial index vector
4684 template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
4685 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T> bytes,
4686  const Vec128<TI, NI> from) {
4687  const Full128<TI> d_full;
4688  const Vec64<TI> from64(from.raw);
4689  const auto idx_full = Combine(d_full, from64, from64);
4690  const auto out_full = TableLookupBytes(bytes, idx_full);
4691  return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
4692 }
4693 
4694 // Partial table vector
4695 template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>
4696 HWY_API Vec128<TI> TableLookupBytes(const Vec128<T, N> bytes,
4697  const Vec128<TI> from) {
4698  const Full128<T> d_full;
4699  return TableLookupBytes(Combine(d_full, bytes, bytes), from);
4700 }
4701 
4702 // Partial both
4703 template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N),
4704  HWY_IF_LE64(TI, NI)>
4705 HWY_API Vec128<TI, NI> TableLookupBytes(
4706  Vec128<T, N> bytes, Vec128<TI, NI> from) {
4707  const Simd<T, N, 0> d;
4708  const Simd<TI, NI, 0> d_idx;
4709  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
4710  // uint8x8
4711  const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
4712  const auto from8 = BitCast(d_idx8, from);
4713  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4714  return BitCast(d_idx, v8);
4715 }
4716 
4717 // For all vector widths; ARM anyway zeroes if >= 0x10.
4718 template <class V, class VI>
4719 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
4720  return TableLookupBytes(bytes, from);
4721 }
4722 
4723 // ------------------------------ Scatter (Store)
4724 
4725 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
4726 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
4727  T* HWY_RESTRICT base,
4728  const Vec128<Offset, N> offset) {
4729  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4730 
4731  alignas(16) T lanes[N];
4732  Store(v, d, lanes);
4733 
4734  alignas(16) Offset offset_lanes[N];
4735  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
4736 
4737  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
4738  for (size_t i = 0; i < N; ++i) {
4739  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
4740  }
4741 }
4742 
4743 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
4744 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
4745  const Vec128<Index, N> index) {
4746  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4747 
4748  alignas(16) T lanes[N];
4749  Store(v, d, lanes);
4750 
4751  alignas(16) Index index_lanes[N];
4752  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
4753 
4754  for (size_t i = 0; i < N; ++i) {
4755  base[index_lanes[i]] = lanes[i];
4756  }
4757 }
4758 
4759 // ------------------------------ Gather (Load/Store)
4760 
4761 template <typename T, size_t N, typename Offset>
4762 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
4763  const T* HWY_RESTRICT base,
4764  const Vec128<Offset, N> offset) {
4765  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4766 
4767  alignas(16) Offset offset_lanes[N];
4768  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
4769 
4770  alignas(16) T lanes[N];
4771  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
4772  for (size_t i = 0; i < N; ++i) {
4773  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
4774  }
4775  return Load(d, lanes);
4776 }
4777 
4778 template <typename T, size_t N, typename Index>
4779 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
4780  const T* HWY_RESTRICT base,
4781  const Vec128<Index, N> index) {
4782  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4783 
4784  alignas(16) Index index_lanes[N];
4785  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
4786 
4787  alignas(16) T lanes[N];
4788  for (size_t i = 0; i < N; ++i) {
4789  lanes[i] = base[index_lanes[i]];
4790  }
4791  return Load(d, lanes);
4792 }
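// Usage sketch (illustrative): gather four i32 from base[idx[i]]; base and
// idx_lanes are hypothetical names.
//   const Full128<int32_t> d;
//   const auto idx = Load(d, idx_lanes);  // lane indices, same width as T
//   const auto gathered = GatherIndex(d, base, idx);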
4793 
4794 // ------------------------------ Reductions
4795 
4796 namespace detail {
4797 
4798 // N=1 for any T: no-op
4799 template <typename T>
4800 HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
4801  return v;
4802 }
4803 template <typename T>
4804 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4805  const Vec128<T, 1> v) {
4806  return v;
4807 }
4808 template <typename T>
4809 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4810  const Vec128<T, 1> v) {
4811  return v;
4812 }
4813 
4814 // u32/i32/f32: N=2
4815 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4816 HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
4817  return v10 + Shuffle2301(v10);
4818 }
4819 template <typename T>
4820 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
4821  const Vec128<T, 2> v10) {
4822  return Min(v10, Shuffle2301(v10));
4823 }
4824 template <typename T>
4825 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4826  const Vec128<T, 2> v10) {
4827  return Max(v10, Shuffle2301(v10));
4828 }
4829 
4830 // full vectors
4831 #if HWY_ARCH_ARM_A64
4832 HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4833  return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
4834 }
4835 HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4836  return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
4837 }
4838 HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4839  return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
4840 }
4841 HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4842  return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
4843 }
4844 HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4845  return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
4846 }
4847 HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
4848  return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
4849 }
4850 #else
4851 // ARMv7 version for everything except doubles.
4852 HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4853  uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
4854  uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
4855  uint32x4x2_t v1 = vuzpq_u32(c0, c0);
4856  return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
4857 }
4858 HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4859  int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
4860  int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
4861  int32x4x2_t v1 = vuzpq_s32(c0, c0);
4862  return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
4863 }
4864 HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4865  float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
4866  float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
4867  float32x4x2_t v1 = vuzpq_f32(c0, c0);
4868  return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
4869 }
4870 HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4871  return v + Shuffle01(v);
4872 }
4873 HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4874  return v + Shuffle01(v);
4875 }
4876 #endif
4877 
4878 template <typename T>
4879 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
4880  const Vec128<T> v3210) {
4881  const Vec128<T> v1032 = Shuffle1032(v3210);
4882  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
4883  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4884  return Min(v20_31_20_31, v31_20_31_20);
4885 }
4886 template <typename T>
4887 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4888  const Vec128<T> v3210) {
4889  const Vec128<T> v1032 = Shuffle1032(v3210);
4890  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
4891  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4892  return Max(v20_31_20_31, v31_20_31_20);
4893 }
4894 
4895 // For u64/i64[/f64].
4896 template <typename T>
4897 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
4898  const Vec128<T> v10) {
4899  const Vec128<T> v01 = Shuffle01(v10);
4900  return Min(v10, v01);
4901 }
4902 template <typename T>
4903 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
4904  const Vec128<T> v10) {
4905  const Vec128<T> v01 = Shuffle01(v10);
4906  return Max(v10, v01);
4907 }
4908 
4909 // u16/i16
4910 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4911 HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4912  const Simd<int32_t, N / 2, 0> d32;
4913  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4914  const auto odd = ShiftRight<16>(BitCast(d32, v));
4915  const auto min = MinOfLanes(d32, Min(even, odd));
4916  // Also broadcast into odd lanes.
4917  return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
4918 }
4919 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4920 HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4921  const Simd<int32_t, N / 2, 0> d32;
4922  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4923  const auto odd = ShiftRight<16>(BitCast(d32, v));
4924  const auto max = MaxOfLanes(d32, Max(even, odd));
4925  // Also broadcast into odd lanes.
4926  return BitCast(Simd<T, N, 0>(), Or(max, ShiftLeft<16>(max)));
4927 }
4928 
4929 } // namespace detail
4930 
4931 template <typename T, size_t N>
4932 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4933  return detail::SumOfLanes(v);
4934 }
4935 template <typename T, size_t N>
4936 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4937  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4938 }
4939 template <typename T, size_t N>
4940 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4941  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4942 }
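// Usage sketch (illustrative): horizontal reductions broadcast their result to
// all lanes, so extract it with GetLane.
//   const Full128<float> d;
//   const float sum = GetLane(SumOfLanes(d, v));
//   const float lo  = GetLane(MinOfLanes(d, v));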
4943 
4944 // ------------------------------ LoadMaskBits (TestBit)
4945 
4946 namespace detail {
4947 
4948 // Helper function to set 64 bits and potentially return a smaller vector. The
4949 // overload is required to call the q vs non-q intrinsics. Note that 8-bit
4950 // LoadMaskBits only requires 16 bits, but 64 avoids casting.
4951 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4952 HWY_INLINE Vec128<T, N> Set64(Simd<T, N, 0> /* tag */, uint64_t mask_bits) {
4953  const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
4954  return Vec128<T, N>(BitCast(Full64<T>(), v64).raw);
4955 }
4956 template <typename T>
4957 HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
4958  return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
4959 }
4960 
4961 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4962 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4963  const RebindToUnsigned<decltype(d)> du;
4964  // Easier than Set(), which would require an >8-bit type, which would not
4965  // compile for T=uint8_t, N=1.
4966  const auto vmask_bits = Set64(du, mask_bits);
4967 
4968  // Replicate bytes 8x such that each byte contains the bit that governs it.
4969  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4970  1, 1, 1, 1, 1, 1, 1, 1};
4971  const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));
4972 
4973  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4974  1, 2, 4, 8, 16, 32, 64, 128};
4975  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4976 }
4977 
4978 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4979 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4980  const RebindToUnsigned<decltype(d)> du;
4981  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4982  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4983  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4984 }
4985 
4986 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4987 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4988  const RebindToUnsigned<decltype(d)> du;
4989  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4990  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4991  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4992 }
4993 
4994 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4995 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4996  const RebindToUnsigned<decltype(d)> du;
4997  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
4998  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4999 }
5000 
5001 } // namespace detail
5002 
5003 // `p` points to at least 8 readable bytes, not all of which need be valid.
5004 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5005 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
5006  const uint8_t* HWY_RESTRICT bits) {
5007  uint64_t mask_bits = 0;
5008  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
5009  return detail::LoadMaskBits(d, mask_bits);
5010 }
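// Usage sketch (illustrative): round-trip a mask through packed bits (one bit
// per lane, lowest bit = lane 0). StoreMaskBits is defined further below.
//   const Full128<uint32_t> d;
//   uint8_t bits[8] = {0};
//   const size_t num_bytes = StoreMaskBits(d, mask, bits);  // 1 byte for 4 lanes
//   const auto mask2 = LoadMaskBits(d, bits);                // equals mask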
5011 
5012 // ------------------------------ Mask
5013 
5014 namespace detail {
5015 
5016 // Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
5017 // BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
5018 template <typename T>
5019 HWY_INLINE uint64_t NibblesFromMask(const Full128<T> d, Mask128<T> mask) {
5020  const Full128<uint16_t> du16;
5021  const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
5022  const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
5023  return GetLane(BitCast(Full64<uint64_t>(), nib));
5024 }
5025 
5026 template <typename T>
5027 HWY_INLINE uint64_t NibblesFromMask(Full64<T> d, Mask64<T> mask) {
5028  // There is no vshrn_n_u16 for uint16x4, so zero-extend.
5029  const Twice<decltype(d)> d2;
5030  const Vec128<T> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
5031  // No need to mask, upper half is zero thanks to ZeroExtendVector.
5032  return NibblesFromMask(d2, MaskFromVec(v128));
5033 }
5034 
5035 template <typename T, size_t N, HWY_IF_LE32(T, N)>
5036 HWY_INLINE uint64_t NibblesFromMask(Simd<T, N, 0> /*d*/, Mask128<T, N> mask) {
5037  const Mask64<T> mask64(mask.raw);
5038  const uint64_t nib = NibblesFromMask(Full64<T>(), mask64);
5039  // Clear nibbles from upper half of 64-bits
5040  constexpr size_t kBytes = sizeof(T) * N;
5041  return nib & ((1ull << (kBytes * 4)) - 1);
5042 }
5043 
5044 template <typename T>
5045 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
5046  const Mask128<T> mask) {
5047  alignas(16) constexpr uint8_t kSliceLanes[16] = {
5048  1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
5049  };
5050  const Full128<uint8_t> du;
5051  const Vec128<uint8_t> values =
5052  BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
5053 
5054 #if HWY_ARCH_ARM_A64
5055  // Can't vaddv - we need two separate bytes (16 bits).
5056  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
5057  const uint8x8_t x4 = vpadd_u8(x2, x2);
5058  const uint8x8_t x8 = vpadd_u8(x4, x4);
5059  return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
5060 #else
5061  // Don't have vpaddq, so keep doubling lane size.
5062  const uint16x8_t x2 = vpaddlq_u8(values.raw);
5063  const uint32x4_t x4 = vpaddlq_u16(x2);
5064  const uint64x2_t x8 = vpaddlq_u32(x4);
5065  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
5066 #endif
5067 }
5068 
5069 template <typename T, size_t N, HWY_IF_LE64(T, N)>
5070 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
5071  const Mask128<T, N> mask) {
5072  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5073  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5074  alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
5075  0x10, 0x20, 0x40, 0x80};
5076  const Simd<T, N, 0> d;
5077  const RebindToUnsigned<decltype(d)> du;
5078  const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
5079  const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5080 
5081 #if HWY_ARCH_ARM_A64
5082  return vaddv_u8(values.raw);
5083 #else
5084  const uint16x4_t x2 = vpaddl_u8(values.raw);
5085  const uint32x2_t x4 = vpaddl_u16(x2);
5086  const uint64x1_t x8 = vpaddl_u32(x4);
5087  return vget_lane_u64(x8, 0);
5088 #endif
5089 }
5090 
5091 template <typename T>
5092 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
5093  const Mask128<T> mask) {
5094  alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
5095  0x10, 0x20, 0x40, 0x80};
5096  const Full128<T> d;
5097  const Full128<uint16_t> du;
5098  const Vec128<uint16_t> values =
5099  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
5100 #if HWY_ARCH_ARM_A64
5101  return vaddvq_u16(values.raw);
5102 #else
5103  const uint32x4_t x2 = vpaddlq_u16(values.raw);
5104  const uint64x2_t x4 = vpaddlq_u32(x2);
5105  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
5106 #endif
5107 }
5108 
5109 template <typename T, size_t N, HWY_IF_LE64(T, N)>
5110 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
5111  const Mask128<T, N> mask) {
5112  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5113  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5114  alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
5115  const Simd<T, N, 0> d;
5116  const RebindToUnsigned<decltype(d)> du;
5117  const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
5118  const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5119 #if HWY_ARCH_ARM_A64
5120  return vaddv_u16(values.raw);
5121 #else
5122  const uint32x2_t x2 = vpaddl_u16(values.raw);
5123  const uint64x1_t x4 = vpaddl_u32(x2);
5124  return vget_lane_u64(x4, 0);
5125 #endif
5126 }
5127 
5128 template <typename T>
5129 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
5130  const Mask128<T> mask) {
5131  alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
5132  const Full128<T> d;
5133  const Full128<uint32_t> du;
5134  const Vec128<uint32_t> values =
5135  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
5136 #if HWY_ARCH_ARM_A64
5137  return vaddvq_u32(values.raw);
5138 #else
5139  const uint64x2_t x2 = vpaddlq_u32(values.raw);
5140  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
5141 #endif
5142 }
5143 
5144 template <typename T, size_t N, HWY_IF_LE64(T, N)>
5145 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
5146  const Mask128<T, N> mask) {
5147  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5148  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5149  alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
5150  const Simd<T, N, 0> d;
5151  const RebindToUnsigned<decltype(d)> du;
5152  const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
5153  const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5154 #if HWY_ARCH_ARM_A64
5155  return vaddv_u32(values.raw);
5156 #else
5157  const uint64x1_t x2 = vpaddl_u32(values.raw);
5158  return vget_lane_u64(x2, 0);
5159 #endif
5160 }
5161 
5162 template <typename T>
5163 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
5164  alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
5165  const Full128<T> d;
5166  const Full128<uint64_t> du;
5167  const Vec128<uint64_t> values =
5168  BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
5169 #if HWY_ARCH_ARM_A64
5170  return vaddvq_u64(values.raw);
5171 #else
5172  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
5173 #endif
5174 }
5175 
5176 template <typename T>
5177 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
5178  const Mask128<T, 1> m) {
5179  const Full64<T> d;
5180  const Full64<uint64_t> du;
5181  const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
5182  return vget_lane_u64(values.raw, 0);
5183 }
5184 
5185 // Returns the lowest N for the BitsFromMask result.
5186 template <typename T, size_t N>
5187 constexpr uint64_t OnlyActive(uint64_t bits) {
5188  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
5189 }
5190 
5191 template <typename T, size_t N>
5192 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
5193  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5194 }
5195 
5196 // Returns number of lanes whose mask is set.
5197 //
5198 // Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
5199 // ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
5200 // changes each lane to 1 (if mask set) or 0.
5201 // NOTE: PopCount also operates on vectors, so we still have to do horizontal
5202 // sums separately. We specialize CountTrue for full vectors (negating instead
5203 // of PopCount because it avoids an extra shift), and use PopCount of
5204 // NibblesFromMask for partial vectors.
5205 
5206 template <typename T>
5207 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
5208  const Full128<int8_t> di;
5209  const int8x16_t ones =
5210  vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5211 
5212 #if HWY_ARCH_ARM_A64
5213  return static_cast<size_t>(vaddvq_s8(ones));
5214 #else
5215  const int16x8_t x2 = vpaddlq_s8(ones);
5216  const int32x4_t x4 = vpaddlq_s16(x2);
5217  const int64x2_t x8 = vpaddlq_s32(x4);
5218  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
5219 #endif
5220 }
5221 template <typename T>
5222 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
5223  const Full128<int16_t> di;
5224  const int16x8_t ones =
5225  vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5226 
5227 #if HWY_ARCH_ARM_A64
5228  return static_cast<size_t>(vaddvq_s16(ones));
5229 #else
5230  const int32x4_t x2 = vpaddlq_s16(ones);
5231  const int64x2_t x4 = vpaddlq_s32(x2);
5232  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
5233 #endif
5234 }
5235 
5236 template <typename T>
5237 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
5238  const Full128<int32_t> di;
5239  const int32x4_t ones =
5240  vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5241 
5242 #if HWY_ARCH_ARM_A64
5243  return static_cast<size_t>(vaddvq_s32(ones));
5244 #else
5245  const int64x2_t x2 = vpaddlq_s32(ones);
5246  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
5247 #endif
5248 }
5249 
5250 template <typename T>
5251 HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
5252 #if HWY_ARCH_ARM_A64
5253  const Full128<int64_t> di;
5254  const int64x2_t ones =
5255  vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5256  return static_cast<size_t>(vaddvq_s64(ones));
5257 #else
5258  const Full128<uint64_t> du;
5259  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
5260  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
5261  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
5262 #endif
5263 }
5264 
5265 } // namespace detail
5266 
5267 // Full
5268 template <typename T>
5269 HWY_API size_t CountTrue(Full128<T> /* tag */, const Mask128<T> mask) {
5270  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
5271 }
5272 
5273 // Partial
5274 template <typename T, size_t N, HWY_IF_LE64(T, N)>
5275 HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> mask) {
5276  constexpr int kDiv = 4 * sizeof(T);
5277  return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
5278 }
5279 template <typename T, size_t N>
5280 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
5281  const Mask128<T, N> mask) {
5282  const uint64_t nib = detail::NibblesFromMask(d, mask);
5283  if (nib == 0) return -1;
5284  constexpr int kDiv = 4 * sizeof(T);
5285  return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
5286 }
5287 
5288 // `p` points to at least 8 writable bytes.
5289 template <typename T, size_t N>
5290 HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
5291  uint8_t* bits) {
5292  const uint64_t mask_bits = detail::BitsFromMask(mask);
5293  const size_t kNumBytes = (N + 7) / 8;
5294  CopyBytes<kNumBytes>(&mask_bits, bits);
5295  return kNumBytes;
5296 }
5297 
5298 template <typename T, size_t N>
5299 HWY_API bool AllFalse(const Simd<T, N, 0> d, const Mask128<T, N> m) {
5300  return detail::NibblesFromMask(d, m) == 0;
5301 }
5302 
5303 // Full
5304 template <typename T>
5305 HWY_API bool AllTrue(const Full128<T> d, const Mask128<T> m) {
5306  return detail::NibblesFromMask(d, m) == ~0ull;
5307 }
5308 // Partial
5309 template <typename T, size_t N, HWY_IF_LE64(T, N)>
5310 HWY_API bool AllTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
5311  constexpr size_t kBytes = sizeof(T) * N;
5312  return detail::NibblesFromMask(d, m) == (1ull << (kBytes * 4)) - 1;
5313 }
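// Usage sketch (illustrative): typical mask queries.
//   const auto m = Lt(v, Zero(d));
//   if (!AllFalse(d, m)) {
//     const size_t num = CountTrue(d, m);
//     const intptr_t first = FindFirstTrue(d, m);  // -1 if none set
//   }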
5314 
5315 // ------------------------------ Compress
5316 
5317 template <typename T>
5318 struct CompressIsPartition {
5319  enum { value = 1 };
5320 };
5321 
5322 namespace detail {
5323 
5324 // Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
5325 HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /*d*/,
5326  const uint8_t* bytes) {
5327  return Vec128<uint8_t>(vreinterpretq_u8_u64(
5328  vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
5329 }
5330 
5331 // Load 8 bytes and return half-reg with N <= 8 bytes.
5332 template <size_t N, HWY_IF_LE64(uint8_t, N)>
5333 HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N, 0> d,
5334  const uint8_t* bytes) {
5335  return Load(d, bytes);
5336 }
5337 
5338 template <typename T, size_t N>
5339 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
5340  const uint64_t mask_bits) {
5341  HWY_DASSERT(mask_bits < 256);
5342  const Simd<T, N, 0> d;
5343  const Repartition<uint8_t, decltype(d)> d8;
5344  const Simd<uint16_t, N, 0> du;
5345 
5346  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
5347  // indices for VTBL (one vector's worth for each of 256 combinations of
5348  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5349  // store lane indices and convert to byte indices (2*lane + 0..1), with the
5350  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5351  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5352  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5353  // is likely more costly than the higher cache footprint from storing bytes.
5354  alignas(16) constexpr uint8_t table[256 * 8] = {
5355  // PrintCompress16x8Tables
5356  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5357  2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5358  4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
5359  2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5360  6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
5361  2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
5362  4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
5363  2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5364  8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
5365  2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
5366  4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
5367  2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
5368  6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
5369  2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
5370  4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
5371  2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5372  10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
5373  2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
5374  4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
5375  2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
5376  6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
5377  2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
5378  4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
5379  2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
5380  8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
5381  2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
5382  4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
5383  2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
5384  6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
5385  2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
5386  4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
5387  2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5388  12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
5389  2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
5390  4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
5391  2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
5392  6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
5393  2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
5394  4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
5395  2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
5396  8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
5397  2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
5398  4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
5399  2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
5400  6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
5401  2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
5402  4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
5403  2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
5404  10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
5405  2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
5406  4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
5407  2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
5408  6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
5409  2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
5410  4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
5411  2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
5412  8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
5413  2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
5414  4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
5415  2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
5416  6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
5417  2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
5418  4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
5419  2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5420  14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
5421  2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
5422  4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
5423  2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
5424  6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
5425  2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
5426  4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
5427  2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
5428  8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
5429  2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
5430  4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
5431  2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
5432  6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
5433  2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
5434  4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
5435  2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
5436  10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
5437  2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
5438  4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
5439  2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
5440  6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
5441  2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
5442  4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
5443  2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
5444  8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
5445  2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
5446  4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
5447  2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
5448  6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
5449  2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
5450  4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
5451  2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
5452  12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
5453  2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
5454  4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
5455  2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
5456  6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
5457  2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
5458  4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
5459  2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
5460  8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
5461  2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
5462  4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
5463  2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
5464  6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
5465  2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
5466  4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
5467  2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
5468  10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
5469  2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
5470  4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
5471  2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
5472  6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
5473  2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
5474  4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
5475  2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
5476  8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
5477  2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
5478  4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
5479  2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
5480  6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
5481  2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
5482  4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
5483  2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5484 
5485  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
5486  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5487  return BitCast(d, pairs + Set(du, 0x0100));
5488 }
5489 
5490 template <typename T, size_t N>
5491 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
5492  const uint64_t mask_bits) {
5493  HWY_DASSERT(mask_bits < 256);
5494  const Simd<T, N, 0> d;
5495  const Repartition<uint8_t, decltype(d)> d8;
5496  const Simd<uint16_t, N, 0> du;
5497 
5498  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
5499  // indices for VTBL (one vector's worth for each of 256 combinations of
5500  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5501  // store lane indices and convert to byte indices (2*lane + 0..1), with the
5502  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5503  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5504  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5505  // is likely more costly than the higher cache footprint from storing bytes.
5506  alignas(16) constexpr uint8_t table[256 * 8] = {
5507  // PrintCompressNot16x8Tables
5508  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
5509  0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
5510  0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
5511  0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
5512  0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
5513  0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
5514  0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
5515  0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
5516  0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
5517  0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
5518  0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
5519  0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
5520  0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
5521  0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
5522  0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
5523  0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
5524  0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
5525  0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
5526  0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
5527  0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
5528  0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
5529  0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
5530  0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
5531  0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
5532  0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
5533  0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
5534  0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
5535  0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
5536  0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
5537  0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
5538  0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
5539  0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
5540  0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
5541  0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
5542  0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
5543  0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
5544  0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
5545  0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
5546  0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
5547  0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
5548  0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
5549  0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
5550  0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
5551  0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
5552  0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
5553  0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
5554  0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
5555  0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
5556  0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
5557  0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
5558  0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
5559  0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
5560  0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
5561  0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
5562  0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
5563  0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
5564  0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
5565  0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
5566  0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
5567  0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
5568  0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
5569  0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
5570  0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
5571  0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
5572  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
5573  0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
5574  0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
5575  0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
5576  0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
5577  0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
5578  0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
5579  0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
5580  0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
5581  0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
5582  0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
5583  0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
5584  0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
5585  0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
5586  0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
5587  0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
5588  0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
5589  0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
5590  0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
5591  0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
5592  0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
5593  0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
5594  0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
5595  0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
5596  0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
5597  0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
5598  0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
5599  0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
5600  0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
5601  0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
5602  0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
5603  0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
5604  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
5605  0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
5606  0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
5607  0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
5608  0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
5609  0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
5610  0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
5611  0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
5612  0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
5613  0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
5614  0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
5615  0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
5616  0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
5617  0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
5618  0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
5619  0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
5620  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
5621  0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
5622  0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
5623  0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
5624  0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
5625  0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
5626  0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
5627  0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
5628  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
5629  0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
5630  0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
5631  0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
5632  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
5633  0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
5634  0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
5635  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
5636 
5637  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
5638  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5639  return BitCast(d, pairs + Set(du, 0x0100));
5640 }
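A minimal scalar sketch (plain C++, not part of this header; names are illustrative) of the lane-to-byte index expansion performed above by ZipLower and Set(du, 0x0100): each table byte already holds 2*lane, so pairing it with itself and adding 0x0100 produces the two byte offsets of that 16-bit lane.

#include <cstdio>

int main() {
  // One row of the table above: byte offsets 2*lane for the identity case.
  const unsigned table_bytes[] = {0, 2, 4, 6, 8, 10, 12, 14};
  for (unsigned idx : table_bytes) {
    const unsigned pair = idx | (idx << 8);       // models ZipLower(byte_idx, byte_idx)
    const unsigned byte_indices = pair + 0x0100;  // models + Set(du, 0x0100)
    std::printf("lane byte %2u -> bytes %2u and %2u\n", idx,
                byte_indices & 0xFFu, byte_indices >> 8);
  }
  return 0;
}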
5641 
5642 template <typename T, size_t N>
5643 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
5644  const uint64_t mask_bits) {
5645  HWY_DASSERT(mask_bits < 16);
5646 
5647  // There are only 4 lanes, so we can afford to load the index vector directly.
5648  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
5649  // PrintCompress32x4Tables
5650  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5651  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5652  4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
5653  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5654  8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
5655  0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
5656  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
5657  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5658  12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
5659  0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
5660  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
5661  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
5662  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
5663  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
5664  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
5665  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5666  const Simd<T, N, 0> d;
5667  const Repartition<uint8_t, decltype(d)> d8;
5668  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5669 }
5670 
5671 template <typename T, size_t N>
5672 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
5673  const uint64_t mask_bits) {
5674  HWY_DASSERT(mask_bits < 16);
5675 
5676  // There are only 4 lanes, so we can afford to load the index vector directly.
5677  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
5678  // PrintCompressNot32x4Tables
5679  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5680  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5681  8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5682  14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5683  12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5684  2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5685  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5686  10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5687  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5688  2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5689  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5690  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5691  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5692  10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5693  12, 13, 14, 15};
5694  const Simd<T, N, 0> d;
5695  const Repartition<uint8_t, decltype(d)> d8;
5696  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5697 }
5698 
5699 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
5700 
5701 template <typename T, size_t N>
5702 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
5703  const uint64_t mask_bits) {
5704  HWY_DASSERT(mask_bits < 4);
5705 
5706  // There are only 2 lanes, so we can afford to load the index vector directly.
5707  alignas(16) constexpr uint8_t u8_indices[64] = {
5708  // PrintCompress64x2Tables
5709  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5710  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5711  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5712  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5713 
5714  const Simd<T, N, 0> d;
5715  const Repartition<uint8_t, decltype(d)> d8;
5716  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5717 }
5718 
5719 template <typename T, size_t N>
5720 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
5721  const uint64_t mask_bits) {
5722  HWY_DASSERT(mask_bits < 4);
5723 
5724  // There are only 2 lanes, so we can afford to load the index vector directly.
5725  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
5726  // PrintCompressNot64x2Tables
5727  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5728  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5729  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5730  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5731 
5732  const Simd<T, N, 0> d;
5733  const Repartition<uint8_t, decltype(d)> d8;
5734  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5735 }
5736 
5737 #endif
5738 
5739 // Helper function called by both Compress and CompressStore - avoids a
5740 // redundant BitsFromMask in the latter.
5741 template <typename T, size_t N>
5742 HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
5743  const auto idx =
5744  detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
5745  using D = Simd<T, N, 0>;
5746  const RebindToSigned<D> di;
5747  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
5748 }
5749 
5750 template <typename T, size_t N>
5751 HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
5752  const auto idx =
5753  detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
5754  using D = Simd<T, N, 0>;
5755  const RebindToSigned<D> di;
5756  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
5757 }
5758 
5759 } // namespace detail
5760 
5761 // Single lane: no-op
5762 template <typename T>
5763 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
5764  return v;
5765 }
5766 
5767 // Two lanes: conditional swap
5768 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5769 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
5770  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
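 // Added note: DupEven broadcasts mask[0] to both lanes and DupOdd broadcasts
 // mask[1], so AndNot(maskL, maskH) = maskH & ~maskL is all-ones exactly when
 // mask[1] is set and mask[0] is not, the only case where the single kept
 // lane must move from index 1 down to index 0 (hence the Shuffle01 below).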
5771  const Simd<T, N, 0> d;
5772  const Vec128<T, N> m = VecFromMask(d, mask);
5773  const Vec128<T, N> maskL = DupEven(m);
5774  const Vec128<T, N> maskH = DupOdd(m);
5775  const Vec128<T, N> swap = AndNot(maskL, maskH);
5776  return IfVecThenElse(swap, Shuffle01(v), v);
5777 }
5778 
5779 // General case
5780 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5781 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
5782  return detail::Compress(v, detail::BitsFromMask(mask));
5783 }
5784 
5785 // Single lane: no-op
5786 template <typename T>
5787 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
5788  return v;
5789 }
5790 
5791 // Two lanes: conditional swap
5792 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
5793 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
5794  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
5795  const Full128<T> d;
5796  const Vec128<T> m = VecFromMask(d, mask);
5797  const Vec128<T> maskL = DupEven(m);
5798  const Vec128<T> maskH = DupOdd(m);
5799  const Vec128<T> swap = AndNot(maskH, maskL);
5800  return IfVecThenElse(swap, Shuffle01(v), v);
5801 }
5802 
5803 // General case
5804 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5805 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
5806  // For partial vectors, we cannot pull the Not() into the table because
5807  // BitsFromMask clears the upper bits.
5808  if (N < 16 / sizeof(T)) {
5809  return detail::Compress(v, detail::BitsFromMask(Not(mask)));
5810  }
5811  return detail::CompressNot(v, detail::BitsFromMask(mask));
5812 }
5813 
5814 // ------------------------------ CompressBlocksNot
5815 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
5816  Mask128<uint64_t> /* m */) {
5817  return v;
5818 }
5819 
5820 // ------------------------------ CompressBits
5821 
5822 template <typename T, size_t N>
5823 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5824  const uint8_t* HWY_RESTRICT bits) {
5825  uint64_t mask_bits = 0;
5826  constexpr size_t kNumBytes = (N + 7) / 8;
5827  CopyBytes<kNumBytes>(bits, &mask_bits);
5828  if (N < 8) {
5829  mask_bits &= (1ull << N) - 1;
5830  }
5831 
5832  return detail::Compress(v, mask_bits);
5833 }
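A hedged usage sketch of CompressBits (the function name DemoCompressBits and the chosen values are illustrative, assuming the usual static-dispatch setup with hwy/highway.h): the bit array is little-endian, bit i of bits[i / 8] selecting lane i.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Packs the selected lanes of an 8-lane uint16_t vector toward lane 0.
HWY_ATTR void DemoCompressBits(uint16_t* HWY_RESTRICT out) {
  const hn::Full128<uint16_t> d;          // 8 lanes
  const auto v = hn::Iota(d, 0);          // 0, 1, ..., 7
  const uint8_t bits[1] = {0x25};         // 0b00100101: keep lanes 0, 2 and 5
  // out[0..2] = 0, 2, 5; the remaining lanes are unspecified padding.
  hn::StoreU(hn::CompressBits(v, bits), d, out);
}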
5834 
5835 // ------------------------------ CompressStore
5836 template <typename T, size_t N>
5837 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
5838  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5839  const uint64_t mask_bits = detail::BitsFromMask(mask);
5840  StoreU(detail::Compress(v, mask_bits), d, unaligned);
5841  return PopCount(mask_bits);
5842 }
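A hedged sketch of stream compaction with CompressStore (AppendGreater is an illustrative name, not a Highway API): the mask comes from an ordinary comparison, the selected lanes are packed to the front of out, and the return value is their count. Note that a full vector is written, so out needs room for Lanes(d) elements.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Writes the lanes of v that are greater than threshold and returns the count.
HWY_ATTR size_t AppendGreater(hn::Vec128<float> v, float threshold,
                              float* HWY_RESTRICT out) {
  const hn::Full128<float> d;
  const auto mask = hn::Gt(v, hn::Set(d, threshold));
  return hn::CompressStore(v, mask, d, out);
}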
5843 
5844 // ------------------------------ CompressBlendedStore
5845 template <typename T, size_t N>
5846 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
5847  Simd<T, N, 0> d,
5848  T* HWY_RESTRICT unaligned) {
5849  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
5850  using TU = TFromD<decltype(du)>;
5851  const uint64_t mask_bits = detail::BitsFromMask(m);
5852  const size_t count = PopCount(mask_bits);
5853  const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
5854  const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
5855  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
5856  return count;
5857 }
5858 
5859 // ------------------------------ CompressBitsStore
5860 
5861 template <typename T, size_t N>
5862 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
5863  const uint8_t* HWY_RESTRICT bits,
5864  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5865  uint64_t mask_bits = 0;
5866  constexpr size_t kNumBytes = (N + 7) / 8;
5867  CopyBytes<kNumBytes>(bits, &mask_bits);
5868  if (N < 8) {
5869  mask_bits &= (1ull << N) - 1;
5870  }
5871 
5872  StoreU(detail::Compress(v, mask_bits), d, unaligned);
5873  return PopCount(mask_bits);
5874 }
5875 
5876 // ------------------------------ LoadInterleaved2
5877 
5878 // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
5879 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5880 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5881 #else
5882 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
5883 #endif
5884 
5885 namespace detail {
5886 #define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5887 #define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
5888 
5889 #if HWY_ARCH_ARM_A64
5890 #define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
5891 #define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
5892 #else
5893 // Exclude 64x2 and f64x1, which are only supported on aarch64
5894 #define HWY_IF_LOAD_INT(T, N) \
5895  hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
5896 #define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
5897  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
5898  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
5899  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
5900  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
5901  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
5902 #endif // HWY_ARCH_ARM_A64
5903 
5904 // Must return raw tuple because Tuple2 lacks a ctor, and we cannot use
5905 // brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
5906 // void.
5907 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5908  decltype(Tuple2<type##_t, size>().raw)
5909 // Tuple tag arg allows overloading (cannot just overload on return type)
5910 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5911  const type##_t *from, Tuple2<type##_t, size>
5912 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
5913 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5914 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5915 
5916 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5917  decltype(Tuple3<type##_t, size>().raw)
5918 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5919  const type##_t *from, Tuple3<type##_t, size>
5920 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
5921 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5922 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5923 
5924 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5925  decltype(Tuple4<type##_t, size>().raw)
5926 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5927  const type##_t *from, Tuple4<type##_t, size>
5928 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
5929 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5930 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5931 
5932 #undef HWY_NEON_DEF_FUNCTION_LOAD_INT
5933 #undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5934 #undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
5935 } // namespace detail
5936 
5937 template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
5938 HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
5939  const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5940  Vec128<T, N>& v1) {
5941  auto raw = detail::LoadInterleaved2(unaligned, detail::Tuple2<T, N>());
5942  v0 = Vec128<T, N>(raw.val[0]);
5943  v1 = Vec128<T, N>(raw.val[1]);
5944 }
5945 
5946 // <= 32 bits: avoid loading more than N bytes by copying to buffer
5947 template <typename T, size_t N, HWY_IF_LE32(T, N)>
5948 HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
5949  const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5950  Vec128<T, N>& v1) {
5951  // The smallest vector registers are 64 bits and we want space for two.
5952  alignas(16) T buf[2 * 8 / sizeof(T)] = {};
5953  CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
5954  auto raw = detail::LoadInterleaved2(buf, detail::Tuple2<T, N>());
5955  v0 = Vec128<T, N>(raw.val[0]);
5956  v1 = Vec128<T, N>(raw.val[1]);
5957 }
5958 
5959 #if HWY_ARCH_ARM_V7
5960 // 64x2: split into two 64x1
5961 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
5962 HWY_API void LoadInterleaved2(Full128<T> d, T* HWY_RESTRICT unaligned,
5963  Vec128<T>& v0, Vec128<T>& v1) {
5964  const Half<decltype(d)> dh;
5965  VFromD<decltype(dh)> v00, v10, v01, v11;
5966  LoadInterleaved2(dh, unaligned, v00, v10);
5967  LoadInterleaved2(dh, unaligned + 2, v01, v11);
5968  v0 = Combine(d, v01, v00);
5969  v1 = Combine(d, v11, v10);
5970 }
5971 #endif // HWY_ARCH_ARM_V7
5972 
5973 // ------------------------------ LoadInterleaved3
5974 
5975 template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
5976 HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
5977  const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5978  Vec128<T, N>& v1, Vec128<T, N>& v2) {
5979  auto raw = detail::LoadInterleaved3(unaligned, detail::Tuple3<T, N>());
5980  v0 = Vec128<T, N>(raw.val[0]);
5981  v1 = Vec128<T, N>(raw.val[1]);
5982  v2 = Vec128<T, N>(raw.val[2]);
5983 }
5984 
5985 // <= 32 bits: avoid loading more than N bytes by copying to buffer
5986 template <typename T, size_t N, HWY_IF_LE32(T, N)>
5987 HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
5988  const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5989  Vec128<T, N>& v1, Vec128<T, N>& v2) {
5990  // The smallest vector registers are 64 bits and we want space for three.
5991  alignas(16) T buf[3 * 8 / sizeof(T)] = {};
5992  CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
5993  auto raw = detail::LoadInterleaved3(buf, detail::Tuple3<T, N>());
5994  v0 = Vec128<T, N>(raw.val[0]);
5995  v1 = Vec128<T, N>(raw.val[1]);
5996  v2 = Vec128<T, N>(raw.val[2]);
5997 }
5998 
5999 #if HWY_ARCH_ARM_V7
6000 // 64x2: split into two 64x1
6001 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6002 HWY_API void LoadInterleaved3(Full128<T> d, const T* HWY_RESTRICT unaligned,
6003  Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
6004  const Half<decltype(d)> dh;
6005  VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
6006  LoadInterleaved3(dh, unaligned, v00, v10, v20);
6007  LoadInterleaved3(dh, unaligned + 3, v01, v11, v21);
6008  v0 = Combine(d, v01, v00);
6009  v1 = Combine(d, v11, v10);
6010  v2 = Combine(d, v21, v20);
6011 }
6012 #endif // HWY_ARCH_ARM_V7
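A hedged sketch of the vld3-backed LoadInterleaved3 above (DeinterleaveRGB is an illustrative name, assuming the usual static-dispatch setup): splitting 16 packed RGB pixels into per-channel vectors.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Reads 48 interleaved bytes (R0 G0 B0 R1 ...) and stores three 16-byte planes.
HWY_ATTR void DeinterleaveRGB(const uint8_t* HWY_RESTRICT rgb,
                              uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
                              uint8_t* HWY_RESTRICT b) {
  const hn::Full128<uint8_t> d;
  hn::Vec128<uint8_t> vr, vg, vb;
  hn::LoadInterleaved3(d, rgb, vr, vg, vb);
  hn::StoreU(vr, d, r);
  hn::StoreU(vg, d, g);
  hn::StoreU(vb, d, b);
}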
6013 
6014 // ------------------------------ LoadInterleaved4
6015 
6016 template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
6017 HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
6018  const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6019  Vec128<T, N>& v1, Vec128<T, N>& v2,
6020  Vec128<T, N>& v3) {
6021  auto raw = detail::LoadInterleaved4(unaligned, detail::Tuple4<T, N>());
6022  v0 = Vec128<T, N>(raw.val[0]);
6023  v1 = Vec128<T, N>(raw.val[1]);
6024  v2 = Vec128<T, N>(raw.val[2]);
6025  v3 = Vec128<T, N>(raw.val[3]);
6026 }
6027 
6028 // <= 32 bits: avoid loading more than N bytes by copying to buffer
6029 template <typename T, size_t N, HWY_IF_LE32(T, N)>
6030 HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
6031  const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6032  Vec128<T, N>& v1, Vec128<T, N>& v2,
6033  Vec128<T, N>& v3) {
6034  alignas(16) T buf[4 * 8 / sizeof(T)] = {};
6035  CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
6036  auto raw = detail::LoadInterleaved4(buf, detail::Tuple4<T, N>());
6037  v0 = Vec128<T, N>(raw.val[0]);
6038  v1 = Vec128<T, N>(raw.val[1]);
6039  v2 = Vec128<T, N>(raw.val[2]);
6040  v3 = Vec128<T, N>(raw.val[3]);
6041 }
6042 
6043 #if HWY_ARCH_ARM_V7
6044 // 64x2: split into two 64x1
6045 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6046 HWY_API void LoadInterleaved4(Full128<T> d, const T* HWY_RESTRICT unaligned,
6047  Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
6048  Vec128<T>& v3) {
6049  const Half<decltype(d)> dh;
6050  VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
6051  LoadInterleaved4(dh, unaligned, v00, v10, v20, v30);
6052  LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31);
6053  v0 = Combine(d, v01, v00);
6054  v1 = Combine(d, v11, v10);
6055  v2 = Combine(d, v21, v20);
6056  v3 = Combine(d, v31, v30);
6057 }
6058 #endif // HWY_ARCH_ARM_V7
6059 
6060 #undef HWY_IF_LOAD_INT
6061 
6062 // ------------------------------ StoreInterleaved2
6063 
6064 namespace detail {
6065 #define HWY_NEON_BUILD_TPL_HWY_STORE_INT
6066 #define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
6067 #define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
6068 
6069 #if HWY_ARCH_ARM_A64
6070 #define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
6071 #define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6072 #else
6073 // Exclude 64x2 and f64x1, which are only supported on aarch64
6074 #define HWY_IF_STORE_INT(T, N) \
6075  hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6076 #define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
6077  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6078  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6079  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6080  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6081  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6082 #endif // HWY_ARCH_ARM_A64
6083 
6084 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6085  Tuple2<type##_t, size> tup, type##_t *to
6086 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT)
6087 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6088 
6089 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6090  Tuple3<type##_t, size> tup, type##_t *to
6091 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT)
6092 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6093 
6094 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6095  Tuple4<type##_t, size> tup, type##_t *to
6096 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT)
6097 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6098 
6099 #undef HWY_NEON_DEF_FUNCTION_STORE_INT
6100 #undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
6101 #undef HWY_NEON_BUILD_RET_HWY_STORE_INT
6102 #undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
6103 } // namespace detail
6104 
6105 template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6106 HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
6107  Simd<T, N, 0> /*tag*/,
6108  T* HWY_RESTRICT unaligned) {
6109  detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6110  detail::StoreInterleaved2(tup, unaligned);
6111 }
6112 
6113 // <= 32 bits: avoid writing more than N bytes by copying to buffer
6114 template <typename T, size_t N, HWY_IF_LE32(T, N)>
6115 HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
6116  Simd<T, N, 0> /*tag*/,
6117  T* HWY_RESTRICT unaligned) {
6118  alignas(16) T buf[2 * 8 / sizeof(T)];
6119  detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6120  detail::StoreInterleaved2(tup, buf);
6121  CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
6122 }
6123 
6124 #if HWY_ARCH_ARM_V7
6125 // 64x2: split into two 64x1
6126 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6127 HWY_API void StoreInterleaved2(const Vec128<T> v0, const Vec128<T> v1,
6128  Full128<T> d, T* HWY_RESTRICT unaligned) {
6129  const Half<decltype(d)> dh;
6130  StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned);
6131  StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2);
6132 }
6133 #endif // HWY_ARCH_ARM_V7
6134 
6135 // ------------------------------ StoreInterleaved3
6136 
6137 template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6138 HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
6139  const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
6140  T* HWY_RESTRICT unaligned) {
6141  detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6142  detail::StoreInterleaved3(tup, unaligned);
6143 }
6144 
6145 // <= 32 bits: avoid writing more than N bytes by copying to buffer
6146 template <typename T, size_t N, HWY_IF_LE32(T, N)>
6147 HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
6148  const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
6149  T* HWY_RESTRICT unaligned) {
6150  alignas(16) T buf[3 * 8 / sizeof(T)];
6151  detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6152  detail::StoreInterleaved3(tup, buf);
6153  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
6154 }
6155 
6156 #if HWY_ARCH_ARM_V7
6157 // 64x2: split into two 64x1
6158 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6159 HWY_API void StoreInterleaved3(const Vec128<T> v0, const Vec128<T> v1,
6160  const Vec128<T> v2, Full128<T> d,
6161  T* HWY_RESTRICT unaligned) {
6162  const Half<decltype(d)> dh;
6163  StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
6164  unaligned);
6165  StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
6166  unaligned + 3);
6167 }
6168 #endif // HWY_ARCH_ARM_V7
6169 
6170 // ------------------------------ StoreInterleaved4
6171 
6172 template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6173 HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
6174  const Vec128<T, N> v2, const Vec128<T, N> v3,
6175  Simd<T, N, 0> /*tag*/,
6176  T* HWY_RESTRICT unaligned) {
6177  detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6178  detail::StoreInterleaved4(tup, unaligned);
6179 }
6180 
6181 // <= 32 bits: avoid writing more than N bytes by copying to buffer
6182 template <typename T, size_t N, HWY_IF_LE32(T, N)>
6183 HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
6184  const Vec128<T, N> v2, const Vec128<T, N> v3,
6185  Simd<T, N, 0> /*tag*/,
6186  T* HWY_RESTRICT unaligned) {
6187  alignas(16) T buf[4 * 8 / sizeof(T)];
6188  detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6189  detail::StoreInterleaved4(tup, buf);
6190  CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
6191 }
6192 
6193 #if HWY_ARCH_ARM_V7
6194 // 64x2: split into two 64x1
6195 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6196 HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
6197  const Vec128<T> v2, const Vec128<T> v3,
6198  Full128<T> d, T* HWY_RESTRICT unaligned) {
6199  const Half<decltype(d)> dh;
6200  StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
6201  LowerHalf(dh, v3), dh, unaligned);
6202  StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
6203  UpperHalf(dh, v3), dh, unaligned + 4);
6204 }
6205 #endif // HWY_ARCH_ARM_V7
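Likewise, a hedged sketch for the vst4-backed StoreInterleaved4 (InterleaveRGBA is an illustrative name): packing four 16-byte planes into 64 interleaved RGBA bytes.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Interleaves 16 pixels from separate R, G, B, A planes into rgba[0..63].
HWY_ATTR void InterleaveRGBA(const uint8_t* HWY_RESTRICT r,
                             const uint8_t* HWY_RESTRICT g,
                             const uint8_t* HWY_RESTRICT b,
                             const uint8_t* HWY_RESTRICT a,
                             uint8_t* HWY_RESTRICT rgba) {
  const hn::Full128<uint8_t> d;
  hn::StoreInterleaved4(hn::LoadU(d, r), hn::LoadU(d, g), hn::LoadU(d, b),
                        hn::LoadU(d, a), d, rgba);
}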
6206 
6207 #undef HWY_IF_STORE_INT
6208 
6209 // ------------------------------ Lt128
6210 
6211 template <typename T, size_t N, HWY_IF_LE128(T, N)>
6212 HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
6213  Vec128<T, N> b) {
6214  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
6215  // Truth table of Eq and Lt for Hi and Lo u64.
6216  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
6217  // =H =L cH cL | out = cH | (=H & cL)
6218  // 0 0 0 0 | 0
6219  // 0 0 0 1 | 0
6220  // 0 0 1 0 | 1
6221  // 0 0 1 1 | 1
6222  // 0 1 0 0 | 0
6223  // 0 1 0 1 | 0
6224  // 0 1 1 0 | 1
6225  // 1 0 0 0 | 0
6226  // 1 0 0 1 | 1
6227  // 1 1 0 0 | 0
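 // Added worked example: for a = (hi 1, lo 5) and b = (hi 1, lo 9), we have
 // =H = 1, cH = 0, cL = 1, so out = cH | (=H & cL) = 1, i.e. a < b when the
 // two u64 lanes are interpreted as one 128-bit number (low lane = low half).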
6228  const Mask128<T, N> eqHL = Eq(a, b);
6229  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
6230  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
6231  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
6232  // comparison result leftwards requires only 4. IfThenElse compiles to the
6233  // same code as OrAnd().
6234  const Vec128<T, N> ltLx = DupEven(ltHL);
6235  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
6236  return MaskFromVec(DupOdd(outHx));
6237 }
6238 
6239 template <typename T, size_t N, HWY_IF_LE128(T, N)>
6240 HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
6241  Vec128<T, N> b) {
6242  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
6243  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
6244 }
6245 
6246 // ------------------------------ Min128, Max128 (Lt128)
6247 
6248 // Without a native OddEven, it seems infeasible to go faster than Lt128.
6249 template <class D>
6250 HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
6251  return IfThenElse(Lt128(d, a, b), a, b);
6252 }
6253 
6254 template <class D>
6255 HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
6256  return IfThenElse(Lt128(d, b, a), a, b);
6257 }
6258 
6259 template <class D>
6260 HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
6261  return IfThenElse(Lt128Upper(d, a, b), a, b);
6262 }
6263 
6264 template <class D>
6265 HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
6266  return IfThenElse(Lt128Upper(d, b, a), a, b);
6267 }
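A hedged sketch of Min128 (SmallerKey is an illustrative name): each 128-bit block is treated as one unsigned number with the low u64 in the even lane and the high u64 in the odd lane, and the whole smaller block is returned.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns whichever of a or b is smaller as an unsigned 128-bit integer.
HWY_ATTR hn::Vec128<uint64_t> SmallerKey(hn::Vec128<uint64_t> a,
                                         hn::Vec128<uint64_t> b) {
  const hn::Full128<uint64_t> d;
  return hn::Min128(d, a, b);
}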
6268 
6269 // ================================================== Operator wrapper
6270 
6271 // These wrappers apply to all vector types; there are no restrictions on V.
6272 
6273 template <class V>
6274 HWY_API V Add(V a, V b) {
6275  return a + b;
6276 }
6277 template <class V>
6278 HWY_API V Sub(V a, V b) {
6279  return a - b;
6280 }
6281 
6282 template <class V>
6283 HWY_API V Mul(V a, V b) {
6284  return a * b;
6285 }
6286 template <class V>
6287 HWY_API V Div(V a, V b) {
6288  return a / b;
6289 }
6290 
6291 template <class V>
6292 V Shl(V a, V b) {
6293  return a << b;
6294 }
6295 template <class V>
6296 V Shr(V a, V b) {
6297  return a >> b;
6298 }
6299 
6300 template <class V>
6301 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
6302  return a == b;
6303 }
6304 template <class V>
6305 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
6306  return a != b;
6307 }
6308 template <class V>
6309 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
6310  return a < b;
6311 }
6312 
6313 template <class V>
6314 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
6315  return a > b;
6316 }
6317 template <class V>
6318 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
6319  return a >= b;
6320 }
6321 
6322 template <class V>
6323 HWY_API auto Le(V a, V b) -> decltype(a == b) {
6324  return a <= b;
6325 }
6326 
6327 namespace detail { // for code folding
6328 #if HWY_ARCH_ARM_V7
6329 #undef vuzp1_s8
6330 #undef vuzp1_u8
6331 #undef vuzp1_s16
6332 #undef vuzp1_u16
6333 #undef vuzp1_s32
6334 #undef vuzp1_u32
6335 #undef vuzp1_f32
6336 #undef vuzp1q_s8
6337 #undef vuzp1q_u8
6338 #undef vuzp1q_s16
6339 #undef vuzp1q_u16
6340 #undef vuzp1q_s32
6341 #undef vuzp1q_u32
6342 #undef vuzp1q_f32
6343 #undef vuzp2_s8
6344 #undef vuzp2_u8
6345 #undef vuzp2_s16
6346 #undef vuzp2_u16
6347 #undef vuzp2_s32
6348 #undef vuzp2_u32
6349 #undef vuzp2_f32
6350 #undef vuzp2q_s8
6351 #undef vuzp2q_u8
6352 #undef vuzp2q_s16
6353 #undef vuzp2q_u16
6354 #undef vuzp2q_s32
6355 #undef vuzp2q_u32
6356 #undef vuzp2q_f32
6357 #undef vzip1_s8
6358 #undef vzip1_u8
6359 #undef vzip1_s16
6360 #undef vzip1_u16
6361 #undef vzip1_s32
6362 #undef vzip1_u32
6363 #undef vzip1_f32
6364 #undef vzip1q_s8
6365 #undef vzip1q_u8
6366 #undef vzip1q_s16
6367 #undef vzip1q_u16
6368 #undef vzip1q_s32
6369 #undef vzip1q_u32
6370 #undef vzip1q_f32
6371 #undef vzip2_s8
6372 #undef vzip2_u8
6373 #undef vzip2_s16
6374 #undef vzip2_u16
6375 #undef vzip2_s32
6376 #undef vzip2_u32
6377 #undef vzip2_f32
6378 #undef vzip2q_s8
6379 #undef vzip2q_u8
6380 #undef vzip2q_s16
6381 #undef vzip2q_u16
6382 #undef vzip2q_s32
6383 #undef vzip2q_u32
6384 #undef vzip2q_f32
6385 #endif
6386 
6387 #undef HWY_NEON_BUILD_ARG_1
6388 #undef HWY_NEON_BUILD_ARG_2
6389 #undef HWY_NEON_BUILD_ARG_3
6390 #undef HWY_NEON_BUILD_PARAM_1
6391 #undef HWY_NEON_BUILD_PARAM_2
6392 #undef HWY_NEON_BUILD_PARAM_3
6393 #undef HWY_NEON_BUILD_RET_1
6394 #undef HWY_NEON_BUILD_RET_2
6395 #undef HWY_NEON_BUILD_RET_3
6396 #undef HWY_NEON_BUILD_TPL_1
6397 #undef HWY_NEON_BUILD_TPL_2
6398 #undef HWY_NEON_BUILD_TPL_3
6399 #undef HWY_NEON_DEF_FUNCTION
6400 #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
6401 #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
6402 #undef HWY_NEON_DEF_FUNCTION_FLOAT_64
6403 #undef HWY_NEON_DEF_FUNCTION_INTS
6404 #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
6405 #undef HWY_NEON_DEF_FUNCTION_INT_16
6406 #undef HWY_NEON_DEF_FUNCTION_INT_32
6407 #undef HWY_NEON_DEF_FUNCTION_INT_8
6408 #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
6409 #undef HWY_NEON_DEF_FUNCTION_TPL
6410 #undef HWY_NEON_DEF_FUNCTION_UIF81632
6411 #undef HWY_NEON_DEF_FUNCTION_UINTS
6412 #undef HWY_NEON_DEF_FUNCTION_UINT_16
6413 #undef HWY_NEON_DEF_FUNCTION_UINT_32
6414 #undef HWY_NEON_DEF_FUNCTION_UINT_8
6415 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
6416 #undef HWY_NEON_EVAL
6417 } // namespace detail
6418 
6419 // NOLINTNEXTLINE(google-readability-namespace-comments)
6420 } // namespace HWY_NAMESPACE
6421 } // namespace hwy
HWY_AFTER_NAMESPACE();
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition: arm_neon-inl.h:159
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:182
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition: arm_neon-inl.h:192
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:138
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:133
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:6076
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:91
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:187
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:121
HWY_BEFORE_NAMESPACE()
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:165
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)
Definition: arm_neon-inl.h:2385
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:107
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:114
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)
Definition: arm_neon-inl.h:196
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:177
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:99
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:127
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:5896
#define HWY_IF_FLOAT(T)
Definition: base.h:343
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
HWY_INLINE Mask128()
Definition: arm_neon-inl.h:809
Mask128(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition: arm_neon-inl.h:812
Raw raw
Definition: arm_neon-inl.h:814
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:806
Mask128 & operator=(const Mask128 &)=default
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: arm_neon-inl.h:783
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: arm_neon-inl.h:786
HWY_INLINE Vec128()
Definition: arm_neon-inl.h:764
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: arm_neon-inl.h:774
HWY_INLINE Vec128(const Raw raw)
Definition: arm_neon-inl.h:767
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: arm_neon-inl.h:789
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: arm_neon-inl.h:771
Vec128(const Vec128 &)=default
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: arm_neon-inl.h:777
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: arm_neon-inl.h:780
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:2039
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3345
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_INLINE Vec128< T, N > Set64(Simd< T, N, 0 >, uint64_t mask_bits)
Definition: arm_neon-inl.h:4952
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:3219
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition: arm_neon-inl.h:1884
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:3208
HWY_INLINE uint64_t NibblesFromMask(const Full128< T > d, Mask128< T > mask)
Definition: arm_neon-inl.h:5019
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition: x86_256-inl.h:4283
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition: arm_neon-inl.h:5325
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) template< size_t N > HWY_INLINE Vec128< uint8_t
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition: arm_neon-inl.h:1733
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:721
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1574
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2510
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2502
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:855
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition: arm_neon-inl.h:1388
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
constexpr HWY_API size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4189
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4164
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
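Usage sketch (editor's illustration): Iota fills ascending lanes starting from the given value, and GetLane reads lane 0. Hypothetical function name.

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

int32_t FirstLaneOfIota() {
  const hn::Full128<int32_t> d;
  const auto v = hn::Iota(d, 7);  // lanes: 7, 8, 9, 10
  return hn::GetLane(v);          // 7
}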
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
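Usage sketch (editor's illustration): Lt128 compares each pair of u64 lanes as one unsigned 128-bit number (lane 0 holding the low half), and combining it with IfThenElse reproduces Min128. Values and function name are hypothetical.

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec128<uint64_t> Smaller128() {
  const hn::Full128<uint64_t> d;
  alignas(16) const uint64_t a_lanes[2] = {5u, 1u};  // 2^64 + 5
  alignas(16) const uint64_t b_lanes[2] = {9u, 0u};  // 9
  const auto a = hn::Load(d, a_lanes);
  const auto b = hn::Load(d, b_lanes);
  // Lt128(d, a, b) is all-false here because a > b as 128-bit numbers,
  // so the select below returns b; hn::Min128(d, a, b) is equivalent.
  return hn::IfThenElse(hn::Lt128(d, a, b), a, b);
}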
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, 2 > ConcatEven(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4474
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
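Usage sketch (editor's illustration): BitCast reinterprets a vector's bits under a different lane type; Repartition (ops/shared-inl.h) gives a descriptor with the same total size. Hypothetical function name.

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec128<uint8_t> BytesOf(hn::Vec128<uint32_t> v32) {
  const hn::Repartition<uint8_t, hn::Full128<uint32_t>> d8;  // 16 x uint8_t
  return hn::BitCast(d8, v32);  // same 128 bits, viewed as bytes
}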
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
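Usage sketch (editor's illustration): LoadInterleaved3 splits packed RGB bytes into three planes; StoreInterleaved3 (listed further below) re-interleaves them. The buffers and function name are hypothetical; each call touches 48 bytes.

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void SwapRAndB(const uint8_t* HWY_RESTRICT rgb, uint8_t* HWY_RESTRICT out) {
  const hn::Full128<uint8_t> d;
  auto r = hn::Undefined(d);
  auto g = hn::Undefined(d);
  auto b = hn::Undefined(d);
  hn::LoadInterleaved3(d, rgb, r, g, b);   // reads R0 G0 B0 R1 G1 B1 ...
  hn::StoreInterleaved3(b, g, r, d, out);  // writes with R and B swapped
}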
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
HWY_API V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
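Usage sketch (editor's illustration): TableLookupBytes gathers bytes of its first operand at the byte indices held in the second. Hypothetical function name; all indices are in range, so TableLookupBytesOr0 is not needed here.

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec128<uint8_t> ReverseBytes(hn::Vec128<uint8_t> bytes) {
  const hn::Full128<uint8_t> d;
  alignas(16) static const uint8_t kRev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                               7,  6,  5,  4,  3,  2,  1, 0};
  const auto indices = hn::Load(d, kRev);
  return hn::TableLookupBytes(bytes, indices);  // result[i] = bytes[kRev[i]]
}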
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
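Usage sketch (editor's illustration): CompressStore left-packs the lanes selected by a mask and stores them, returning how many were kept; it may also write unspecified values to the rest of that vector-sized slot, which CompressBlendedStore (listed above) avoids. Function name is hypothetical.

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Stores the lanes of one vector of `in` that are below `limit`; returns the count.
size_t KeepBelow(const int32_t* HWY_RESTRICT in, int32_t limit,
                 int32_t* HWY_RESTRICT out) {
  const hn::Full128<int32_t> d;
  const auto v = hn::Load(d, in);
  const auto m = v < hn::Set(d, limit);
  return hn::CompressStore(v, m, d, out);
}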
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
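Usage sketch (editor's illustration): PromoteTo zero-extends eight uint8_t values to uint16_t lanes; DemoteTo (listed above) is the saturating narrowing counterpart. Hypothetical function name.

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec128<uint16_t> WidenBytes(const uint8_t* HWY_RESTRICT src) {
  const hn::Full64<uint8_t> d8;     // 8 x uint8_t (lower half vector)
  const hn::Full128<uint16_t> d16;  // 8 x uint16_t
  return hn::PromoteTo(d16, hn::Load(d8, src));
}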
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API Vec128< T, 2 > ConcatOdd(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4443
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
constexpr float MantissaEnd< float >()
Definition: base.h:636
double float64_t
Definition: base.h:258
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
float float32_t
Definition: base.h:257
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: arm_neon-inl.h:5318
Definition: arm_neon-inl.h:3883
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3884
Definition: ops/shared-inl.h:40
Definition: arm_neon-inl.h:823
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: arm_neon-inl.h:825
uint16x4_t type
Definition: arm_neon-inl.h:688
uint16x8_t type
Definition: arm_neon-inl.h:625
uint16x4_t type
Definition: arm_neon-inl.h:683
uint16x8_t type
Definition: arm_neon-inl.h:620
float32x2_t type
Definition: arm_neon-inl.h:693
float32x4_t type
Definition: arm_neon-inl.h:630
int16x4_t type
Definition: arm_neon-inl.h:668
int16x8_t type
Definition: arm_neon-inl.h:605
int32x2_t type
Definition: arm_neon-inl.h:673
int32x4_t type
Definition: arm_neon-inl.h:610
int64x1_t type
Definition: arm_neon-inl.h:678
int64x2_t type
Definition: arm_neon-inl.h:615
int8x16_t type
Definition: arm_neon-inl.h:600
int8x8_t type
Definition: arm_neon-inl.h:663
uint16x4_t type
Definition: arm_neon-inl.h:648
uint16x8_t type
Definition: arm_neon-inl.h:585
uint32x2_t type
Definition: arm_neon-inl.h:653
uint32x4_t type
Definition: arm_neon-inl.h:590
uint64x1_t type
Definition: arm_neon-inl.h:658
uint64x2_t type
Definition: arm_neon-inl.h:595
uint8x16_t type
Definition: arm_neon-inl.h:580
uint8x8_t type
Definition: arm_neon-inl.h:643
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:56
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3561
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3568
Definition: arm_neon-inl.h:3539
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition: arm_neon-inl.h:3542
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3549
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3591
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3598
Definition: arm_neon-inl.h:3574
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition: arm_neon-inl.h:3576
uint16x8x2_t raw
Definition: arm_neon-inl.h:346
uint16x4x2_t raw
Definition: arm_neon-inl.h:350
uint16x8x2_t raw
Definition: arm_neon-inl.h:338
uint16x4x2_t raw
Definition: arm_neon-inl.h:342
float32x4x2_t raw
Definition: arm_neon-inl.h:355
float32x2x2_t raw
Definition: arm_neon-inl.h:359
int16x8x2_t raw
Definition: arm_neon-inl.h:297
int16x4x2_t raw
Definition: arm_neon-inl.h:301
int32x4x2_t raw
Definition: arm_neon-inl.h:313
int32x2x2_t raw
Definition: arm_neon-inl.h:317
int64x2x2_t raw
Definition: arm_neon-inl.h:329
int64x1x2_t raw
Definition: arm_neon-inl.h:333
int8x16x2_t raw
Definition: arm_neon-inl.h:281
int8x8x2_t raw
Definition: arm_neon-inl.h:285
uint16x8x2_t raw
Definition: arm_neon-inl.h:289
uint16x4x2_t raw
Definition: arm_neon-inl.h:293
uint32x4x2_t raw
Definition: arm_neon-inl.h:305
uint32x2x2_t raw
Definition: arm_neon-inl.h:309
uint64x2x2_t raw
Definition: arm_neon-inl.h:321
uint64x1x2_t raw
Definition: arm_neon-inl.h:325
uint8x16x2_t raw
Definition: arm_neon-inl.h:273
uint8x8x2_t raw
Definition: arm_neon-inl.h:277
Definition: arm_neon-inl.h:265
uint16x8x3_t raw
Definition: arm_neon-inl.h:447
uint16x4x3_t raw
Definition: arm_neon-inl.h:451
uint16x8x3_t raw
Definition: arm_neon-inl.h:439
uint16x4x3_t raw
Definition: arm_neon-inl.h:443
float32x4x3_t raw
Definition: arm_neon-inl.h:456
float32x2x3_t raw
Definition: arm_neon-inl.h:460
int16x8x3_t raw
Definition: arm_neon-inl.h:398
int16x4x3_t raw
Definition: arm_neon-inl.h:402
int32x4x3_t raw
Definition: arm_neon-inl.h:414
int32x2x3_t raw
Definition: arm_neon-inl.h:418
int64x2x3_t raw
Definition: arm_neon-inl.h:430
int64x1x3_t raw
Definition: arm_neon-inl.h:434
int8x16x3_t raw
Definition: arm_neon-inl.h:382
int8x8x3_t raw
Definition: arm_neon-inl.h:386
uint16x8x3_t raw
Definition: arm_neon-inl.h:390
uint16x4x3_t raw
Definition: arm_neon-inl.h:394
uint32x4x3_t raw
Definition: arm_neon-inl.h:406
uint32x2x3_t raw
Definition: arm_neon-inl.h:410
uint64x2x3_t raw
Definition: arm_neon-inl.h:422
uint64x1x3_t raw
Definition: arm_neon-inl.h:426
uint8x16x3_t raw
Definition: arm_neon-inl.h:374
uint8x8x3_t raw
Definition: arm_neon-inl.h:378
Definition: arm_neon-inl.h:267
uint16x8x4_t raw
Definition: arm_neon-inl.h:548
uint16x4x4_t raw
Definition: arm_neon-inl.h:552
uint16x8x4_t raw
Definition: arm_neon-inl.h:540
uint16x4x4_t raw
Definition: arm_neon-inl.h:544
float32x4x4_t raw
Definition: arm_neon-inl.h:557
float32x2x4_t raw
Definition: arm_neon-inl.h:561
int16x8x4_t raw
Definition: arm_neon-inl.h:499
int16x4x4_t raw
Definition: arm_neon-inl.h:503
int32x4x4_t raw
Definition: arm_neon-inl.h:515
int32x2x4_t raw
Definition: arm_neon-inl.h:519
int64x2x4_t raw
Definition: arm_neon-inl.h:531
int64x1x4_t raw
Definition: arm_neon-inl.h:535
int8x16x4_t raw
Definition: arm_neon-inl.h:483
int8x8x4_t raw
Definition: arm_neon-inl.h:487
uint16x8x4_t raw
Definition: arm_neon-inl.h:491
uint16x4x4_t raw
Definition: arm_neon-inl.h:495
uint32x4x4_t raw
Definition: arm_neon-inl.h:507
uint32x2x4_t raw
Definition: arm_neon-inl.h:511
uint64x2x4_t raw
Definition: arm_neon-inl.h:523
uint64x1x4_t raw
Definition: arm_neon-inl.h:527
uint8x16x4_t raw
Definition: arm_neon-inl.h:475
uint8x8x4_t raw
Definition: arm_neon-inl.h:479
Definition: arm_neon-inl.h:269
Definition: base.h:358
Definition: base.h:251
Definition: base.h:246