// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ARM SVE[2] vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

// If running on hardware whose vector length is known to be a power of two, we
// can skip fixups for non-power of two sizes.
#undef HWY_SVE_IS_POW2
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#define HWY_SVE_IS_POW2 1
#else
#define HWY_SVE_IS_POW2 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// Unsigned:
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

// Signed:
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

// Float:
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

// For all element sizes:
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories for a given element size:
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)           \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories:
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Assemble types for use in x-macros
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t

}  // namespace detail

#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <>                                            \
  struct DFromV_t<HWY_SVE_V(BASE, BITS)> {               \
    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>;     \
  };

HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
#undef HWY_SPECIALIZE
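
// Example expansion (illustrative): for the u32 entry of HWY_SVE_FOREACH,
// HWY_SPECIALIZE(uint, u, 32, 16, _, _) generates:
//   template <>
//   struct DFromV_t<svuint32_t> {
//     using type = ScalableTag<uint32_t>;
//   };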

// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
// instructions, and we only use it when the predicate is ptrue anyway.

// vector = f(vector), e.g. Not
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);   \
  }
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)     \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v);                            \
  }

// vector = f(vector, scalar), e.g. detail::AddN
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }
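
// Example expansion (illustrative): HWY_SVE_FOREACH_U32(HWY_SVE_RETV_ARGPVV,
// Add, add) generates:
//   HWY_API svuint32_t Add(svuint32_t a, svuint32_t b) {
//     return svadd_u32_x(HWY_SVE_PTRUE(32), a, b);
//   }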

// ------------------------------ Lanes

namespace detail {

// Returns actual lanes of a hardware vector without rounding to a power of
// two.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntb_pat(SV_ALL);
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcnth_pat(SV_ALL);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntw_pat(SV_ALL);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntd_pat(SV_ALL);
}

// All-true mask from a macro
#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)

#if HWY_SVE_IS_POW2
#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
#else
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)

// Returns actual lanes of a hardware vector, rounded down to a power of two.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE size_t HardwareLanes() {
  return svcntb_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE size_t HardwareLanes() {
  return svcnth_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE size_t HardwareLanes() {
  return svcntw_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE size_t HardwareLanes() {
  return svcntd_pat(SV_POW2);
}

#endif  // HWY_SVE_IS_POW2

}  // namespace detail

// Returns actual number of lanes after capping by N and shifting. May return 0
// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
#if HWY_TARGET == HWY_SVE_256
template <typename T, size_t N, int kPow2>
HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
  return HWY_MIN(detail::ScaleByPower(32 / sizeof(T), kPow2), N);
}
#elif HWY_TARGET == HWY_SVE2_128
template <typename T, size_t N, int kPow2>
HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
  return HWY_MIN(detail::ScaleByPower(16 / sizeof(T), kPow2), N);
}
#else
template <typename T, size_t N, int kPow2>
HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
  const size_t actual = detail::HardwareLanes<T>();
  // Common case of full vectors: avoid any extra instructions.
  if (detail::IsFull(d)) return actual;
  return HWY_MIN(detail::ScaleByPower(actual, kPow2), N);
}
#endif  // HWY_TARGET
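
// Example (illustrative): on hardware with 512-bit vectors,
// Lanes(ScalableTag<float>()) == 16, Lanes(Half<ScalableTag<float>>()) == 8,
// and Lanes(Simd<float, 4, 0>()) == 4 because of the cap N = 4.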

// ================================================== MASK INIT

// One mask bit per byte; only the one belonging to the lowest byte is valid.

// ------------------------------ FirstN
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                               \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) {     \
    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit));  \
  }
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN

namespace detail {

#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {      \
    return HWY_SVE_PTRUE(BITS);                                         \
  }                                                                     \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return HWY_SVE_ALL_PTRUE(BITS);                                     \
  }

HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
#undef HWY_SVE_WRAP_PTRUE

HWY_API svbool_t PFalse() { return svpfalse_b(); }

// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
//
// This is used in functions that load/store memory; other functions (e.g.
// arithmetic) can ignore d and use PTrue instead.
template <class D>
svbool_t MakeMask(D d) {
  return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d));
}

}  // namespace detail
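
// Usage sketch (illustrative; in/out/count are hypothetical): FirstN enables
// predicated loops over arrays whose length is not a multiple of Lanes(d):
//   const ScalableTag<float> d;
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     const svbool_t m = FirstN(d, count - i);
//     const svfloat32_t v = MaskedLoad(m, d, in + i);
//     BlendedStore(v, m, d, out + i);
//   }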

// ================================================== INIT

// ------------------------------ Set
// vector = f(d, scalar), e.g. Set
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)                         \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) arg) {             \
    return sv##OP##_##CHAR##BITS(arg);                                        \
  }

HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
#undef HWY_SVE_SET

// Required for Zero and VFromD
template <size_t N, int kPow2>
svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// ------------------------------ Zero

template <class D>
HWY_API VFromD<D> Zero(D d) {
  return Set(d, 0);
}

// ------------------------------ Undefined

#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                            \
  HWY_API HWY_SVE_V(BASE, BITS)                             \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {       \
    return sv##OP##_##CHAR##BITS();                         \
  }

HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)

// ------------------------------ BitCast

namespace detail {

// u8: no change
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) {  \
    return v;                                                             \
  }                                                                       \
  template <size_t N, int kPow2>                                          \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte(                          \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return v;                                                             \
  }

// All other types
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)                        \
  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) {               \
    return sv##OP##_u8_##CHAR##BITS(v);                                       \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                            \
      BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \
    return sv##OP##_##CHAR##BITS##_u8(v);                                     \
  }

HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)

#undef HWY_SVE_CAST_NOP
#undef HWY_SVE_CAST

template <size_t N, int kPow2>
HWY_INLINE svuint16_t BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
                                      svuint8_t v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ================================================== LOGICAL

// detail::*N() functions accept a scalar argument to avoid extra Set().

// ------------------------------ Not
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not )

// ------------------------------ And

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, And(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Or

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Xor

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Xor(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ AndNot

namespace detail {
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n)
#undef HWY_SVE_RETV_ARGPVN_SWAP
}  // namespace detail

#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic)
#undef HWY_SVE_RETV_ARGPVV_SWAP

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V AndNot(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Or3
template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// Need to return original type instead of unsigned.
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)               \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {        \
    return BitCast(DFromV<decltype(v)>(),                              \
                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
  }
HWY_SVE_FOREACH(HWY_SVE_POPCNT, PopulationCount, cnt)
#undef HWY_SVE_POPCNT

// ================================================== SIGN

// ------------------------------ Neg
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg)

// ------------------------------ Abs
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)

// ------------------------------ CopySign[ToAbs]

template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(abs, And(msb, sign));
}
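
// Example (illustrative): CopySign(Set(d, 2.0f), Set(d, -1.0f)) yields -2.0f:
// the magnitude's sign bit is cleared, then the sign operand's bit is OR-ed in.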

// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n)
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add)

// ------------------------------ Sub

namespace detail {
// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)          \
  HWY_API HWY_SVE_V(BASE, BITS)                                             \
      NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_z(pg, a, b);                             \
  }

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n)
#undef HWY_SVE_RETV_ARGPVN_MASK
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub)

// ------------------------------ SumsOf8
HWY_API svuint64_t SumsOf8(const svuint8_t v) {
  const ScalableTag<uint32_t> du32;
  const ScalableTag<uint64_t> du64;
  const svbool_t pg = detail::PTrue(du64);

  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
  // Compute pairwise sum of u32 and extend to u64.
  // TODO(janwas): on SVE2, we can instead use svaddp.
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
  return Add(hi, lo);
}
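
// Example (illustrative): for bytes {1,2,3,4,5,6,7,8,...}, svdot produces u32
// sums of four: {10, 26, ...}; hi/lo then add the two u32 halves within each
// u64, giving {36, ...}, the sum of each group of 8 bytes.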

// ------------------------------ SaturatedAdd

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)

// ------------------------------ SaturatedSub

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)

// ------------------------------ AbsDiff
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)

// ------------------------------ ShiftLeft[Same]

#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)               \
  template <int kBits>                                                  \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {         \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);    \
  }                                                                     \
  HWY_API HWY_SVE_V(BASE, BITS)                                         \
      NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits);     \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)

// ------------------------------ ShiftRight[Same]

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

#undef HWY_SVE_SHIFT_N

// ------------------------------ RotateRight

// TODO(janwas): svxar on SVE2
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
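
// Example (illustrative): RotateRight<8>(Set(du32, 0x12345678u)) yields
// 0x78123456 in every u32 lane.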

// ------------------------------ Shl/r

#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)           \
  HWY_API HWY_SVE_V(BASE, BITS)                                   \
      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
    const RebindToUnsigned<DFromV<decltype(v)>> du;               \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v,      \
                                     BitCast(du, bits));          \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl)

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

#undef HWY_SVE_SHIFT

// ------------------------------ Min/Max

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
}  // namespace detail

// ------------------------------ Mul
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)

// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
namespace detail {
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
}  // namespace detail

// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
#if HWY_TARGET == HWY_SVE2
  return svqrdmulh_s16(a, b);
#else
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;

  const svuint16_t lo = BitCast(du, Mul(a, b));
  const svint16_t hi = MulHigh(a, b);
  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
  // carry that into the result. Instead isolate the top two bits because only
  // they can influence the result.
  const svuint16_t lo_top2 = ShiftRight<14>(lo);
  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
  return Add(Add(hi, hi), BitCast(d, rounding));
#endif
}
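
// Example (illustrative): a = b = 0x4000 (0.5 in Q15). Then a*b = 0x10000000,
// so hi = 0x1000 and lo = 0; rounding is 0 and the result is 2 * 0x1000 =
// 0x2000 (0.25 in Q15), matching (2 * a * b + 0x8000) >> 16.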

// ------------------------------ Div
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)

// ------------------------------ ApproximateReciprocal
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)

// ------------------------------ Sqrt
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

// ------------------------------ ApproximateReciprocalSqrt
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte)

// ------------------------------ MulAdd
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API HWY_SVE_V(BASE, BITS)                                         \
      NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x,          \
           HWY_SVE_V(BASE, BITS) add) {                                 \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
  }

HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulAdd, mad)

// ------------------------------ NegMulAdd
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulAdd, msb)

// ------------------------------ MulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb)

// ------------------------------ NegMulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad)

#undef HWY_SVE_FMA

// ------------------------------ Round etc.

HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz)

// ================================================== MASK

// ------------------------------ RebindMask
template <class D, typename MFrom>
HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
  return mask;
}

// ------------------------------ Mask logical

HWY_API svbool_t Not(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Per ARM, such bits are ignored.
  return svnot_b_z(HWY_SVE_PTRUE(8), m);
}
HWY_API svbool_t And(svbool_t a, svbool_t b) {
  return svand_b_z(b, b, a);  // same order as AndNot for consistency
}
HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
  return svbic_b_z(b, b, a);  // reversed order like NEON
}
HWY_API svbool_t Or(svbool_t a, svbool_t b) {
  return svsel_b(a, a, b);  // a ? true : b
}
HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
}

// ------------------------------ CountTrue

#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)           \
  template <size_t N, int kPow2>                                       \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
    return sv##OP##_b##BITS(detail::MakeMask(d), m);                   \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp)
#undef HWY_SVE_COUNT_TRUE

// For 16-bit Compress: full vector, not limited to SV_POW2.
namespace detail {

#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t N, int kPow2>                                             \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \
    return sv##OP##_b##BITS(svptrue_b##BITS(), m);                           \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
#undef HWY_SVE_COUNT_TRUE_FULL

}  // namespace detail

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, svbool_t m) {
  return !svptest_any(detail::MakeMask(d), m);
}

// ------------------------------ AllTrue
template <class D>
HWY_API bool AllTrue(D d, svbool_t m) {
  return CountTrue(d, m) == Lanes(d);
}

// ------------------------------ FindFirstTrue
template <class D>
HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
  return AllFalse(d, m) ? intptr_t{-1}
                        : static_cast<intptr_t>(
                              CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}
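
// Example (illustrative): if the first true lane of m is lane 2, svbrkb_b_z
// yields a mask of lanes {0, 1}; CountTrue then returns 2, the desired index.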

// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS)                                               \
      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(m, yes, no);                                 \
  }

HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
#undef HWY_SVE_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class V>
HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse
template <class V>
HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
  return IfThenElse(mask, Zero(DFromV<V>()), no);
}

// ================================================== COMPARE

// mask = f(vector, vector)
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
  }
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
  }

// ------------------------------ Eq
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n)
}  // namespace detail

// ------------------------------ Ne
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n)
}  // namespace detail

// ------------------------------ Lt
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n)
}  // namespace detail

// ------------------------------ Le
HWY_SVE_FOREACH_F(HWY_SVE_COMPARE, Le, cmple)

#undef HWY_SVE_COMPARE
#undef HWY_SVE_COMPARE_N

// ------------------------------ Gt/Ge (swapped order)
template <class V>
HWY_API svbool_t Gt(const V a, const V b) {
  return Lt(b, a);
}
template <class V>
HWY_API svbool_t Ge(const V a, const V b) {
  return Le(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API svbool_t TestBit(const V a, const V bit) {
  return detail::NeN(And(a, bit), 0);
}

// ------------------------------ MaskFromVec (Ne)
template <class V>
HWY_API svbool_t MaskFromVec(const V v) {
  return detail::NeN(v, static_cast<TFromV<V>>(0));
}

// ------------------------------ VecFromMask
template <class D>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  const RebindToSigned<D> di;
  // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
  // requires an extra instruction plus M0 pipeline.
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
}

// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)

#if HWY_TARGET == HWY_SVE2

#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)          \
  HWY_API HWY_SVE_V(BASE, BITS)                                   \
      NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
           HWY_SVE_V(BASE, BITS) no) {                            \
    return sv##OP##_##CHAR##BITS(yes, no, mask);                  \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
#undef HWY_SVE_IF_VEC

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
}

#else

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_TARGET == HWY_SVE2

// ------------------------------ Floating-point classification (Ne)

template <class V>
HWY_API svbool_t IsNaN(const V v) {
  return Ne(v, v);  // could also use cmpuo
}

template <class V>
HWY_API svbool_t IsInf(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
}

// Returns whether normal/subnormal/zero.
template <class V>
HWY_API svbool_t IsFinite(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
}
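
// Example (illustrative, f32): Inf has bits 0x7F800000; Add(vi, vi) doubles
// this to 0xFF000000, which equals hwy::MaxExponentTimes2<float>() when viewed
// as int32, so IsInf holds. Finite values double to a smaller exponent field.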

// ================================================== MEMORY

// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream

#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)     \
  template <size_t N, int kPow2>                           \
  HWY_API HWY_SVE_V(BASE, BITS)                            \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,              \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p);  \
  }

#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)   \
  template <size_t N, int kPow2>                                \
  HWY_API HWY_SVE_V(BASE, BITS)                                 \
      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {      \
    return sv##OP##_##CHAR##BITS(m, p);                         \
  }

#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                              \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,           \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {    \
    /* All-true predicate to load all 128 bits. */            \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p);        \
  }

#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)       \
  template <size_t N, int kPow2>                              \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                  \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,        \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v);         \
  }

#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                                \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m,        \
                    HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,    \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {   \
    sv##OP##_##CHAR##BITS(m, p, v);                             \
  }

HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDup128, ld1rq)
HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)

#undef HWY_SVE_LOAD
#undef HWY_SVE_MASKED_LOAD
#undef HWY_SVE_LOAD_DUP128
#undef HWY_SVE_STORE
#undef HWY_SVE_BLENDED_STORE

// BF16 is the same as svuint16_t because BF16 is optional before v8.6.
template <size_t N, int kPow2>
HWY_API svuint16_t Load(Simd<bfloat16_t, N, kPow2> d,
                        const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(svuint16_t v, Simd<bfloat16_t, N, kPow2> d,
                   bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ Load/StoreU

// SVE only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ------------------------------ ScatterOffset/Index

#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                             \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                                 \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                       \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,               \
                    HWY_SVE_V(int, BITS) offset) {                           \
    sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
                                          v);                                \
  }

#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                               \
  HWY_API void NAME(                                                           \
      HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d,              \
      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
    sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_INDEX, ScatterIndex, st1_scatter)
#undef HWY_SVE_SCATTER_OFFSET
#undef HWY_SVE_SCATTER_INDEX

// ------------------------------ GatherOffset/Index

#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                            \
  HWY_API HWY_SVE_V(BASE, BITS)                                             \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                               \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                 \
           HWY_SVE_V(int, BITS) offset) {                                   \
    return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
                                                 offset);                   \
  }
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                           \
  HWY_API HWY_SVE_V(BASE, BITS)                                            \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                              \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                \
           HWY_SVE_V(int, BITS) index) {                                   \
    return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
                                                index);                    \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather)
#undef HWY_SVE_GATHER_OFFSET
#undef HWY_SVE_GATHER_INDEX

// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
    const sv##BASE##BITS##x2_t tuple =                                        \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
    v0 = svget2(tuple, 0);                                                    \
    v1 = svget2(tuple, 1);                                                    \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)

#undef HWY_SVE_LOAD2

// ------------------------------ LoadInterleaved3

#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)                     \
  template <size_t N, int kPow2>                                            \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                      \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,   \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
                    HWY_SVE_V(BASE, BITS) & v2) {                           \
    const sv##BASE##BITS##x3_t tuple =                                      \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);              \
    v0 = svget3(tuple, 0);                                                  \
    v1 = svget3(tuple, 1);                                                  \
    v2 = svget3(tuple, 2);                                                  \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)

#undef HWY_SVE_LOAD3

// ------------------------------ LoadInterleaved4

#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1,   \
                    HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
    const sv##BASE##BITS##x4_t tuple =                                        \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
    v0 = svget4(tuple, 0);                                                    \
    v1 = svget4(tuple, 1);                                                    \
    v2 = svget4(tuple, 2);                                                    \
    v3 = svget4(tuple, 3);                                                    \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)

#undef HWY_SVE_LOAD4

// ------------------------------ StoreInterleaved2

#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  template <size_t N, int kPow2>                                         \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,  \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                   \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {    \
    const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple);        \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)

#undef HWY_SVE_STORE2

// ------------------------------ StoreInterleaved3

#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)                      \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,       \
                    HWY_SVE_V(BASE, BITS) v2,                                 \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {         \
    const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple);            \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)

#undef HWY_SVE_STORE3

// ------------------------------ StoreInterleaved4

#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                        \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                  \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {   \
    const sv##BASE##BITS##x4_t quad =                                   \
        svcreate4##_##CHAR##BITS(v0, v1, v2, v3);                       \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad);        \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)

#undef HWY_SVE_STORE4

// ================================================== CONVERT

// ------------------------------ PromoteTo

// Same sign
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                            \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(                                       \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \
    return sv##OP##_##CHAR##BITS(v);                                        \
  }

HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)

// 2x
template <size_t N, int kPow2>
HWY_API svuint32_t PromoteTo(Simd<uint32_t, N, kPow2> dto, svuint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}

// Sign change
template <size_t N, int kPow2>
HWY_API svint16_t PromoteTo(Simd<int16_t, N, kPow2> dto, svuint8_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint16_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
  const Repartition<uint16_t, DFromV<decltype(vfrom)>> du16;
  const Repartition<int16_t, decltype(du16)> di16;
  return PromoteTo(dto, BitCast(di16, PromoteTo(du16, vfrom)));
}

// ------------------------------ PromoteTo F

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
}  // namespace detail

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
                              const svfloat16_t v) {
  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
  // first replicate each lane once.
  const svfloat16_t vv = detail::ZipLower(v, v);
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}
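
// Example (illustrative): f16 lanes {a,b,c,d,...} become {a,a,b,b,...} after
// ZipLower; svcvt reads the even members {a,b,...}, so the f32 result holds
// the converted {a,b,...} in its lower lanes as Highway requires.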

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svfloat32_t v) {
  const svfloat32_t vv = detail::ZipLower(v, v);
  return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svint32_t v) {
  const svint32_t vv = detail::ZipLower(v, v);
  return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}

// For 16-bit Compress
namespace detail {
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
#undef HWY_SVE_PROMOTE_TO

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
  const RebindToUnsigned<decltype(df)> du;
  const RepartitionToNarrow<decltype(du)> dn;
  return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
}

}  // namespace detail

// ------------------------------ DemoteTo U

namespace detail {

// Saturates unsigned vectors to half/quarter-width TN.
template <typename TN, class VU>
VU SaturateU(VU v) {
  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
}

// Saturates signed vectors to half/quarter-width TN.
template <typename TN, class VI>
VI SaturateI(VI v) {
  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
}

}  // namespace detail

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint16_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const RepartitionToNarrow<decltype(du)> d2;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and quarter the width.
  const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
  const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
  return svuzp1_u8(x2, x2);
}

HWY_API svuint8_t U8FromU32(const svuint32_t v) {
  const DFromV<svuint32_t> du32;
  const RepartitionToNarrow<decltype(du32)> du16;
  const RepartitionToNarrow<decltype(du16)> du8;

  const svuint16_t cast16 = BitCast(du16, v);
  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
  const svuint8_t cast8 = BitCast(du8, x2);
  return svuzp1_u8(cast8, cast8);
}

// ------------------------------ DemoteTo I

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint16_t v) {
#if HWY_TARGET == HWY_SVE2
  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint32_t v) {
#if HWY_TARGET == HWY_SVE2
  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint32_t v) {
  const RepartitionToWide<decltype(dn)> d2;
#if HWY_TARGET == HWY_SVE2
  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
#endif
  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
  return BitCast(dn, svuzp1_s8(v2, v2));
}

// ------------------------------ ConcatEven/ConcatOdd

// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
// full vector length, not rounded down to a power of two as we require).
namespace detail {

#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) {      \
    return sv##OP##_##CHAR##BITS(lo, hi);                             \
  }
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
#endif
#undef HWY_SVE_CONCAT_EVERY_SECOND

// Used to slide up / shift whole register left; mask indicates which range
// to take from lo, and the rest is filled from hi starting at its lowest.
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(                                      \
      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
    return sv##OP##_##CHAR##BITS(mask, lo, hi);                            \
  }
HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
#undef HWY_SVE_SPLICE

}  // namespace detail

template <class D>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
  (void)d;
  return detail::ConcatOdd(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}

template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
  (void)d;
  return detail::ConcatEven(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}
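
// Example (illustrative): with 4 lanes, ConcatOdd(d, {h0,h1,h2,h3},
// {l0,l1,l2,l3}) yields {l1,l3,h1,h3} and ConcatEven yields {l0,l2,h0,h2}.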

// ------------------------------ DemoteTo F

template <size_t N, int kPow2>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
  const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
  const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
  return detail::ConcatOdd(in_even, in_even);  // can ignore upper half of vec
}

template <size_t N, int kPow2>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
  const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
  const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

// ------------------------------ ConvertTo F

#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)                     \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS)                                               \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
  }                                                                           \
  /* Truncates (rounds toward zero). */                                       \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(int, BITS)                                                \
      NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
  }

// API only requires f32 but we provide f64 for use by Iota.
HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
#undef HWY_SVE_CONVERT

// ------------------------------ NearestInt (Round, ConvertTo)
template <class VF, class DI = RebindToSigned<DFromV<VF>>>
HWY_API VFromD<DI> NearestInt(VF v) {
  // No single instruction, round then truncate.
  return ConvertTo(DI(), Round(v));
}

// ------------------------------ Iota (Add, ConvertTo)

#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)                        \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) first) {           \
    return sv##OP##_##CHAR##BITS(first, 1);                                   \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
#undef HWY_SVE_IOTA

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
  const RebindToSigned<D> di;
  return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
}

// ------------------------------ InterleaveLower

template <class D, class V>
HWY_API V InterleaveLower(D d, const V a, const V b) {
  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
#if HWY_TARGET == HWY_SVE2_128
  (void)d;
  return detail::ZipLower(a, b);
#else
  // Move lower halves of blocks to lower half of vector.
  const Repartition<uint64_t, decltype(d)> d64;
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);
  const auto a_blocks = detail::ConcatEven(a64, a64);  // only lower half needed
  const auto b_blocks = detail::ConcatEven(b64, b64);
  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}

template <class V>
HWY_API V InterleaveLower(const V a, const V b) {
  return InterleaveLower(DFromV<V>(), a, b);
}
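
// Example (illustrative): within each 128-bit block,
// InterleaveLower(d, {a0,a1,a2,a3}, {b0,b1,b2,b3}) yields {a0,b0,a1,b1}.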

// ------------------------------ InterleaveUpper

// Only use zip2 if vectors are a power of two, otherwise getting the actual
// "upper half" requires MaskUpperHalf.
#if HWY_TARGET == HWY_SVE2_128
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpper, zip2)
}  // namespace detail
#endif

// Full vector: guaranteed to have at least one block
template <class D, class V = VFromD<D>,
          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
HWY_API V InterleaveUpper(D d, const V a, const V b) {
#if HWY_TARGET == HWY_SVE2_128
  (void)d;
  return detail::ZipUpper(a, b);
#else
  // Move upper halves of blocks to lower half of vector.
  const Repartition<uint64_t, decltype(d)> d64;
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);
  const auto a_blocks = detail::ConcatOdd(a64, a64);  // only lower half needed
  const auto b_blocks = detail::ConcatOdd(b64, b64);
  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}

// Capped/fraction: need runtime check
template <class D, class V = VFromD<D>,
          hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
HWY_API V InterleaveUpper(D d, const V a, const V b) {
  // Less than one block: treat as capped
  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
    const Half<decltype(d)> d2;
    return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
  }
  return InterleaveUpper(DFromV<V>(), a, b);
}

// ================================================== COMBINE

namespace detail {

#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 32:
      return svptrue_pat_b8(SV_VL16);
    case 16:
      return svptrue_pat_b8(SV_VL8);
    case 8:
      return svptrue_pat_b8(SV_VL4);
    case 4:
      return svptrue_pat_b8(SV_VL2);
    default:
      return svptrue_pat_b8(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 16:
      return svptrue_pat_b16(SV_VL8);
    case 8:
      return svptrue_pat_b16(SV_VL4);
    case 4:
      return svptrue_pat_b16(SV_VL2);
    default:
      return svptrue_pat_b16(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 8:
      return svptrue_pat_b32(SV_VL4);
    case 4:
      return svptrue_pat_b32(SV_VL2);
    default:
      return svptrue_pat_b32(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 4:
      return svptrue_pat_b64(SV_VL2);
    default:
      return svptrue_pat_b64(SV_VL1);
  }
}
#endif
#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 16:
      return svptrue_pat_b8(SV_VL8);
    case 8:
      return svptrue_pat_b8(SV_VL4);
    case 4:
      return svptrue_pat_b8(SV_VL2);
    case 2:
    case 1:
    default:
      return svptrue_pat_b8(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 8:
      return svptrue_pat_b16(SV_VL4);
    case 4:
      return svptrue_pat_b16(SV_VL2);
    case 2:
    case 1:
    default:
      return svptrue_pat_b16(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
svbool_t MaskLowerHalf(D d) {
  return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
svbool_t MaskLowerHalf(D /*d*/) {
  return svptrue_pat_b64(SV_VL1);
}
#endif  // HWY_TARGET == HWY_SVE2_128
#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
template <class D>
svbool_t MaskLowerHalf(D d) {
  return FirstN(d, Lanes(d) / 2);
}
#endif

template <class D>
svbool_t MaskUpperHalf(D d) {
  // TODO(janwas): WHILEGE on pow2 SVE2
  if (HWY_SVE_IS_POW2 && IsFull(d)) {
    return Not(MaskLowerHalf(d));
  }

  // For Splice to work as intended, make sure bits above Lanes(d) are zero.
  return AndNot(MaskLowerHalf(d), detail::MakeMask(d));
}

// Right-shift vector pair by constexpr; can be used to slide down (=N) or up
// (=Lanes()-N).
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t kIndex>                                       \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
    return sv##OP##_##CHAR##BITS(lo, hi, kIndex);                \
  }
HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext)
#undef HWY_SVE_EXT
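
// Example (illustrative): with four u64 lanes, detail::Ext<1>(hi, lo) yields
// {lo[1], lo[2], lo[3], hi[0]}.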
1671 
1672 } // namespace detail
1673 
1674 // ------------------------------ ConcatUpperLower
1675 template <class D, class V>
1676 HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
1677  return IfThenElse(detail::MaskLowerHalf(d), lo, hi);
1678 }
1679 
1680 // ------------------------------ ConcatLowerLower
1681 template <class D, class V>
1682 HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
1683  if (detail::IsFull(d)) {
1684 #if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
1685  return detail::ConcatEvenBlocks(hi, lo);
1686 #endif
1687 #if HWY_TARGET == HWY_SVE2_128
1688  const Repartition<uint64_t, D> du64;
1689  const auto lo64 = BitCast(du64, lo);
1690  return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi)));
1691 #endif
1692  }
1693  return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
1694 }
1695 
1696 // ------------------------------ ConcatLowerUpper
1697 template <class D, class V>
1698 HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
1699 #if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
1700  if (detail::IsFull(d)) {
1701  return detail::Ext<Lanes(d) / 2>(hi, lo);
1702  }
1703 #endif
1704  return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
1705 }
1706 
1707 // ------------------------------ ConcatUpperUpper
1708 template <class D, class V>
1709 HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
1710  if (detail::IsFull(d)) {
1711 #if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
1712  return detail::ConcatOddBlocks(hi, lo);
1713 #endif
1714 #if HWY_TARGET == HWY_SVE2_128
1715  const Repartition<uint64_t, D> du64;
1716  const auto lo64 = BitCast(du64, lo);
1717  return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi)));
1718 #endif
1719  }
1720  const svbool_t mask_upper = detail::MaskUpperHalf(d);
1721  const V lo_upper = detail::Splice(lo, lo, mask_upper);
1722  return IfThenElse(mask_upper, hi, lo_upper);
1723 }
1724 
1725 // ------------------------------ Combine
1726 template <class D, class V2>
1727 HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) {
1728  return ConcatLowerLower(d, hi, lo);
1729 }
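// Usage sketch (assumptions: compiled inside an open HWY_NAMESPACE with this
// header in scope, `in` holds at least Lanes(d) floats; the helper name is
// illustrative, not part of the API):
HWY_INLINE svfloat32_t CombineHalvesExample(const float* HWY_RESTRICT in) {
  const ScalableTag<float> d;   // full vector
  const Half<decltype(d)> dh;   // half-length descriptor, same vector type
  const svfloat32_t lo = LoadU(dh, in);
  const svfloat32_t hi = LoadU(dh, in + Lanes(dh));
  return Combine(d, hi, lo);    // lo fills the lower half, hi the upper
}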
1730 
1731 // ------------------------------ ZeroExtendVector
1732 template <class D, class V>
1733 HWY_API V ZeroExtendVector(const D d, const V lo) {
1734  return Combine(d, Zero(Half<D>()), lo);
1735 }
1736 
1737 // ------------------------------ Lower/UpperHalf
1738 
1739 template <class D2, class V>
1740 HWY_API V LowerHalf(D2 /* tag */, const V v) {
1741  return v;
1742 }
1743 
1744 template <class V>
1745 HWY_API V LowerHalf(const V v) {
1746  return v;
1747 }
1748 
1749 template <class D2, class V>
1750 HWY_API V UpperHalf(const D2 d2, const V v) {
1751 #if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
1752  return detail::Ext<Lanes(d2)>(v, v);
1753 #else
1754  return detail::Splice(v, v, detail::MaskUpperHalf(Twice<decltype(d2)>()));
1755 #endif
1756 }
1757 
1758 // ================================================== REDUCE
1759 
1760 // These return T, whereas the Highway op returns a broadcasted vector.
1761 namespace detail {
1762 #define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
1763  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
1764  return sv##OP##_##CHAR##BITS(pg, v); \
1765  }
1766 
1767 HWY_SVE_FOREACH(HWY_SVE_REDUCE, SumOfLanes, addv)
1768 HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
1769 HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
1770 // NaN if all are
1771 HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
1772 HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
1773 
1774 #undef HWY_SVE_REDUCE
1775 } // namespace detail
1776 
1777 template <class D, class V>
1778 V SumOfLanes(D d, V v) {
1779  return Set(d, detail::SumOfLanes(detail::MakeMask(d), v));
1780 }
1781 
1782 template <class D, class V>
1783 V MinOfLanes(D d, V v) {
1784  return Set(d, detail::MinOfLanes(detail::MakeMask(d), v));
1785 }
1786 
1787 template <class D, class V>
1788 V MaxOfLanes(D d, V v) {
1789  return Set(d, detail::MaxOfLanes(detail::MakeMask(d), v));
1790 }
1791 
1792 
1793 // ================================================== SWIZZLE
1794 
1795 // ------------------------------ GetLane
1796 
1797 namespace detail {
1798 #define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
1799  HWY_INLINE HWY_SVE_T(BASE, BITS) \
1800  NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
1801  return sv##OP##_##CHAR##BITS(mask, v); \
1802  }
1803 
1804 HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
1805 #undef HWY_SVE_GET_LANE
1806 } // namespace detail
1807 
1808 template <class V>
1809 HWY_API TFromV<V> GetLane(V v) {
1810  return detail::GetLane(v, detail::PFalse());
1811 }
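// Usage sketch combining the reductions with GetLane (assumptions: inside an
// open HWY_NAMESPACE, n is a multiple of Lanes(d); helper name illustrative):
HWY_INLINE float SumArrayExample(const float* HWY_RESTRICT p, size_t n) {
  const ScalableTag<float> d;
  svfloat32_t acc = Zero(d);
  for (size_t i = 0; i < n; i += Lanes(d)) {
    acc = Add(acc, LoadU(d, p + i));  // vertical (per-lane) partial sums
  }
  // SumOfLanes broadcasts the horizontal total; GetLane extracts lane 0.
  return GetLane(SumOfLanes(d, acc));
}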
1812 
1813 // ------------------------------ ExtractLane
1814 template <class V>
1815 HWY_API TFromV<V> ExtractLane(V v, size_t i) {
1816  return detail::GetLane(v, FirstN(DFromV<V>(), i));
1817 }
1818 
1819 // ------------------------------ InsertLane (IfThenElse)
1820 template <class V>
1821 HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
1822  const DFromV<V> d;
1823  const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i));
1824  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
1825 }
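// Worked example (illustrative): v = Iota(d, 0) = {0,1,2,3,..}. Then
// InsertLane(v, 2, 42) = {0,1,42,3,..}: only the lane where is_i holds takes
// Set(d, t), all others keep v, and ExtractLane(v, 2) afterwards yields 42.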
1826 
1827 // ------------------------------ DupEven
1828 
1829 namespace detail {
1830 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1)
1831 } // namespace detail
1832 
1833 template <class V>
1834 HWY_API V DupEven(const V v) {
1835  return detail::InterleaveEven(v, v);
1836 }
1837 
1838 // ------------------------------ DupOdd
1839 
1840 namespace detail {
1841 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2)
1842 } // namespace detail
1843 
1844 template <class V>
1845 HWY_API V DupOdd(const V v) {
1846  return detail::InterleaveOdd(v, v);
1847 }
1848 
1849 // ------------------------------ OddEven
1850 
1851 #if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE2
1852 
1853 #define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
1854  HWY_API HWY_SVE_V(BASE, BITS) \
1855  NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
1856  return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \
1857  }
1858 
1859 HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n)
1860 #undef HWY_SVE_ODD_EVEN
1861 
1862 template <class V, HWY_IF_FLOAT_V(V)>
1863 HWY_API V OddEven(const V odd, const V even) {
1864  const DFromV<V> d;
1865  const RebindToUnsigned<decltype(d)> du;
1866  return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even)));
1867 }
1868 
1869 #else
1870 
1871 template <class V>
1872 HWY_API V OddEven(const V odd, const V even) {
1873  const auto odd_in_even = detail::Ext<1>(odd, odd);
1874  return detail::InterleaveEven(even, odd_in_even);
1875 }
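// Worked example (illustrative, 4 lanes): odd = {o0,o1,o2,o3},
// even = {e0,e1,e2,e3}. odd_in_even = Ext<1>(odd, odd) = {o1,o2,o3,o0}, and
// InterleaveEven(even, odd_in_even) = {e0,o1,e2,o3}: even lanes from `even`,
// odd lanes from `odd`, as required.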
1876 
1877 #endif // HWY_TARGET
1878 
1879 // ------------------------------ OddEvenBlocks
1880 template <class V>
1881 HWY_API V OddEvenBlocks(const V odd, const V even) {
1882  const DFromV<V> d;
1883 #if HWY_TARGET == HWY_SVE_256
1884  return ConcatUpperLower(d, odd, even);
1885 #elif HWY_TARGET == HWY_SVE2_128
1886  (void)odd;
1887  (void)d;
1888  return even;
1889 #else
1890  const RebindToUnsigned<decltype(d)> du;
1891  using TU = TFromD<decltype(du)>;
1892  constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
1893  const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
1894  const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
1895  const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
1896  return IfThenElse(is_even, even, odd);
1897 #endif
1898 }
1899 
1900 // ------------------------------ TableLookupLanes
1901 
1902 template <class D, class VI>
1903 HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
1904  using TI = TFromV<VI>;
1905  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch");
1906  const RebindToUnsigned<D> du;
1907  const auto indices = BitCast(du, vec);
1908 #if HWY_IS_DEBUG_BUILD
1909  HWY_DASSERT(AllTrue(du, detail::LtN(indices, static_cast<TI>(Lanes(d)))));
1910 #else
1911  (void)d;
1912 #endif
1913  return indices;
1914 }
1915 
1916 template <class D, typename TI>
1917 HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
1918  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
1919  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
1920 }
1921 
1922 // Tables for <32-bit lanes are not part of the Highway API; used in Broadcast.
1923 #define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
1924  HWY_API HWY_SVE_V(BASE, BITS) \
1925  NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
1926  return sv##OP##_##CHAR##BITS(v, idx); \
1927  }
1928 
1929 HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
1930 #undef HWY_SVE_TABLE
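// Usage sketch (assumptions: inside an open HWY_NAMESPACE; helper name
// illustrative). Swaps adjacent lanes via computed indices; Lanes(d) is even
// because SVE vectors are a multiple of 128 bits, so iota^1 stays in range.
HWY_INLINE svuint32_t SwapPairsExample(svuint32_t v) {
  const ScalableTag<uint32_t> d;
  const svuint32_t idx = IndicesFromVec(d, Xor(Iota(d, 0), Set(d, 1u)));
  return TableLookupLanes(v, idx);
}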
1931 
1932 // ------------------------------ SwapAdjacentBlocks (TableLookupLanes)
1933 
1934 namespace detail {
1935 
1936 template <typename T, size_t N, int kPow2>
1937 constexpr size_t LanesPerBlock(Simd<T, N, kPow2> /* tag */) {
1938  // We might have a capped vector smaller than a block, so honor that.
1939  return HWY_MIN(16 / sizeof(T), detail::ScaleByPower(N, kPow2));
1940 }
1941 
1942 } // namespace detail
1943 
1944 template <class V>
1945 HWY_API V SwapAdjacentBlocks(const V v) {
1946  const DFromV<V> d;
1947 #if HWY_TARGET == HWY_SVE_256
1948  return ConcatLowerUpper(d, v, v);
1949 #elif HWY_TARGET == HWY_SVE2_128
1950  (void)d;
1951  return v;
1952 #else
1953  const RebindToUnsigned<decltype(d)> du;
1954  constexpr auto kLanesPerBlock =
1955  static_cast<TFromD<decltype(du)>>(detail::LanesPerBlock(d));
1956  const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
1957  return TableLookupLanes(v, idx);
1958 #endif
1959 }
1960 
1961 // ------------------------------ Reverse
1962 
1963 namespace detail {
1964 
1965 #define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
1966  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1967  return sv##OP##_##CHAR##BITS(v); \
1968  }
1969 
1970 HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
1971 #undef HWY_SVE_REVERSE
1972 
1973 } // namespace detail
1974 
1975 template <class D, class V>
1976 HWY_API V Reverse(D d, V v) {
1977  using T = TFromD<D>;
1978  const auto reversed = detail::ReverseFull(v);
1979  if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed;
1980  // Shift right to remove extra (non-pow2 and remainder) lanes.
1981  // TODO(janwas): on SVE2, use WHILEGE.
1982  // Avoids FirstN truncating to the return vector size. Must also avoid Not
1983  // because that is limited to SV_POW2.
1984  const ScalableTag<T> dfull;
1985  const svbool_t all_true = detail::AllPTrue(dfull);
1986  const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
1987  const svbool_t mask =
1988  svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d)));
1989  return detail::Splice(reversed, reversed, mask);
1990 }
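// Worked example (illustrative): hardware vector = 4 u64 lanes but
// Lanes(d) == 2. ReverseFull gives {?, ?, v1, v0} (lanes 2..3 of the input
// are don't-care). The mask selects hardware lanes 2..3, so Splice moves
// {v1, v0} down into lanes 0..1, which is the reversed result.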
1991 
1992 // ------------------------------ Reverse2
1993 
1994 template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
1995 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
1996  const RebindToUnsigned<decltype(d)> du;
1997  const RepartitionToWide<decltype(du)> dw;
1998  return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v)));
1999 }
2000 
2001 template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
2002 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2003  const RebindToUnsigned<decltype(d)> du;
2004  const RepartitionToWide<decltype(du)> dw;
2005  return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v)));
2006 }
2007 
2008 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
2009 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
2010 #if HWY_TARGET == HWY_SVE2_128
2011  if (detail::IsFull(d)) {
2012  return detail::Ext<1>(v, v);
2013  }
2014 #endif
2015  (void)d;
2016  const auto odd_in_even = detail::Ext<1>(v, v); // x321
2017  return detail::InterleaveEven(odd_in_even, v); // 2301
2018 }
2019 // ------------------------------ Reverse4 (TableLookupLanes)
2020 template <class D>
2021 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2022  if (HWY_TARGET == HWY_SVE_256 && sizeof(TFromD<D>) == 8 &&
2023  detail::IsFull(d)) {
2024  return detail::ReverseFull(v);
2025  }
2026  // TODO(janwas): is this approach faster than Shuffle0123?
2027  const RebindToUnsigned<decltype(d)> du;
2028  const auto idx = detail::XorN(Iota(du, 0), 3);
2029  return TableLookupLanes(v, idx);
2030 }
2031 
2032 // ------------------------------ Reverse8 (TableLookupLanes)
2033 template <class D>
2034 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2035  const RebindToUnsigned<decltype(d)> du;
2036  const auto idx = detail::XorN(Iota(du, 0), 7);
2037  return TableLookupLanes(v, idx);
2038 }
2039 
2040 // ------------------------------ Compress (PromoteTo)
2041 
2042 template <typename T>
2043 struct CompressIsPartition {
2044 #if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2045  // Optimization for 64-bit lanes (could also be applied to 32-bit, but that
2046  // requires a larger table).
2047  enum { value = (sizeof(T) == 8) };
2048 #else
2049  enum { value = 0 };
2050 #endif // HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2051 };
2052 
2053 #define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
2054  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
2055  return sv##OP##_##CHAR##BITS(mask, v); \
2056  }
2057 
2058 #if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2059 HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact)
2060 HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact)
2061 #else
2062 HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
2063 #endif
2064 #undef HWY_SVE_COMPRESS
2065 
2066 #if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2067 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2068 HWY_API V Compress(V v, svbool_t mask) {
2069  const DFromV<V> d;
2070  const RebindToUnsigned<decltype(d)> du64;
2071 
2072  // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
2073  // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
2074  // SetTableIndices.
2075  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
2076  const size_t offset = detail::SumOfLanes(mask, bits);
2077 
2078  // See CompressIsPartition.
2079  alignas(16) static constexpr uint64_t table[4 * 16] = {
2080  // PrintCompress64x4Tables
2081  0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
2082  1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
2083  0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
2084  return TableLookupLanes(v, SetTableIndices(d, table + offset));
2085 }
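// Worked example (illustrative): mask = {1,0,1,0} activates weights {4,16},
// so offset = 20 = 4 * 0b0101, i.e. row 5 of the table = {0,2,1,3}. The two
// selected lanes (0 and 2) move to the front; the rest follow (a partition).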
2086 #endif // HWY_TARGET == HWY_SVE_256
2087 #if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2088 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2089 HWY_API V Compress(V v, svbool_t mask) {
2090  // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
2091  // swaps upper/lower (the lower half is set to the upper half, and the
2092  // remaining upper half is filled from the lower half of the second v), and
2093  // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10
2094  // unchanged and map everything else to 00.
2095  const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
2096  return detail::Splice(v, v, AndNot(maskLL, mask));
2097 }
2098 #endif // HWY_TARGET == HWY_SVE2_128
2099 
2100 template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
2101 HWY_API V Compress(V v, svbool_t mask16) {
2102  static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
2103  const DFromV<V> d16;
2104 
2105  // Promote vector and mask to 32-bit
2106  const RepartitionToWide<decltype(d16)> dw;
2107  const auto v32L = PromoteTo(dw, v);
2108  const auto v32H = detail::PromoteUpperTo(dw, v);
2109  const svbool_t mask32L = svunpklo_b(mask16);
2110  const svbool_t mask32H = svunpkhi_b(mask16);
2111 
2112  const auto compressedL = Compress(v32L, mask32L);
2113  const auto compressedH = Compress(v32H, mask32H);
2114 
2115  // Demote to 16-bit (already in range) - separately so we can splice
2116  const V evenL = BitCast(d16, compressedL);
2117  const V evenH = BitCast(d16, compressedH);
2118  const V v16L = detail::ConcatEven(evenL, evenL); // only lower half needed
2119  const V v16H = detail::ConcatEven(evenH, evenH);
2120 
2121  // We need to combine two vectors of non-constexpr length, so the only option
2122  // is Splice, which requires us to synthesize a mask. NOTE: this function uses
2123  // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt.
2124  const size_t countL = detail::CountTrueFull(dw, mask32L);
2125  const auto compressed_maskL = FirstN(d16, countL);
2126  return detail::Splice(v16H, v16L, compressed_maskL);
2127 }
2128 
2129 // Must treat float16_t as integers so we can ConcatEven.
2130 HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
2131  const DFromV<decltype(v)> df;
2132  const RebindToSigned<decltype(df)> di;
2133  return BitCast(df, Compress(BitCast(di, v), mask16));
2134 }
2135 
2136 // ------------------------------ CompressNot
2137 
2138 template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
2139 HWY_API V CompressNot(V v, const svbool_t mask) {
2140  return Compress(v, Not(mask));
2141 }
2142 
2143 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2144 HWY_API V CompressNot(V v, svbool_t mask) {
2145 #if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2146  // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
2147  // swaps upper/lower (the lower half is set to the upper half, and the
2148  // remaining upper half is filled from the lower half of the second v), and
2149  // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map
2150  // 01 to 10, and everything else to 00.
2151  const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
2152  return detail::Splice(v, v, AndNot(mask, maskLL));
2153 #endif
2154 #if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2155  const DFromV<V> d;
2156  const RebindToUnsigned<decltype(d)> du64;
2157 
2158  // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
2159  // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
2160  // SetTableIndices.
2161  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
2162  const size_t offset = detail::SumOfLanes(mask, bits);
2163 
2164  // See CompressIsPartition.
2165  alignas(16) static constexpr uint64_t table[4 * 16] = {
2166  // PrintCompressNot64x4Tables
2167  0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
2168  0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
2169  2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
2170  return TableLookupLanes(v, SetTableIndices(d, table + offset));
2171 #endif // HWY_TARGET == HWY_SVE_256
2172 
2173  return Compress(v, Not(mask));
2174 }
2175 
2176 // ------------------------------ CompressBlocksNot
2177 HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
2178 #if HWY_TARGET == HWY_SVE2_128
2179  (void)mask;
2180  return v;
2181 #endif
2182  return CompressNot(v, mask);
2183 }
2184 
2185 // ------------------------------ CompressStore
2186 template <class V, class D>
2187 HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d,
2188  TFromD<D>* HWY_RESTRICT unaligned) {
2189  StoreU(Compress(v, mask), d, unaligned);
2190  return CountTrue(d, mask);
2191 }
2192 
2193 // ------------------------------ CompressBlendedStore
2194 template <class V, class D>
2195 HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d,
2196  TFromD<D>* HWY_RESTRICT unaligned) {
2197  const size_t count = CountTrue(d, mask);
2198  const svbool_t store_mask = FirstN(d, count);
2199  BlendedStore(Compress(v, mask), store_mask, d, unaligned);
2200  return count;
2201 }
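// Usage sketch (assumptions: inside an open HWY_NAMESPACE, n is a multiple
// of Lanes(d); helper name illustrative): keep only positive values,
// appending them to `out` and returning the number kept.
HWY_INLINE size_t CopyPositiveExample(const int32_t* HWY_RESTRICT in, size_t n,
                                      int32_t* HWY_RESTRICT out) {
  const ScalableTag<int32_t> d;
  size_t written = 0;
  for (size_t i = 0; i < n; i += Lanes(d)) {
    const svint32_t v = LoadU(d, in + i);
    const svbool_t keep = Lt(Zero(d), v);  // v > 0
    written += CompressStore(v, keep, d, out + written);
  }
  return written;
}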
2202 
2203 // ================================================== BLOCKWISE
2204 
2205 // ------------------------------ CombineShiftRightBytes
2206 
2207 // Prevent accidentally using these for 128-bit vectors - should not be
2208 // necessary.
2209 #if HWY_TARGET != HWY_SVE2_128
2210 namespace detail {
2211 
2212 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes
2213 // offsets are implicitly relative to the start of their 128-bit block.
2214 template <class D, class V>
2215 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
2216  using T = MakeUnsigned<TFromD<D>>;
2217  return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
2218 }
2219 
2220 template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 1)>
2221 svbool_t FirstNPerBlock(D d) {
2222  const RebindToUnsigned<decltype(d)> du;
2223  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2224  const svuint8_t idx_mod =
2225  svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2226  3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2227  6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
2228  9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
2229  12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
2230  15 % kLanesPerBlock);
2231  return detail::LtN(BitCast(du, idx_mod), kLanes);
2232 }
2233 template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 2)>
2234 svbool_t FirstNPerBlock(D d) {
2235  const RebindToUnsigned<decltype(d)> du;
2236  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2237  const svuint16_t idx_mod =
2238  svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2239  3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2240  6 % kLanesPerBlock, 7 % kLanesPerBlock);
2241  return detail::LtN(BitCast(du, idx_mod), kLanes);
2242 }
2243 template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 4)>
2244 svbool_t FirstNPerBlock(D d) {
2245  const RebindToUnsigned<decltype(d)> du;
2246  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2247  const svuint32_t idx_mod =
2248  svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2249  3 % kLanesPerBlock);
2250  return detail::LtN(BitCast(du, idx_mod), kLanes);
2251 }
2252 template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 8)>
2253 svbool_t FirstNPerBlock(D d) {
2254  const RebindToUnsigned<decltype(d)> du;
2255  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2256  const svuint64_t idx_mod =
2257  svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
2258  return detail::LtN(BitCast(du, idx_mod), kLanes);
2259 }
2260 
2261 } // namespace detail
2262 #endif // HWY_TARGET != HWY_SVE2_128
2263 
2264 template <size_t kBytes, class D, class V = VFromD<D>>
2265 HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
2266  const Repartition<uint8_t, decltype(d)> d8;
2267  const auto hi8 = BitCast(d8, hi);
2268  const auto lo8 = BitCast(d8, lo);
2269 #if HWY_TARGET == HWY_SVE2_128
2270  return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
2271 #else
2272  const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
2273  const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
2274  const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
2275  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
2276 #endif
2277 }
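// Worked example (illustrative, u32 lanes, kBytes = 4, i.e. one lane): per
// 128-bit block, lo = {l0,l1,l2,l3} and hi = {h0,h1,h2,h3} yield
// {l1,l2,l3,h0}, matching x86 palignr semantics within each block.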
2278 
2279 // ------------------------------ Shuffle2301
2280 template <class V>
2281 HWY_API V Shuffle2301(const V v) {
2282  const DFromV<V> d;
2283  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2284  return Reverse2(d, v);
2285 }
2286 
2287 // ------------------------------ Shuffle2103
2288 template <class V>
2289 HWY_API V Shuffle2103(const V v) {
2290  const DFromV<V> d;
2291  const Repartition<uint8_t, decltype(d)> d8;
2292  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2293  const svuint8_t v8 = BitCast(d8, v);
2294  return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
2295 }
2296 
2297 // ------------------------------ Shuffle0321
2298 template <class V>
2299 HWY_API V Shuffle0321(const V v) {
2300  const DFromV<V> d;
2301  const Repartition<uint8_t, decltype(d)> d8;
2302  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2303  const svuint8_t v8 = BitCast(d8, v);
2304  return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
2305 }
2306 
2307 // ------------------------------ Shuffle1032
2308 template <class V>
2309 HWY_API V Shuffle1032(const V v) {
2310  const DFromV<V> d;
2311  const Repartition<uint8_t, decltype(d)> d8;
2312  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2313  const svuint8_t v8 = BitCast(d8, v);
2314  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
2315 }
2316 
2317 // ------------------------------ Shuffle01
2318 template <class V>
2319 HWY_API V Shuffle01(const V v) {
2320  const DFromV<V> d;
2321  const Repartition<uint8_t, decltype(d)> d8;
2322  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
2323  const svuint8_t v8 = BitCast(d8, v);
2324  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
2325 }
2326 
2327 // ------------------------------ Shuffle0123
2328 template <class V>
2329 HWY_API V Shuffle0123(const V v) {
2330  return Shuffle2301(Shuffle1032(v));
2331 }
2332 
2333 // ------------------------------ ReverseBlocks (Reverse, Shuffle01)
2334 template <class D, class V = VFromD<D>>
2335 HWY_API V ReverseBlocks(D d, V v) {
2336 #if HWY_TARGET == HWY_SVE_256
2337  if (detail::IsFull(d)) {
2338  return SwapAdjacentBlocks(v);
2339  } else if (detail::IsFull(Twice<D>())) {
2340  return v;
2341  }
2342 #elif HWY_TARGET == HWY_SVE2_128
2343  (void)d;
2344  return v;
2345 #endif
2346  const Repartition<uint64_t, D> du64;
2347  return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
2348 }
2349 
2350 // ------------------------------ TableLookupBytes
2351 
2352 template <class V, class VI>
2353 HWY_API VI TableLookupBytes(const V v, const VI idx) {
2354  const DFromV<VI> d;
2355  const Repartition<uint8_t, decltype(d)> du8;
2356 #if HWY_TARGET == HWY_SVE2_128
2357  return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx)));
2358 #else
2359  const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
2360  const auto idx8 = Add(BitCast(du8, idx), offsets128);
2361  return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
2362 #endif
2363 }
2364 
2365 template <class V, class VI>
2366 HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
2367  const DFromV<VI> d;
2368  // Mask size must match vector type, so cast everything to this type.
2369  const Repartition<int8_t, decltype(d)> di8;
2370 
2371  auto idx8 = BitCast(di8, idx);
2372  const auto msb = detail::LtN(idx8, 0);
2373 
2374  const auto lookup = TableLookupBytes(BitCast(di8, v), idx8);
2375  return BitCast(d, IfThenZeroElse(msb, lookup));
2376 }
2377 
2378 // ------------------------------ Broadcast
2379 
2380 #if HWY_TARGET == HWY_SVE2_128
2381 namespace detail {
2382 #define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \
2383  template <int kLane> \
2384  HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
2385  return sv##OP##_##CHAR##BITS(v, kLane); \
2386  }
2387 
2388 HWY_SVE_FOREACH(HWY_SVE_BROADCAST, Broadcast, dup_lane)
2389 #undef HWY_SVE_BROADCAST
2390 } // namespace detail
2391 #endif
2392 
2393 template <int kLane, class V>
2394 HWY_API V Broadcast(const V v) {
2395  const DFromV<V> d;
2396  const RebindToUnsigned<decltype(d)> du;
2397  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2398  static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
2399 #if HWY_TARGET == HWY_SVE2_128
2400  return detail::Broadcast<kLane>(v);
2401 #else
2402  auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
2403  if (kLane != 0) {
2404  idx = detail::AddN(idx, kLane);
2405  }
2406  return TableLookupLanes(v, idx);
2407 #endif
2408 }
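// Worked example (illustrative, u32, kLane = 1): each 128-bit block
// {a0,a1,a2,a3 | b0,b1,b2,b3} becomes {a1,a1,a1,a1 | b1,b1,b1,b1}: the
// broadcast is per 128-bit block, not across the whole vector.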
2409 
2410 // ------------------------------ ShiftLeftLanes
2411 
2412 template <size_t kLanes, class D, class V = VFromD<D>>
2413 HWY_API V ShiftLeftLanes(D d, const V v) {
2414  const auto zero = Zero(d);
2415  const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
2416 #if HWY_TARGET == HWY_SVE2_128
2417  return shifted;
2418 #else
2419  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
2420  return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
2421 #endif
2422 }
2423 
2424 template <size_t kLanes, class V>
2425 HWY_API V ShiftLeftLanes(const V v) {
2426  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
2427 }
2428 
2429 // ------------------------------ ShiftRightLanes
2430 template <size_t kLanes, class D, class V = VFromD<D>>
2431 HWY_API V ShiftRightLanes(D d, V v) {
2432  // For capped/fractional vectors, clear upper lanes so we shift in zeros.
2433  if (!detail::IsFull(d)) {
2434  v = IfThenElseZero(detail::MakeMask(d), v);
2435  }
2436 
2437 #if HWY_TARGET == HWY_SVE2_128
2438  return detail::Ext<kLanes>(Zero(d), v);
2439 #else
2440  const auto shifted = detail::Ext<kLanes>(v, v);
2441  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
2442  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
2443  const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
2444  return IfThenElseZero(mask, shifted);
2445 #endif
2446 }
2447 
2448 // ------------------------------ ShiftLeftBytes
2449 
2450 template <int kBytes, class D, class V = VFromD<D>>
2451 HWY_API V ShiftLeftBytes(const D d, const V v) {
2452  const Repartition<uint8_t, decltype(d)> d8;
2453  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
2454 }
2455 
2456 template <int kBytes, class V>
2457 HWY_API V ShiftLeftBytes(const V v) {
2458  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
2459 }
2460 
2461 // ------------------------------ ShiftRightBytes
2462 template <int kBytes, class D, class V = VFromD<D>>
2463 HWY_API V ShiftRightBytes(const D d, const V v) {
2464  const Repartition<uint8_t, decltype(d)> d8;
2465  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
2466 }
2467 
2468 // ------------------------------ ZipLower
2469 
2470 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2471 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2472  const RepartitionToNarrow<DW> dn;
2473  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2474  return BitCast(dw, InterleaveLower(dn, a, b));
2475 }
2476 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2477 HWY_API VFromD<DW> ZipLower(const V a, const V b) {
2478  return BitCast(DW(), InterleaveLower(D(), a, b));
2479 }
2480 
2481 // ------------------------------ ZipUpper
2482 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2483 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2484  const RepartitionToNarrow<DW> dn;
2485  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2486  return BitCast(dw, InterleaveUpper(dn, a, b));
2487 }
2488 
2489 // ================================================== Ops with dependencies
2490 
2491 // ------------------------------ PromoteTo bfloat16 (ZipLower)
2492 template <size_t N, int kPow2>
2493 HWY_API svfloat32_t PromoteTo(Simd<float, N, kPow2> df32,
2494  const svuint16_t v) {
2495  return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
2496 }
2497 
2498 // ------------------------------ ReorderDemote2To (OddEven)
2499 template <size_t N, int kPow2>
2500 HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
2501  svfloat32_t a, svfloat32_t b) {
2502  const RebindToUnsigned<decltype(dbf16)> du16;
2503  const Repartition<uint32_t, decltype(dbf16)> du32;
2504  const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
2505  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2506 }
2507 
2508 // ------------------------------ ZeroIfNegative (Lt, IfThenElse)
2509 template <class V>
2510 HWY_API V ZeroIfNegative(const V v) {
2511  return IfThenZeroElse(detail::LtN(v, 0), v);
2512 }
2513 
2514 // ------------------------------ BroadcastSignBit (ShiftRight)
2515 template <class V>
2516 HWY_API V BroadcastSignBit(const V v) {
2517  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
2518 }
2519 
2520 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
2521 template <class V>
2522 HWY_API V IfNegativeThenElse(V v, V yes, V no) {
2523  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
2524  const DFromV<V> d;
2525  const RebindToSigned<decltype(d)> di;
2526 
2527  const svbool_t m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2528  return IfThenElse(m, yes, no);
2529 }
2530 
2531 // ------------------------------ AverageRound (ShiftRight)
2532 
2533 #if HWY_TARGET == HWY_SVE2
2534 HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
2535 HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
2536 #else
2537 template <class V>
2538 V AverageRound(const V a, const V b) {
2539  return ShiftRight<1>(detail::AddN(Add(a, b), 1));
2540 }
2541 #endif // HWY_TARGET == HWY_SVE2
2542 
2543 // ------------------------------ LoadMaskBits (TestBit)
2544 
2545 // `p` points to at least 8 readable bytes, not all of which need be valid.
2546 template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
2547 HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
2548  const RebindToUnsigned<D> du;
2549  const svuint8_t iota = Iota(du, 0);
2550 
2551  // Load correct number of bytes (bits/8) with 7 zeros after each.
2552  const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
2553  // Replicate bytes 8x such that each byte contains the bit that governs it.
2554  const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
2555 
2556  const svuint8_t bit =
2557  svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
2558  return TestBit(rep8, bit);
2559 }
2560 
2561 template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
2562 HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2563  const uint8_t* HWY_RESTRICT bits) {
2564  const RebindToUnsigned<D> du;
2565  const Repartition<uint8_t, D> du8;
2566 
2567  // There may be up to 128 bits; avoid reading past the end.
2568  const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
2569 
2570  // Replicate bytes 16x such that each lane contains the bit that governs it.
2571  const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
2572 
2573  const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
2574  return TestBit(BitCast(du, rep16), bit);
2575 }
2576 
2577 template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
2578 HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2579  const uint8_t* HWY_RESTRICT bits) {
2580  const RebindToUnsigned<D> du;
2581  const Repartition<uint8_t, D> du8;
2582 
2583  // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable,
2584  // so we can skip computing the actual length (Lanes(du)+7)/8.
2585  const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
2586 
2587  // Replicate bytes 32x such that each lane contains the bit that governs it.
2588  const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
2589 
2590  // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
2591  const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
2592 
2593  return TestBit(BitCast(du, rep32), bit);
2594 }
2595 
2596 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
2597 HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2598  const uint8_t* HWY_RESTRICT bits) {
2599  const RebindToUnsigned<D> du;
2600 
2601  // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
2602  // The "at least 8 byte" guarantee in quick_reference ensures this is safe.
2603  uint32_t mask_bits;
2604  CopyBytes<4>(bits, &mask_bits);
2605  const auto vbits = Set(du, mask_bits);
2606 
2607  // 2 ^ {0,1, .., 31}, will not have more lanes than that.
2608  const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));
2609 
2610  return TestBit(vbits, bit);
2611 }
2612 
2613 // ------------------------------ StoreMaskBits
2614 
2615 namespace detail {
2616 
2617 // For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
2618 template <class T, HWY_IF_LANE_SIZE(T, 1)>
2619 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2620  return svdup_n_u8_z(m, 1);
2621 }
2622 template <class T, HWY_IF_LANE_SIZE(T, 2)>
2623 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2624  const ScalableTag<uint8_t> d8;
2625  const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
2626  return detail::ConcatEven(b16, b16); // only lower half needed
2627 }
2628 template <class T, HWY_IF_LANE_SIZE(T, 4)>
2629 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2630  return U8FromU32(svdup_n_u32_z(m, 1));
2631 }
2632 template <class T, HWY_IF_LANE_SIZE(T, 8)>
2633 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2634  const ScalableTag<uint32_t> d32;
2635  const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
2636  return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
2637 }
2638 
2639 // Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
2640 HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
2641  const ScalableTag<uint8_t> d8;
2642  const ScalableTag<uint16_t> d16;
2643  const ScalableTag<uint32_t> d32;
2644  const ScalableTag<uint64_t> d64;
2645  // TODO(janwas): could use SVE2 BDEP, but it's optional.
2646  x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
2647  x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
2648  x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
2649  return BitCast(d64, x);
2650 }
2651 
2652 } // namespace detail
2653 
2654 // `p` points to at least 8 writable bytes.
2655 // TODO(janwas): specialize for HWY_SVE_256
2656 template <class D>
2657 HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
2658  svuint64_t bits_in_u64 =
2659  detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m));
2660 
2661  const size_t num_bits = Lanes(d);
2662  const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below
2663 
2664  // Truncate each u64 to 8 bits and store to u8.
2665  svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
2666 
2667  // Non-full byte, need to clear the undefined upper bits. Can happen for
2668  // capped/fractional vectors or large T and small hardware vectors.
2669  if (num_bits < 8) {
2670  const int mask = (1ull << num_bits) - 1;
2671  bits[0] = static_cast<uint8_t>(bits[0] & mask);
2672  }
2673  // Else: we wrote full bytes because num_bits is a power of two >= 8.
2674 
2675  return num_bytes;
2676 }
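// Round-trip sketch (assumptions: inside an open HWY_NAMESPACE; helper name
// illustrative). 256 bytes covers the 2048-bit SVE maximum; StoreMaskBits
// writes only the first (Lanes(d)+7)/8 of them.
HWY_INLINE svbool_t MaskRoundTripExample(svbool_t m) {
  const ScalableTag<uint32_t> d;
  uint8_t bits[256] = {0};
  (void)StoreMaskBits(d, m, bits);
  return LoadMaskBits(d, bits);  // equal to m on the first Lanes(d) lanes
}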
2677 
2678 // ------------------------------ CompressBits (LoadMaskBits)
2679 template <class V>
2680 HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2681  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
2682 }
2683 
2684 // ------------------------------ CompressBitsStore (LoadMaskBits)
2685 template <class D>
2686 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2687  D d, TFromD<D>* HWY_RESTRICT unaligned) {
2688  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2689 }
2690 
2691 // ------------------------------ MulEven (InterleaveEven)
2692 
2693 #if HWY_TARGET == HWY_SVE2
2694 namespace detail {
2695 #define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
2696  HWY_API HWY_SVE_V(BASE, BITS) \
2697  NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
2698  return sv##OP##_##CHAR##BITS(a, b); \
2699  }
2700 
2701 HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
2702 #undef HWY_SVE_MUL_EVEN
2703 } // namespace detail
2704 #endif
2705 
2706 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2707 HWY_API VFromD<DW> MulEven(const V a, const V b) {
2708 #if HWY_TARGET == HWY_SVE2
2709  return BitCast(DW(), detail::MulEven(a, b));
2710 #else
2711  const auto lo = Mul(a, b);
2712  const auto hi = detail::MulHigh(a, b);
2713  return BitCast(DW(), detail::InterleaveEven(lo, hi));
2714 #endif
2715 }
2716 
2717 HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
2718  const auto lo = Mul(a, b);
2719  const auto hi = detail::MulHigh(a, b);
2720  return detail::InterleaveEven(lo, hi);
2721 }
2722 
2723 HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
2724  const auto lo = Mul(a, b);
2725  const auto hi = detail::MulHigh(a, b);
2726  return detail::InterleaveOdd(lo, hi);
2727 }
2728 
2729 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2730 template <size_t N, int kPow2>
2731 HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
2732  svuint16_t a, svuint16_t b,
2733  const svfloat32_t sum0,
2734  svfloat32_t& sum1) {
2735  // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16.
2736  const Repartition<uint16_t, decltype(df32)> du16;
2737  const RebindToUnsigned<decltype(df32)> du32;
2738  const svuint16_t zero = Zero(du16);
2739  const svuint32_t a0 = ZipLower(du32, zero, BitCast(du16, a));
2740  const svuint32_t a1 = ZipUpper(du32, zero, BitCast(du16, a));
2741  const svuint32_t b0 = ZipLower(du32, zero, BitCast(du16, b));
2742  const svuint32_t b1 = ZipUpper(du32, zero, BitCast(du16, b));
2743  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2744  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2745 }
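// Note: sum0 receives the products of the zip-lower halves and sum1 those of
// the zip-upper halves, i.e. lanes are accumulated in a reordered layout;
// callers must combine both (e.g. Add(sum0, sum1)) before reducing.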
2746 
2747 // ------------------------------ AESRound / CLMul
2748 
2749 #if defined(__ARM_FEATURE_SVE2_AES)
2750 
2751 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
2752 #ifdef HWY_NATIVE_AES
2753 #undef HWY_NATIVE_AES
2754 #else
2755 #define HWY_NATIVE_AES
2756 #endif
2757 
2758 HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) {
2759  // It is not clear whether E and MC fuse like they did on NEON.
2760  const svuint8_t zero = svdup_n_u8(0);
2761  return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
2762 }
2763 
2764 HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) {
2765  return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
2766 }
2767 
2768 HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) {
2769  return svpmullb_pair(a, b);
2770 }
2771 
2772 HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
2773  return svpmullt_pair(a, b);
2774 }
2775 
2776 #endif // __ARM_FEATURE_SVE2_AES
2777 
2778 // ------------------------------ Lt128
2779 
2780 namespace detail {
2781 #define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \
2782  template <size_t N, int kPow2> \
2783  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \
2784  return sv##OP##_b##BITS(m, m); \
2785  }
2786 
2787 HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool
2788 HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
2789 #undef HWY_SVE_DUP
2790 
2791 #if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2792 template <class D>
2793 HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
2794  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2795  const svbool_t eqHx = Eq(a, b); // only odd lanes used
2796  // Convert to vector: more pipelines can TRN* for vectors than predicates.
2797  const svuint64_t ltHL = VecFromMask(d, Lt(a, b));
2798  // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
2799  // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated.
2800  const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL);
2801  // Duplicate upper lane into lower.
2802  return DupOdd(ltHx);
2803 }
2804 #endif
2805 } // namespace detail
2806 
2807 template <class D>
2808 HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
2809 #if HWY_TARGET == HWY_SVE_256
2810  return MaskFromVec(detail::Lt128Vec(d, a, b));
2811 #else
2812  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2813  const svbool_t eqHx = Eq(a, b); // only odd lanes used
2814  const svbool_t ltHL = Lt(a, b);
2815  // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
2816  const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
2817  // Duplicate upper lane into lower.
2818  return detail::DupOddB(d, ltHx);
2819 #endif // HWY_TARGET != HWY_SVE_256
2820 }
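// Worked example (illustrative): per 128-bit pair (lane 0 = low u64,
// lane 1 = high u64), a = {5, 7} and b = {9, 7}. The high halves are equal,
// so the result takes ltL = (5 < 9) = true and broadcasts it to both lanes:
// a < b as a 128-bit number.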
2821 
2822 // ------------------------------ Lt128Upper
2823 
2824 template <class D>
2825 HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
2826  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2827  const svbool_t ltHL = Lt(a, b);
2828  return detail::DupOddB(d, ltHL);
2829 }
2830 
2831 // ------------------------------ Min128, Max128 (Lt128)
2832 
2833 template <class D>
2834 HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
2835 #if HWY_TARGET == HWY_SVE_256
2836  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
2837 #else
2838  return IfThenElse(Lt128(d, a, b), a, b);
2839 #endif
2840 }
2841 
2842 template <class D>
2843 HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
2844 #if HWY_TARGET == HWY_SVE_256
2845  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
2846 #else
2847  return IfThenElse(Lt128(d, b, a), a, b);
2848 #endif
2849 }
2850 
2851 template <class D>
2852 HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) {
2853  return IfThenElse(Lt128Upper(d, a, b), a, b);
2854 }
2855 
2856 template <class D>
2857 HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) {
2858  return IfThenElse(Lt128Upper(d, b, a), a, b);
2859 }
2860 
2861 // ================================================== END MACROS
2862 namespace detail { // for code folding
2863 #undef HWY_IF_FLOAT_V
2864 #undef HWY_IF_LANE_SIZE_V
2865 #undef HWY_IF_SIGNED_V
2866 #undef HWY_IF_UNSIGNED_V
2867 #undef HWY_SVE_ALL_PTRUE
2868 #undef HWY_SVE_D
2869 #undef HWY_SVE_FOREACH
2870 #undef HWY_SVE_FOREACH_F
2871 #undef HWY_SVE_FOREACH_F16
2872 #undef HWY_SVE_FOREACH_F32
2873 #undef HWY_SVE_FOREACH_F64
2874 #undef HWY_SVE_FOREACH_I
2875 #undef HWY_SVE_FOREACH_I08
2876 #undef HWY_SVE_FOREACH_I16
2877 #undef HWY_SVE_FOREACH_I32
2878 #undef HWY_SVE_FOREACH_I64
2879 #undef HWY_SVE_FOREACH_IF
2880 #undef HWY_SVE_FOREACH_U
2881 #undef HWY_SVE_FOREACH_U08
2882 #undef HWY_SVE_FOREACH_U16
2883 #undef HWY_SVE_FOREACH_U32
2884 #undef HWY_SVE_FOREACH_U64
2885 #undef HWY_SVE_FOREACH_UI
2886 #undef HWY_SVE_FOREACH_UI08
2887 #undef HWY_SVE_FOREACH_UI16
2888 #undef HWY_SVE_FOREACH_UI32
2889 #undef HWY_SVE_FOREACH_UI64
2890 #undef HWY_SVE_FOREACH_UIF3264
2891 #undef HWY_SVE_PTRUE
2892 #undef HWY_SVE_RETV_ARGPV
2893 #undef HWY_SVE_RETV_ARGPVN
2894 #undef HWY_SVE_RETV_ARGPVV
2895 #undef HWY_SVE_RETV_ARGV
2896 #undef HWY_SVE_RETV_ARGVN
2897 #undef HWY_SVE_RETV_ARGVV
2898 #undef HWY_SVE_T
2899 #undef HWY_SVE_UNDEFINED
2900 #undef HWY_SVE_V
2901 
2902 } // namespace detail
2903 // NOLINTNEXTLINE(google-readability-namespace-comments)
2904 } // namespace HWY_NAMESPACE
2905 } // namespace hwy
2906 HWY_AFTER_NAMESPACE();
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:103
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:59
HWY_AFTER_NAMESPACE()
#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1073
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:71
#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2781
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:730
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1383
#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1025
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1368
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:55
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1471
#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1107
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:353
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:672
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:342
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:126
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:259
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1965
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:946
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:527
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1040
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:300
#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:938
#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1853
#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2382
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:111
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:155
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:63
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:328
#define HWY_SVE_PTRUE(BITS)
Definition: arm_sve-inl.h:206
#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1089
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1663
#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1155
#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1140
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:742
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1049
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:89
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:161
#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:955
#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1923
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:178
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:601
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:95
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1174
#define HWY_SVE_IS_POW2
Definition: arm_sve-inl.h:30
#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:963
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2053
#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:930
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:802
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:483
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:83
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:138
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1445
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:774
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:437
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:122
HWY_BEFORE_NAMESPACE()
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:56
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:77
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:798
#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:866
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:270
#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2695
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1126
#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:107
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1015
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:569
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1762
#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:99
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:173
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:151
#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:447
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1798
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_TARGET
Definition: detect_targets.h:341
#define HWY_SVE_256
Definition: detect_targets.h:78
HWY_INLINE svuint8_t BoolFromMask(svbool_t m)
Definition: arm_sve-inl.h:2619
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag< 1 >)
Definition: arm_sve-inl.h:189
DupOddB
Definition: arm_sve-inl.h:2788
HWY_INLINE MFromD< D > FirstNPerBlock(D)
Definition: rvv-inl.h:1823
HWY_INLINE svuint64_t BitsFromBool(svuint8_t x)
Definition: arm_sve-inl.h:2640
svbool_t MaskLowerHalf(D d)
Definition: arm_sve-inl.h:1557
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition: rvv-inl.h:1817
svbool_t MakeMask(D d)
Definition: arm_sve-inl.h:290
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:1937
VI SaturateI(VI v)
Definition: arm_sve-inl.h:1270
svbool_t MaskUpperHalf(D d)
Definition: arm_sve-inl.h:1651
VU SaturateU(VU v)
Definition: arm_sve-inl.h:1264
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) HWY_API svbool_t PFalse()
Definition: arm_sve-inl.h:280
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N, kPow2 > df, svfloat16_t v)
Definition: arm_sve-inl.h:1250
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1574
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) HWY_SVE_FOREACH_U(HWY_SVE_DUP
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:111
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition: ops/shared-inl.h:103
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
trn2 HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition: arm_sve-inl.h:2793
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:855
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not) namespace detail
Definition: arm_sve-inl.h:391
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
constexpr HWY_API size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4189
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
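GatherIndex loads lanes from base[index[i]], where the indices are a vector of the same-sized signed integer type. A sketch gathering floats through int32 indices, with GatherFloats as an illustrative name; all indices are assumed in bounds.

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Gathers one vector of floats: result[i] = base[idx[i]].
// `idx` points to Lanes(d) int32 indices.
hn::VFromD<hn::ScalableTag<float>> GatherFloats(
    const float* HWY_RESTRICT base, const int32_t* HWY_RESTRICT idx) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;  // int32 tag with the same lanes
  return hn::GatherIndex(d, base, hn::LoadU(di, idx));
}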
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
HWY_API Vec128< T, N > ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4164
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
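ReorderWidenMulAccumulate widens bfloat16 pairs to float and accumulates into two f32 sums whose lane order is unspecified, which is harmless when they are only reduced afterwards. A dot-product sketch, assuming count is a multiple of the bf16 lane count; Bf16Dot is illustrative.

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Dot product of two bfloat16 arrays, accumulated in float.
float Bf16Dot(const hwy::bfloat16_t* HWY_RESTRICT a,
              const hwy::bfloat16_t* HWY_RESTRICT b, size_t count) {
  const hn::ScalableTag<float> df32;
  const hn::Repartition<hwy::bfloat16_t, decltype(df32)> dbf16;
  const size_t N = hn::Lanes(dbf16);  // twice the f32 lane count
  auto sum0 = hn::Zero(df32);
  auto sum1 = hn::Zero(df32);  // second accumulator, updated by reference
  for (size_t i = 0; i < count; i += N) {
    sum0 = hn::ReorderWidenMulAccumulate(df32, hn::LoadU(dbf16, a + i),
                                         hn::LoadU(dbf16, b + i), sum0, sum1);
  }
  return hn::GetLane(hn::SumOfLanes(df32, hn::Add(sum0, sum1)));
}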
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b)
Definition: arm_sve-inl.h:2717
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
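The mask-returning comparisons (Eq/Ne/Lt/Gt/Le/Ge) pair with IfThenElse for branch-free, per-lane selection. A one-vector ReLU sketch (illustrative; ZeroIfNegative is a shorthand for this exact pattern):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Branch-free ReLU over one full vector of floats.
void ReluOneVector(const float* HWY_RESTRICT in, float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const auto v = hn::LoadU(d, in);
  // Lt returns a mask; IfThenElse selects per lane without branching.
  hn::StoreU(hn::IfThenElse(hn::Lt(v, hn::Zero(d)), hn::Zero(d), v), d, out);
}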
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API VFromD< D > ConcatEven(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1406
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:211
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
HWY_API Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:161
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_sve-inl.h:2483
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
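The LoadInterleaved2/3/4 and StoreInterleaved2/3/4 families convert between interleaved (array-of-structs) and planar layouts. A sketch that deinterleaves packed RGB into three planes, assuming num_pixels is a multiple of Lanes(d); RgbToPlanar is illustrative.

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Splits packed RGBRGB... bytes into separate R, G and B planes.
void RgbToPlanar(const uint8_t* HWY_RESTRICT rgb, size_t num_pixels,
                 uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
                 uint8_t* HWY_RESTRICT b) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    hn::VFromD<decltype(d)> vr, vg, vb;
    hn::LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    hn::StoreU(vr, d, r + i);
    hn::StoreU(vg, d, g + i);
    hn::StoreU(vb, d, b + i);
  }
}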
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
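SetTableIndices converts integer indices (each in [0, Lanes(d))) into Indices for TableLookupLanes, which permutes lanes at runtime. A sketch that reverses lanes via an explicit table, purely illustrative; Reverse(d, v) is the direct way to get this particular permutation.

#include <stddef.h>
#include <stdint.h>
#include <vector>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Reverses the lanes of an Iota vector via a runtime index table.
void ReverseViaTable() {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  std::vector<int32_t> idx(N);
  for (size_t i = 0; i < N; ++i) idx[i] = static_cast<int32_t>(N - 1 - i);
  const auto v = hn::Iota(d, 0);  // 0, 1, 2, ...
  const auto rev = hn::TableLookupLanes(v, hn::SetTableIndices(d, idx.data()));
  (void)rev;  // rev now holds N-1, ..., 1, 0
}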
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
HWY_API V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API VFromD< DW > ZipLower(const V a, const V b)
Definition: arm_sve-inl.h:2477
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
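Compress and CompressStore pack the lanes whose mask is true into the low lanes, which implements stream filtering. A sketch that keeps only positive values, assuming count is a multiple of Lanes(d) and that out has at least a full vector of slack, since CompressStore may write all Lanes(d) elements (CompressBlendedStore avoids this); FilterPositive is illustrative.

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Copies the strictly positive elements of `in` to `out`, packed
// contiguously; returns how many were kept.
size_t FilterPositive(const float* HWY_RESTRICT in, size_t count,
                      float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0;
  for (size_t i = 0; i < count; i += N) {
    const auto v = hn::LoadU(d, in + i);
    written += hn::CompressStore(v, hn::Gt(v, hn::Zero(d)), d, out + written);
  }
  return written;
}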
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API VFromD< D > ConcatOdd(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1394
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
constexpr HWY_API bool IsSame()
Definition: base.h:322
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
constexpr HWY_API bool IsSigned()
Definition: base.h:534
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82