#undef HWY_SVE_IS_POW2
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#define HWY_SVE_IS_POW2 1
#else
#define HWY_SVE_IS_POW2 0
#endif
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)           \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
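// Illustrative note (not part of the original header): HWY_SVE_T/V/D paste
// together ACLE scalar/vector type names and the Highway descriptor type.
// For example, after expansion:
//   HWY_SVE_T(uint, 8)              -> uint8_t
//   HWY_SVE_V(uint, 8)              -> svuint8_t
//   HWY_SVE_D(float, 32, N, kPow2)  -> Simd<float32_t, N, kPow2>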
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
  struct DFromV_t<HWY_SVE_V(BASE, BITS)> {               \
    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>;     \
#undef HWY_SPECIALIZE

#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);   \
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)     \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v);                            \
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {       \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                              \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS(a, b);                      \
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {       \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                              \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS(a, b);                      \
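// Illustrative example (not part of the original header): ops are generated
// by passing one of the HWY_SVE_RETV_* function-body macros to a FOREACH
// macro. Assuming the usual invocation
// HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add), the uint32_t instantiation
// expands to roughly:
//
//   HWY_API svuint32_t Add(svuint32_t a, svuint32_t b) {
//     return svadd_u32_x(HWY_SVE_PTRUE(32), a, b);
//   }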
  return svcntb_pat(SV_ALL);
  return svcnth_pat(SV_ALL);
  return svcntw_pat(SV_ALL);
  return svcntd_pat(SV_ALL);

#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)

#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  return svcntb_pat(SV_POW2);
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return svcnth_pat(SV_POW2);
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return svcntw_pat(SV_POW2);
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return svcntd_pat(SV_POW2);

#if HWY_TARGET == HWY_SVE_256
template <typename T, size_t N, int kPow2>
#elif HWY_TARGET == HWY_SVE2_128
template <typename T, size_t N, int kPow2>
template <typename T, size_t N, int kPow2>
  const size_t actual = detail::HardwareLanes<T>();
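// Illustrative example (not part of the original header): Lanes(d) is the
// only portable way to obtain the runtime vector length, so loops step by it.
// ExampleAddOne is a hypothetical caller, not a Highway API:
//
//   template <typename T>
//   void ExampleAddOne(T* HWY_RESTRICT p, size_t n) {
//     const ScalableTag<T> d;
//     for (size_t i = 0; i + Lanes(d) <= n; i += Lanes(d)) {
//       Store(Add(Load(d, p + i), Set(d, T{1})), d, p + i);
//     }
//   }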
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                               \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) {     \
    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit));  \
#undef HWY_SVE_FIRSTN
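// Illustrative example (not part of the original header): FirstN is the
// usual way to build a mask for a partial (tail) iteration, e.g.:
//
//   const ScalableTag<float> d;
//   const size_t remaining = ...;            // fewer than Lanes(d) left
//   const svbool_t m = FirstN(d, remaining);
//   // MaskedLoad(m, d, p) / BlendedStore(v, m, d, p) then touch only the
//   // first `remaining` lanes.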
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)     \
  template <size_t N, int kPow2>                                 \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) {      \
    return HWY_SVE_PTRUE(BITS);                                  \
  template <size_t N, int kPow2>                                 \
  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
    return HWY_SVE_ALL_PTRUE(BITS);                              \
#undef HWY_SVE_WRAP_PTRUE

HWY_API svbool_t PFalse() { return svpfalse_b(); }

#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)                  \
  template <size_t N, int kPow2>                                       \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
                                     HWY_SVE_T(BASE, BITS) arg) {      \
    return sv##OP##_##CHAR##BITS(arg);                                 \

template <size_t N, int kPow2>
using VFromD = decltype(Set(D(), TFromD<D>()));
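// Illustrative example (not part of the original header): Set broadcasts a
// scalar and VFromD names the vector type produced for a given descriptor:
//
//   const ScalableTag<int32_t> d;
//   const VFromD<decltype(d)> v = Set(d, 7);  // svint32_t, every lane == 7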
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                            \
  HWY_API HWY_SVE_V(BASE, BITS)                             \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) {                  \
    return sv##OP##_##CHAR##BITS();                         \

#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)               \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
  template <size_t N, int kPow2>                                         \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte(                         \
      HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) {       \

#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)             \
  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) {    \
    return sv##OP##_u8_##CHAR##BITS(v);                            \
  template <size_t N, int kPow2>                                   \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                 \
  BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) , svuint8_t v) { \
    return sv##OP##_##CHAR##BITS##_u8(v);                          \
#undef HWY_SVE_CAST_NOP

template <size_t N, int kPow2>
template <class D, class FromV>
template <class V, HWY_IF_FLOAT_V(V)>
template <class V, HWY_IF_FLOAT_V(V)>
template <class V, HWY_IF_FLOAT_V(V)>

#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
  NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {         \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
#undef HWY_SVE_RETV_ARGPVN_SWAP

#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {         \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
#undef HWY_SVE_RETV_ARGPVV_SWAP

template <class V, HWY_IF_FLOAT_V(V)>
  return Or(o1, Or(o2, o3));
  return Or(o, And(a1, a2));

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#define HWY_NATIVE_POPCNT

#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {         \
    return BitCast(DFromV<decltype(v)>(),                               \
                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v));  \
#undef HWY_SVE_POPCNT

  return Or(abs, And(msb, sign));

#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)      \
  HWY_API HWY_SVE_V(BASE, BITS)                                         \
  NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_z(pg, a, b);                         \
#undef HWY_SVE_RETV_ARGPVN_MASK

  const svbool_t pg = detail::PTrue(du64);
  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));

#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <int kBits>                                               \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {      \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
  HWY_API HWY_SVE_V(BASE, BITS)                                      \
  NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) {  \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits);  \
#undef HWY_SVE_SHIFT_N
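// Illustrative example (not part of the original header): the compile-time
// shift ops generated above are used as, e.g.:
//
//   const ScalableTag<uint32_t> d;
//   const auto v = Set(d, 0x80u);
//   const auto up = ShiftLeft<3>(v);     // 0x400 in every lane
//   const auto down = ShiftRight<4>(v);  // 0x08 in every lane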
template <int kBits, class V>
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));

#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)       \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
  NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
    const RebindToUnsigned<DFromV<decltype(v)>> du;           \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v,  \
                                     BitCast(du, bits));      \

#if HWY_TARGET == HWY_SVE2
  return svqrdmulh_s16(a, b);
  const svint16_t hi = MulHigh(a, b);
  const svuint16_t lo_top2 = ShiftRight<14>(lo);
  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));

#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API HWY_SVE_V(BASE, BITS)                                         \
  NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x,              \
       HWY_SVE_V(BASE, BITS) add) {                                     \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \

template <class D, typename MFrom>
  return svand_b_z(b, b, a);
  return svbic_b_z(b, b, a);
  return svsel_b(a, a, b);
  return svsel_b(a, svnand_b_z(a, a, b), b);

#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)           \
  template <size_t N, int kPow2>                                       \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
    return sv##OP##_b##BITS(detail::MakeMask(d), m);                   \
#undef HWY_SVE_COUNT_TRUE

#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)     \
  template <size_t N, int kPow2>                                      \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , svbool_t m) { \
    return sv##OP##_b##BITS(svptrue_b##BITS(), m);                    \
#undef HWY_SVE_COUNT_TRUE_FULL

                 : static_cast<intptr_t>(

#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)            \
  HWY_API HWY_SVE_V(BASE, BITS)                                           \
  NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(m, yes, no);                             \
#undef HWY_SVE_IF_THEN_ELSE

#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
#undef HWY_SVE_COMPARE
#undef HWY_SVE_COMPARE_N
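// Illustrative example (not part of the original header): the comparison ops
// generated above return svbool_t masks that combine with IfThenElse:
//
//   const ScalableTag<float> d;
//   const auto a = Set(d, 1.0f);
//   const auto b = Set(d, 2.0f);
//   const svbool_t m = Lt(a, b);               // all-true in this example
//   const auto smaller = IfThenElse(m, a, b);  // per-lane select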
  return detail::NeN(And(a, bit), 0);
  return detail::NeN(v, static_cast<TFromV<V>>(0));

#if HWY_TARGET == HWY_SVE2
#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)      \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
  NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
       HWY_SVE_V(BASE, BITS) no) {                            \
    return sv##OP##_##CHAR##BITS(yes, no, mask);              \
#undef HWY_SVE_IF_VEC

template <class V, HWY_IF_FLOAT_V(V)>
  return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
  const VFromD<decltype(di)> exp =
  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));

#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)    \
  template <size_t N, int kPow2>                          \
  HWY_API HWY_SVE_V(BASE, BITS)                           \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                 \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {    \
    return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \

#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                              \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
  NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) ,          \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {        \
    return sv##OP##_##CHAR##BITS(m, p);                       \

#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                              \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ,                      \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {        \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p);        \

#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)       \
  template <size_t N, int kPow2>                              \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                  \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,        \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v);         \

#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                                \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m,        \
                    HWY_SVE_D(BASE, BITS, N, kPow2) ,           \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {   \
    sv##OP##_##CHAR##BITS(m, p, v);                             \

#undef HWY_SVE_MASKED_LOAD
#undef HWY_SVE_LOAD_DUP128
#undef HWY_SVE_BLENDED_STORE
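// Illustrative example (not part of the original header): a full-vector copy
// using the Load/Store ops generated above.
//
//   const ScalableTag<uint8_t> d;
//   const size_t N = Lanes(d);
//   size_t i = 0;
//   for (; i + N <= count; i += N) {
//     Store(Load(d, from + i), d, to + i);
//   }
//   // Tail: MaskedLoad / BlendedStore with FirstN(d, count - i).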
template <size_t N, int kPow2>
template <size_t N, int kPow2>
template <class V, class D>

#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                             \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                                 \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                       \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,               \
                    HWY_SVE_V(int, BITS) offset) {                           \
    sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \

#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                               \
  HWY_API void NAME(                                                           \
      HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d,              \
      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
    sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
#undef HWY_SVE_SCATTER_OFFSET
#undef HWY_SVE_SCATTER_INDEX

#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                            \
  HWY_API HWY_SVE_V(BASE, BITS)                                             \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                                   \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                     \
       HWY_SVE_V(int, BITS) offset) {                                       \
    return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                           \
  HWY_API HWY_SVE_V(BASE, BITS)                                            \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                                  \
       const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                    \
       HWY_SVE_V(int, BITS) index) {                                       \
    return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
#undef HWY_SVE_GATHER_OFFSET
#undef HWY_SVE_GATHER_INDEX

#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED

#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
    const sv##BASE##BITS##x2_t tuple =                                        \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
    v0 = svget2(tuple, 0);                                                    \
    v1 = svget2(tuple, 1);                                                    \
#undef HWY_SVE_LOAD2

#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)                      \
  template <size_t N, int kPow2>                                             \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                       \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,    \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1,  \
                    HWY_SVE_V(BASE, BITS) & v2) {                            \
    const sv##BASE##BITS##x3_t tuple =                                       \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);               \
    v0 = svget3(tuple, 0);                                                   \
    v1 = svget3(tuple, 1);                                                   \
    v2 = svget3(tuple, 2);                                                   \
#undef HWY_SVE_LOAD3

#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1,   \
                    HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
    const sv##BASE##BITS##x4_t tuple =                                        \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
    v0 = svget4(tuple, 0);                                                    \
    v1 = svget4(tuple, 1);                                                    \
    v2 = svget4(tuple, 2);                                                    \
    v3 = svget4(tuple, 3);                                                    \
#undef HWY_SVE_LOAD4

#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  template <size_t N, int kPow2>                                         \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,  \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                   \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {    \
    const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple);        \
#undef HWY_SVE_STORE2

#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)                      \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,       \
                    HWY_SVE_V(BASE, BITS) v2,                                 \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {         \
    const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple);            \
#undef HWY_SVE_STORE3

#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                        \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                  \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {   \
    const sv##BASE##BITS##x4_t quad =                                   \
        svcreate4##_##CHAR##BITS(v0, v1, v2, v3);                       \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad);        \
#undef HWY_SVE_STORE4
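// Illustrative example (not part of the original header): the
// LoadInterleaved / StoreInterleaved ops generated above (LD2..LD4 / ST2..ST4)
// de-interleave and re-interleave packed data such as RGB pixels:
//
//   const ScalableTag<uint8_t> d;
//   svuint8_t r, g, b;
//   LoadInterleaved3(d, rgb_in, r, g, b);    // rgb_in: r0 g0 b0 r1 g1 b1 ...
//   StoreInterleaved3(r, g, b, d, rgb_out);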
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)        \
  template <size_t N, int kPow2>                                    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(                               \
      HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, HALF) v) {  \
    return sv##OP##_##CHAR##BITS(v);                                \

template <size_t N, int kPow2>
template <size_t N, int kPow2>
template <size_t N, int kPow2>
template <size_t N, int kPow2>
template <size_t N, int kPow2>

template <size_t N, int kPow2>
    const svfloat16_t v) {
template <size_t N, int kPow2>
    const svfloat32_t v) {
template <size_t N, int kPow2>
    const svint32_t v) {
#undef HWY_SVE_PROMOTE_TO

template <size_t N, int kPow2>

template <typename TN, class VU>
  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
template <typename TN, class VI>
  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());

template <size_t N, int kPow2>
  using TN = TFromD<decltype(dn)>;
  const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u8(vn, vn);

template <size_t N, int kPow2>
  using TN = TFromD<decltype(dn)>;
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u16(vn, vn);

template <size_t N, int kPow2>
  using TN = TFromD<decltype(dn)>;
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
  const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
  return svuzp1_u8(x2, x2);

  const svuint16_t cast16 = BitCast(du16, v);
  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
  const svuint8_t cast8 = BitCast(du8, x2);
  return svuzp1_u8(cast8, cast8);

template <size_t N, int kPow2>
#if HWY_TARGET == HWY_SVE2
  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
  using TN = TFromD<decltype(dn)>;
  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
  return svuzp1_s8(vn, vn);

template <size_t N, int kPow2>
#if HWY_TARGET == HWY_SVE2
  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
  using TN = TFromD<decltype(dn)>;
  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
  return svuzp1_s16(vn, vn);

template <size_t N, int kPow2>
#if HWY_TARGET == HWY_SVE2
  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
  using TN = TFromD<decltype(dn)>;
  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
  return BitCast(dn, svuzp1_s8(v2, v2));
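// Illustrative example (not part of the original header): widening and
// narrowing conversions via the ops defined above.
//
//   const ScalableTag<int32_t> d32;
//   const Rebind<int16_t, decltype(d32)> d16;  // same lane count, half width
//   const auto wide = PromoteTo(d32, Load(d16, src));
//   const auto narrow = DemoteTo(d16, wide);   // saturates to int16_t range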
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                    \
  NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) {          \
    return sv##OP##_##CHAR##BITS(lo, hi);                             \
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
#undef HWY_SVE_CONCAT_EVERY_SECOND

#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(                                      \
      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
    return sv##OP##_##CHAR##BITS(mask, lo, hi);                            \
#undef HWY_SVE_SPLICE

  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));

template <size_t N, int kPow2>
  const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
template <size_t N, int kPow2>
template <size_t N, int kPow2>
  const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
template <size_t N, int kPow2>
  const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);

#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)               \
  template <size_t N, int kPow2>                                        \
  HWY_API HWY_SVE_V(BASE, BITS)                                         \
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(int, BITS) v) {      \
    return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  template <size_t N, int kPow2>                                        \
  HWY_API HWY_SVE_V(int, BITS)                                          \
  NAME(HWY_SVE_D(int, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) {      \
    return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
#undef HWY_SVE_CONVERT
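// Illustrative example (not part of the original header): ConvertTo performs
// same-width int <-> float conversion; float to int truncates toward zero.
//
//   const ScalableTag<float> df;
//   const RebindToSigned<decltype(df)> di;
//   const auto i = ConvertTo(di, Set(df, 3.7f));  // 3 in every lane
//   const auto f = ConvertTo(df, i);              // 3.0f in every lane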
template <class VF, class DI = RebindToSigned<DFromV<VF>>>

#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)                  \
  template <size_t N, int kPow2>                                        \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ,  \
                                     HWY_SVE_T(BASE, BITS) first) {     \
    return sv##OP##_##CHAR##BITS(first, 1);                             \

template <class D, HWY_IF_FLOAT_D(D)>

template <class D, class V>
#if HWY_TARGET == HWY_SVE2_128
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);
#if HWY_TARGET == HWY_SVE2_128

template <class D, class V = VFromD<D>,
          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
#if HWY_TARGET == HWY_SVE2_128
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);

template <class D, class V = VFromD<D>,
          hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
    const Half<decltype(d)> d2;

#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
  return svptrue_pat_b8(SV_VL16);
  return svptrue_pat_b8(SV_VL8);
  return svptrue_pat_b8(SV_VL4);
  return svptrue_pat_b8(SV_VL2);
  return svptrue_pat_b8(SV_VL1);
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
  return svptrue_pat_b16(SV_VL8);
  return svptrue_pat_b16(SV_VL4);
  return svptrue_pat_b16(SV_VL2);
  return svptrue_pat_b16(SV_VL1);
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
  return svptrue_pat_b32(SV_VL4);
  return svptrue_pat_b32(SV_VL2);
  return svptrue_pat_b32(SV_VL1);
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  return svptrue_pat_b64(SV_VL2);
  return svptrue_pat_b64(SV_VL1);

#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
  return svptrue_pat_b8(SV_VL8);
  return svptrue_pat_b8(SV_VL4);
  return svptrue_pat_b8(SV_VL2);
  return svptrue_pat_b8(SV_VL1);
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
  return svptrue_pat_b16(SV_VL4);
  return svptrue_pat_b16(SV_VL2);
  return svptrue_pat_b16(SV_VL1);
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
  return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  return svptrue_pat_b64(SV_VL1);
#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128

#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)        \
  template <size_t kIndex>                                   \
  HWY_API HWY_SVE_V(BASE, BITS)                              \
  NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
    return sv##OP##_##CHAR##BITS(lo, hi, kIndex);            \

template <class D, class V>
template <class D, class V>
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
  return detail::ConcatEvenBlocks(hi, lo);
#if HWY_TARGET == HWY_SVE2_128
  const auto lo64 = BitCast(du64, lo);

template <class D, class V>
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
  return detail::Ext<Lanes(d) / 2>(hi, lo);

template <class D, class V>
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
  return detail::ConcatOddBlocks(hi, lo);
#if HWY_TARGET == HWY_SVE2_128
  const auto lo64 = BitCast(du64, lo);
  const V lo_upper = detail::Splice(lo, lo, mask_upper);

template <class D, class V2>
template <class D, class V>
template <class D2, class V>
template <class D2, class V>
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
  return detail::Ext<Lanes(d2)>(v, v);

#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)                     \
  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(pg, v);                                     \
#undef HWY_SVE_REDUCE

template <class D, class V>
template <class D, class V>
template <class D, class V>

#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE HWY_SVE_T(BASE, BITS)                         \
  NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) {           \
    return sv##OP##_##CHAR##BITS(mask, v);                 \
#undef HWY_SVE_GET_LANE

  const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i));
  return detail::InterleaveEven(v, v);
  return detail::InterleaveOdd(v, v);
#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE2
#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)      \
  HWY_API HWY_SVE_V(BASE, BITS)                                 \
  NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
    return sv##OP##_##CHAR##BITS(even, odd, 0);                 \
#undef HWY_SVE_ODD_EVEN

template <class V, HWY_IF_FLOAT_V(V)>
  const auto odd_in_even = detail::Ext<1>(odd, odd);
  return detail::InterleaveEven(even, odd_in_even);

#if HWY_TARGET == HWY_SVE_256
#elif HWY_TARGET == HWY_SVE2_128
  using TU = TFromD<decltype(du)>;
  constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
  const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
  const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
  const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));

template <class D, class VI>
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch");
  const auto indices = BitCast(du, vec);
#if HWY_IS_DEBUG_BUILD

template <class D, typename TI>
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");

#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)      \
  HWY_API HWY_SVE_V(BASE, BITS)                              \
  NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
    return sv##OP##_##CHAR##BITS(v, idx);                    \
#undef HWY_SVE_TABLE
template <typename T, size_t N, int kPow2>

#if HWY_TARGET == HWY_SVE_256
#elif HWY_TARGET == HWY_SVE2_128
  constexpr auto kLanesPerBlock =
  const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);

#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)       \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v);                            \
#undef HWY_SVE_REVERSE

template <class D, class V>
  const auto reversed = detail::ReverseFull(v);
  const svbool_t all_true = detail::AllPTrue(dfull);
  const svbool_t mask =
      svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d)));
  return detail::Splice(reversed, reversed, mask);

template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
#if HWY_TARGET == HWY_SVE2_128
  return detail::Ext<1>(v, v);
  const auto odd_in_even = detail::Ext<1>(v, v);
  return detail::InterleaveEven(odd_in_even, v);
  return detail::ReverseFull(v);
  const auto idx = detail::XorN(Iota(du, 0), 3);
  const auto idx = detail::XorN(Iota(du, 0), 7);

template <typename T>
struct CompressIsPartition {
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
  enum { value = (sizeof(T) == 8) };
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)                     \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
    return sv##OP##_##CHAR##BITS(mask, v);                                     \
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#undef HWY_SVE_COMPRESS

#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
  alignas(16) static constexpr uint64_t table[4 * 16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
      1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
      0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};

#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
  const svbool_t maskLL = svzip1_b64(mask, mask);
  return detail::Splice(v, v, AndNot(maskLL, mask));

template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
  static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
  const DFromV<V> d16;
  const svbool_t mask32L = svunpklo_b(mask16);
  const svbool_t mask32H = svunpkhi_b(mask16);
  const auto compressedL = Compress(v32L, mask32L);
  const auto compressedH = Compress(v32H, mask32H);
  const V evenL = BitCast(d16, compressedL);
  const V evenH = BitCast(d16, compressedH);
  const size_t countL = detail::CountTrueFull(dw, mask32L);
  const auto compressed_maskL = FirstN(d16, countL);
  return detail::Splice(v16H, v16L, compressed_maskL);

template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
  const svbool_t maskLL = svzip1_b64(mask, mask);
  return detail::Splice(v, v, AndNot(mask, maskLL));
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
  alignas(16) static constexpr uint64_t table[4 * 16] = {
      0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
      0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
      2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};

#if HWY_TARGET == HWY_SVE2_128
template <class V, class D>
template <class V, class D>
  const svbool_t store_mask = FirstN(d, count);
#if HWY_TARGET != HWY_SVE2_128
template <class D, class V>
  return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);

template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 1)>
  const svuint8_t idx_mod =
      svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
                  3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
                  6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
                  9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
                  12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
                  15 % kLanesPerBlock);
  return detail::LtN(BitCast(du, idx_mod), kLanes);
template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 2)>
  const svuint16_t idx_mod =
      svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
                   3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
                   6 % kLanesPerBlock, 7 % kLanesPerBlock);
  return detail::LtN(BitCast(du, idx_mod), kLanes);
template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 4)>
  const svuint32_t idx_mod =
      svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
                   3 % kLanesPerBlock);
  return detail::LtN(BitCast(du, idx_mod), kLanes);
template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 8)>
  const svuint64_t idx_mod =
      svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
  return detail::LtN(BitCast(du, idx_mod), kLanes);

template <size_t kBytes, class D, class V = VFromD<D>>
  const auto hi8 = BitCast(d8, hi);
  const auto lo8 = BitCast(d8, lo);
#if HWY_TARGET == HWY_SVE2_128
  return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
  const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
  const auto lo_down = detail::Ext<kBytes>(lo8, lo8);

  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  const svuint8_t v8 = BitCast(d8, v);
  return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  const svuint8_t v8 = BitCast(d8, v);
  return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  const svuint8_t v8 = BitCast(d8, v);
  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
  const svuint8_t v8 = BitCast(d8, v);
  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
template <class D, class V = VFromD<D>>
#if HWY_TARGET == HWY_SVE_256
#elif HWY_TARGET == HWY_SVE2_128

template <class V, class VI>
#if HWY_TARGET == HWY_SVE2_128
  const auto idx8 = Add(BitCast(du8, idx), offsets128);

template <class V, class VI>
  auto idx8 = BitCast(di8, idx);
  const auto msb = detail::LtN(idx8, 0);

#if HWY_TARGET == HWY_SVE2_128
#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)        \
  template <int kLane>                                             \
  HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v, kLane);                        \
#undef HWY_SVE_BROADCAST

template <int kLane, class V>
  static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
#if HWY_TARGET == HWY_SVE2_128
  return detail::Broadcast<kLane>(v);
  idx = detail::AddN(idx, kLane);

template <size_t kLanes, class D, class V = VFromD<D>>
  const auto zero = Zero(d);
  const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
#if HWY_TARGET == HWY_SVE2_128
  return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);

template <size_t kLanes, class V>
  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);

template <size_t kLanes, class D, class V = VFromD<D>>
#if HWY_TARGET == HWY_SVE2_128
  return detail::Ext<kLanes>(Zero(d), v);
  const auto shifted = detail::Ext<kLanes>(v, v);

template <int kBytes, class D, class V = VFromD<D>>
template <int kBytes, class V>
  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
template <int kBytes, class D, class V = VFromD<D>>

template <class V, class DW = RepartitionToWide<DFromV<V>>>
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
template <class V, class DW = RepartitionToWide<DFromV<V>>>

template <size_t N, int kPow2>
    const svuint16_t v) {
template <size_t N, int kPow2>
    svfloat32_t a, svfloat32_t b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
#if HWY_TARGET == HWY_SVE2
  return ShiftRight<1>(detail::AddN(Add(a, b), 1));

template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
  const svuint8_t iota = Iota(du, 0);
  const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
  const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
  const svuint8_t bit =
      svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);

template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
  const RebindToUnsigned<D> du;
  const Repartition<uint8_t, D> du8;
  const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
  const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
  const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
  const RebindToUnsigned<D> du;
  const Repartition<uint8_t, D> du8;
  const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
  const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
  const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  const RebindToUnsigned<D> du;
  CopyBytes<4>(bits, &mask_bits);
  const auto vbits = Set(du, mask_bits);
  const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));

template <class T, HWY_IF_LANE_SIZE(T, 1)>
  return svdup_n_u8_z(m, 1);
template <class T, HWY_IF_LANE_SIZE(T, 2)>
  const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
template <class T, HWY_IF_LANE_SIZE(T, 4)>
template <class T, HWY_IF_LANE_SIZE(T, 8)>
  const ScalableTag<uint32_t> d32;
  const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));

  svuint64_t bits_in_u64 =
  const size_t num_bits = Lanes(d);
  const size_t num_bytes = (num_bits + 8 - 1) / 8;
  const int mask = (1ull << num_bits) - 1;
  bits[0] = static_cast<uint8_t>(bits[0] & mask);
#if HWY_TARGET == HWY_SVE2
#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                            \
  NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                    \
#undef HWY_SVE_MUL_EVEN

template <class V, class DW = RepartitionToWide<DFromV<V>>>
#if HWY_TARGET == HWY_SVE2
  const auto lo = Mul(a, b);
  return BitCast(DW(), detail::InterleaveEven(lo, hi));
  const auto lo = Mul(a, b);
  return detail::InterleaveEven(lo, hi);
  const auto lo = Mul(a, b);
  return detail::InterleaveOdd(lo, hi);

template <size_t N, int kPow2>
    svuint16_t a, svuint16_t b,
    const svfloat32_t sum0,
    svfloat32_t& sum1) {
  const svuint16_t zero = Zero(du16);

#if defined(__ARM_FEATURE_SVE2_AES)
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#define HWY_NATIVE_AES
  const svuint8_t zero = svdup_n_u8(0);
  return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
  return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
  return svpmullb_pair(a, b);
  return svpmullt_pair(a, b);

#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , svbool_t m) { \
    return sv##OP##_b##BITS(m, m);                                      \

#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
  const svbool_t eqHx = Eq(a, b);
#if HWY_TARGET == HWY_SVE_256
  const svbool_t eqHx = Eq(a, b);
  const svbool_t ltHL = Lt(a, b);
  const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
  const svbool_t ltHL = Lt(a, b);
#if HWY_TARGET == HWY_SVE_256
#if HWY_TARGET == HWY_SVE_256
#undef HWY_IF_FLOAT_V
#undef HWY_IF_LANE_SIZE_V
#undef HWY_IF_SIGNED_V
#undef HWY_IF_UNSIGNED_V
#undef HWY_SVE_ALL_PTRUE
#undef HWY_SVE_FOREACH
#undef HWY_SVE_FOREACH_F
#undef HWY_SVE_FOREACH_F16
#undef HWY_SVE_FOREACH_F32
#undef HWY_SVE_FOREACH_F64
#undef HWY_SVE_FOREACH_I
#undef HWY_SVE_FOREACH_I08
#undef HWY_SVE_FOREACH_I16
#undef HWY_SVE_FOREACH_I32
#undef HWY_SVE_FOREACH_I64
#undef HWY_SVE_FOREACH_IF
#undef HWY_SVE_FOREACH_U
#undef HWY_SVE_FOREACH_U08
#undef HWY_SVE_FOREACH_U16
#undef HWY_SVE_FOREACH_U32
#undef HWY_SVE_FOREACH_U64
#undef HWY_SVE_FOREACH_UI
#undef HWY_SVE_FOREACH_UI08
#undef HWY_SVE_FOREACH_UI16
#undef HWY_SVE_FOREACH_UI32
#undef HWY_SVE_FOREACH_UI64
#undef HWY_SVE_FOREACH_UIF3264
#undef HWY_SVE_PTRUE
#undef HWY_SVE_RETV_ARGPV
#undef HWY_SVE_RETV_ARGPVN
#undef HWY_SVE_RETV_ARGPVV
#undef HWY_SVE_RETV_ARGV
#undef HWY_SVE_RETV_ARGVN
#undef HWY_SVE_RETV_ARGVV
#undef HWY_SVE_UNDEFINED
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:211
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:161
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_sve-inl.h:2483
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API VFromD< DW > ZipLower(const V a, const V b)
Definition: arm_sve-inl.h:2477
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API VFromD< D > ConcatOdd(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1394
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
constexpr HWY_API bool IsSame()
Definition: base.h:322
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
constexpr HWY_API bool IsSigned()
Definition: base.h:534
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: arm_sve-inl.h:40
Definition: ops/shared-inl.h:40
uint16_t bits
Definition: base.h:252