return *this = (*this * other);
return *this = (*this / other);
return *this = (*this + other);
return *this = (*this - other);
return *this = (*this & other);
return *this = (*this | other);
return *this = (*this ^ other);
using DFromV = decltype(detail::Deduce1()(V()));
using TFromV = TFromD<DFromV<V>>;
template <typename T, typename FromT>
static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
CopyBytes<sizeof(FromT)>(&v.raw, &to);
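// --- Illustrative sketch, not part of scalar-inl.h ---
// BitCast above copies raw bytes into the destination type instead of using
// reinterpret_cast, which would violate strict aliasing. A standalone analog
// using std::memcpy; the helper name BitCastViaMemcpy is ours, not Highway's.
#include <cstdint>
#include <cstring>

template <typename To, typename From>
To BitCastViaMemcpy(const From& from) {
  static_assert(sizeof(To) <= sizeof(From), "Shrinking or same size only");
  To to;
  std::memcpy(&to, &from, sizeof(To));  // well-defined type punning
  return to;
}
// Usage: inspect the bit pattern of a float.
// const uint32_t bits = BitCastViaMemcpy<uint32_t>(1.0f);  // 0x3F800000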
template <typename T>
template <typename T, typename T2>
return Vec1<T>(static_cast<T>(t));
template <typename T>
template <typename T, typename T2>
return Vec1<T>(static_cast<T>(first));
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
return Or(o1, Or(o2, o3));
template <typename T>
return Or(o, And(a1, a2));
template <typename T>
template <typename T>
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
template <typename T>
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
template <typename T>
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#define HWY_NATIVE_POPCNT
template <typename T>
template <typename TFrom, typename TTo>
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
return mask.bits ? yes : no;
template <typename T>
template <typename T>
template <typename T>
return v.raw < 0 ? yes : no;
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <int kBits, typename T>
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
template <int kBits, typename T>
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
return Vec1<T>(static_cast<T>(v.raw >> kBits));
const TU shifted = BitCast(du, v).raw >> kBits;
const size_t sign_shift = static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
const TU upper = static_cast<TU>(sign << sign_shift);
return Vec1<T>(static_cast<T>(v.raw >> kBits));
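// --- Illustrative sketch, not part of scalar-inl.h ---
// Before C++20, >> on negative signed values is implementation-defined, so the
// pre-C++20 branch above emulates an arithmetic shift with unsigned (logical)
// shifts plus replicated sign bits. A standalone int32_t version of the same
// idea; the function name ArithmeticShiftRight is ours.
#include <cstdint>
#include <cstring>

int32_t ArithmeticShiftRight(int32_t x, int shift) {  // requires 0 <= shift < 32
  uint32_t u;
  std::memcpy(&u, &x, sizeof(u));
  const uint32_t shifted = u >> shift;               // logical (zero-fill) shift
  const uint32_t sign = (x < 0) ? ~uint32_t{0} : 0;  // all ones if negative
  const uint32_t upper = sign << (31 - shift);       // replicate the sign bit
  const uint32_t result = shifted | upper;
  int32_t out;
  std::memcpy(&out, &result, sizeof(out));
  return out;
}
// ArithmeticShiftRight(-16, 2) == -4, matching an arithmetic >> on common targets.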
template <typename T>
return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
template <typename T>
template <int kBits, typename T>
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
template <typename T>
template <typename T>
#if __cplusplus >= 202002L
return Vec1<T>(static_cast<T>(v.raw >> bits));
const TU shifted = BitCast(du, v).raw >> bits;
const size_t sign_shift = static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
const TU upper = static_cast<TU>(sign << sign_shift);
return Vec1<T>(static_cast<T>(v.raw >> bits));
template <typename T>
template <typename T>
template <typename T>
const uint64_t a64 = static_cast<uint64_t>(a.raw);
const uint64_t b64 = static_cast<uint64_t>(b.raw);
return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
template <typename T>
const uint64_t a64 = static_cast<uint64_t>(a.raw);
const uint64_t b64 = static_cast<uint64_t>(b.raw);
return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
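// --- Illustrative sketch, not part of scalar-inl.h ---
// The additions/subtractions above widen to uint64_t so that overflow wraps
// modulo 2^N instead of being undefined behavior for signed T. The same idea
// for a plain int16_t; the helper name WrappingAddI16 is ours.
#include <cstdint>

int16_t WrappingAddI16(int16_t a, int16_t b) {
  const uint64_t sum = static_cast<uint64_t>(a) + static_cast<uint64_t>(b);
  const uint64_t wrapped = sum & 0xFFFFu;  // keep only the low 16 bits
  // Narrowing back to int16_t is modular since C++20 (and behaves that way on
  // common implementations before that).
  return static_cast<int16_t>(wrapped);
}
// WrappingAddI16(32767, 1) == -32768 (wraps instead of overflowing).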
template <typename T>
return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
template <typename T, HWY_IF_NOT_FLOAT(T)>
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
if (std::isnan(a.raw)) return b;
if (std::isnan(b.raw)) return a;
return Vec1<T>(HWY_MIN(a.raw, b.raw));
template <typename T, HWY_IF_NOT_FLOAT(T)>
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
if (std::isnan(a.raw)) return b;
if (std::isnan(b.raw)) return a;
return Vec1<T>(HWY_MAX(a.raw, b.raw));
template <typename T, HWY_IF_FLOAT(T)>
template <typename T, HWY_IF_NOT_FLOAT(T)>
return Zero(Sisd<T>()) - v;
template <typename T, HWY_IF_FLOAT(T)>
template <typename T, HWY_IF_SIGNED(T)>
return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
template <typename T, HWY_IF_UNSIGNED(T)>
return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
template <typename T>
(static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
const int64_t a64 = a.raw;
const uint64_t a64 = a.raw;
template <typename T>
return mul * x + add;
template <typename T>
return add - mul * x;
template <typename T>
return mul * x - sub;
template <typename T>
return Neg(mul) * x - sub;
const float half = f * 0.5f;
CopyBytes<4>(&f, &bits);
bits = 0x5F3759DF - (bits >> 1);
CopyBytes<4>(&bits, &f);
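// --- Illustrative sketch, not part of scalar-inl.h ---
// The 0x5F3759DF line above is the classic "fast inverse square root" bit
// trick: reinterpret the float's bits as an integer to get a cheap initial
// guess of 1/sqrt(x), then refine it with one Newton-Raphson step. A
// standalone version; the function name FastRsqrt is ours, and accuracy is
// only a few parts per thousand.
#include <cstdint>
#include <cstring>

float FastRsqrt(float x) {            // assumes x > 0
  const float half = 0.5f * x;
  uint32_t bits;
  std::memcpy(&bits, &x, 4);
  bits = 0x5F3759DF - (bits >> 1);    // initial approximation of 1/sqrt(x)
  float y;
  std::memcpy(&y, &bits, 4);
  y = y * (1.5f - half * y * y);      // one Newton-Raphson refinement step
  return y;
}
// FastRsqrt(4.0f) is roughly 0.499, versus the exact 0.5.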
template <typename T>
if (!(Abs(v).raw < MantissaEnd<T>())) {
const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
const TI rounded = static_cast<TI>(v.raw + bias);
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
return Vec1<T>(static_cast<T>(rounded));
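// --- Illustrative sketch, not part of scalar-inl.h ---
// The rounding above implements round-half-to-even ("banker's rounding")
// without touching the FPU rounding mode: bias by +/-0.5, truncate, then back
// off by one when the result is odd and the input was exactly halfway. A
// standalone double version; the name RoundHalfToEven is ours and inputs are
// assumed to fit in int64_t.
#include <cmath>
#include <cstdint>

double RoundHalfToEven(double v) {
  const double bias = v < 0.0 ? -0.5 : 0.5;
  int64_t rounded = static_cast<int64_t>(v + bias);
  // Exactly halfway and we landed on an odd integer: step toward the even
  // neighbor instead.
  if ((rounded & 1) && std::abs(static_cast<double>(rounded) - v) == 0.5) {
    rounded -= (v < 0.0) ? -1 : 1;
  }
  return static_cast<double>(rounded);
}
// RoundHalfToEven(2.5) == 2.0 and RoundHalfToEven(3.5) == 4.0.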
const T abs = Abs(v).raw;
const bool signbit = std::signbit(v.raw);
if (!(abs < MantissaEnd<T>())) {
if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
const TI rounded = static_cast<TI>(v.raw + bias);
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<TI>(rounded - (signbit ? -1 : 1));
template <typename T>
if (!(Abs(v).raw <= MantissaEnd<T>())) {
const TI truncated = static_cast<TI>(v.raw);
return Vec1<T>(static_cast<T>(truncated));
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
const Bits kExponentMask = (1ull << kExponentBits) - 1;
const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
const Bits kBias = kExponentMask / 2;
const bool positive = f > Float(0.0);
CopyBytes<sizeof(Bits)>(&v, &bits);
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
if (exponent >= kMantissaBits) return v;
if (exponent < 0) return positive ? V(1) : V(-0.0);
const Bits mantissa_mask = kMantissaMask >> exponent;
if ((bits & mantissa_mask) == 0) return v;
if (positive) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &f);
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
const Bits kExponentMask = (1ull << kExponentBits) - 1;
const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
const Bits kBias = kExponentMask / 2;
const bool negative = f < Float(0.0);
CopyBytes<sizeof(Bits)>(&v, &bits);
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
if (exponent >= kMantissaBits) return v;
if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
const Bits mantissa_mask = kMantissaMask >> exponent;
if ((bits & mantissa_mask) == 0) return v;
if (negative) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &f);
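// --- Illustrative sketch, not part of scalar-inl.h ---
// Ceiling/Floor above round by clearing the fractional mantissa bits, after
// first bumping the magnitude for values that must move away from zero. The
// same style for float (23 mantissa bits, 8 exponent bits); the name CeilF32
// is ours, and Inf/NaN inputs are not treated specially here.
#include <cstdint>
#include <cstring>

float CeilF32(float f) {
  const uint32_t kExponentMask = (1u << 8) - 1;   // 0xFF
  const uint32_t kMantissaMask = (1u << 23) - 1;  // fractional bits at exponent 0

  const bool positive = f > 0.0f;
  uint32_t bits;
  std::memcpy(&bits, &f, 4);
  const int exponent = static_cast<int>((bits >> 23) & kExponentMask) - 127;
  if (exponent >= 23) return f;                       // already integral (or Inf/NaN)
  if (exponent < 0) return positive ? 1.0f : -0.0f;   // |f| < 1

  const uint32_t mantissa_mask = kMantissaMask >> exponent;  // fractional bits
  if ((bits & mantissa_mask) == 0) return f;                 // already an integer
  if (positive) bits += (kMantissaMask + 1) >> exponent;     // bump magnitude up
  bits &= ~mantissa_mask;                                    // drop the fraction
  std::memcpy(&f, &bits, 4);
  return f;
}
// CeilF32(2.25f) == 3.0f and CeilF32(-2.25f) == -2.0f.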
return Ceiling<float, uint32_t, 23, 8>(v);
return Ceiling<double, uint64_t, 52, 11>(v);
return Floor<float, uint32_t, 23, 8>(v);
return Floor<double, uint64_t, 52, 11>(v);
template <typename T>
template <typename T>
template <typename T>
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
return (v & bit) == bit;
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
memcpy(&bits, &v, sizeof(v));
return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
template <typename T>
CopyBytes<sizeof(T)>(aligned, &t);
template <typename T>
template <typename T>
template <typename T>
return Load(d, aligned);
template <typename T>
CopyBytes<sizeof(T)>(&v.raw, aligned);
template <typename T>
template <typename T>
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
template <typename T>
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
template <typename T>
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
v2 = LoadU(d, unaligned + 2);
template <typename T>
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
v2 = LoadU(d, unaligned + 2);
v3 = LoadU(d, unaligned + 3);
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
return Store(v, d, reinterpret_cast<T*>(base8));
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
const intptr_t addr = reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
return Load(d, reinterpret_cast<const T*>(addr));
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
template <typename FromT, typename ToT>
static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
if (std::isinf(from.raw) ||
if (std::isinf(from.raw) ||
std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
: HighestValue<int32_t>());
template <typename FromT, typename ToT>
static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
#if HWY_NATIVE_FLOAT16
CopyBytes<2>(&v.raw, &bits16);
const uint16_t bits16 = v.raw.bits;
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
const uint32_t mantissa = bits16 & 0x3FF;
if (biased_exp == 0) {
const float subnormal = (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
return Vec1<float>(sign ? -subnormal : subnormal);
const uint32_t biased_exp32 = biased_exp + (127 - 15);
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
CopyBytes<4>(&bits32, &out);
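// --- Illustrative sketch, not part of scalar-inl.h ---
// The float16 -> float32 path above splits the 16-bit pattern into sign,
// 5-bit exponent and 10-bit mantissa, then rebiases the exponent (127 - 15)
// and widens the mantissa (23 - 10 bits). A standalone version for normal and
// subnormal inputs; the name F32FromF16Bits is ours, and Inf/NaN are not
// handled.
#include <cstdint>
#include <cstring>

float F32FromF16Bits(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

  if (biased_exp == 0) {  // subnormal or zero: value = mantissa * 2^-24
    const float subnormal =
        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
    return sign ? -subnormal : subnormal;
  }

  const uint32_t biased_exp32 = biased_exp + (127 - 15);  // rebias exponent
  const uint32_t mantissa32 = mantissa << (23 - 10);      // widen mantissa
  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
  float out;
  std::memcpy(&out, &bits32, 4);
  return out;
}
// F32FromF16Bits(0x3C00) == 1.0f and F32FromF16Bits(0xC000) == -2.0f.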
CopyBytes<4>(&v.raw, &bits32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
#if HWY_NATIVE_FLOAT16
const uint16_t zero = 0;
CopyBytes<2>(&zero, &out.raw);
uint32_t biased_exp16, mantissa16;
const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp)));
biased_exp16 = static_cast<uint32_t>(exp + 15);
HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
mantissa16 = mantissa32 >> 13;
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
#if HWY_NATIVE_FLOAT16
const uint16_t narrowed = static_cast<uint16_t>(bits16);
CopyBytes<2>(&narrowed, &out.raw);
out.raw.bits = static_cast<uint16_t>(bits16);
template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
const double f = static_cast<double>(from.raw);
if (std::isinf(from.raw) ||
std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>() : LimitsMax<ToT>());
template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
return Vec1<ToT>(static_cast<ToT>(from.raw));
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T, typename TI>
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
template <typename T, typename TI>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <int kLane, typename T>
static_assert(kLane == 0, "Scalar only has one lane");
template <typename T, typename TI>
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
CopyBytes<sizeof(T)>(&in, &in_bytes);
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = in_bytes[idx_bytes[i]];
CopyBytes<sizeof(TI)>(&out_bytes, &out);
template <typename T, typename TI>
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
CopyBytes<sizeof(T)>(&in, &in_bytes);
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
CopyBytes<sizeof(TI)>(&out_bytes, &out);
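// --- Illustrative sketch, not part of scalar-inl.h ---
// TableLookupBytes above treats each lane as an array of bytes and shuffles
// them by index; the ...Or0 variant additionally zeros any byte whose index
// has the 0x80 bit set, mirroring SSE/NEON byte-shuffle semantics. The same
// idea on a single uint32_t; the name ShuffleBytesOr0 is ours, and the indices
// are masked with & 3 to stay in bounds.
#include <cstdint>
#include <cstring>

uint32_t ShuffleBytesOr0(uint32_t in, uint32_t indices) {
  uint8_t in_bytes[4], idx_bytes[4], out_bytes[4];
  std::memcpy(in_bytes, &in, 4);
  std::memcpy(idx_bytes, &indices, 4);
  for (int i = 0; i < 4; ++i) {
    // Index MSB set => zero that byte; otherwise select in_bytes[index & 3].
    out_bytes[i] = (idx_bytes[i] & 0x80) ? 0 : in_bytes[idx_bytes[i] & 3];
  }
  uint32_t out;
  std::memcpy(&out, out_bytes, 4);
  return out;
}
// On a little-endian machine, ShuffleBytesOr0(0x44332211, 0x80000102) reverses
// the low three bytes and zeros the top one, yielding 0x00112233.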
template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
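// --- Illustrative sketch, not part of scalar-inl.h ---
// ZipLower above packs two narrow lanes into one lane of twice the width:
// b becomes the upper half and a the lower half. For uint16_t lanes; the
// helper name ZipU16 is ours.
#include <cstdint>

uint32_t ZipU16(uint16_t a, uint16_t b) {
  return (static_cast<uint32_t>(b) << 16) + a;  // b = upper half, a = lower half
}
// ZipU16(0x1111, 0x2222) == 0x22221111.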
template <typename T>
return mask.bits == 0;
template <typename T>
return mask.bits != 0;
template <typename T>
template <typename T>
template <typename T>
return mask.bits == 0 ? 0 : 1;
template <typename T>
return mask.bits == 0 ? -1 : 0;
template <typename T>
struct CompressIsPartition {
template <typename T>
template <typename T>
template <typename T>
template <typename T>
if (!mask.bits) return 0;
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
HWY_API auto Le(V a, V b) -> decltype(a == b) {