21 #include <wasm_simd128.h>
26 #ifdef HWY_WASM_OLD_NAMES
27 #define wasm_i8x16_shuffle wasm_v8x16_shuffle
28 #define wasm_i16x8_shuffle wasm_v16x8_shuffle
29 #define wasm_i32x4_shuffle wasm_v32x4_shuffle
30 #define wasm_i64x2_shuffle wasm_v64x2_shuffle
31 #define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
32 #define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
33 #define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
34 #define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
35 #define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
36 #define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
37 #define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
38 #define wasm_u8x16_add_sat wasm_u8x16_add_saturate
39 #define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
40 #define wasm_u16x8_add_sat wasm_u16x8_add_saturate
41 #define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
42 #define wasm_i8x16_add_sat wasm_i8x16_add_saturate
43 #define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
44 #define wasm_i16x8_add_sat wasm_i16x8_add_saturate
45 #define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
65 template <
typename T,
size_t N = 16 /
sizeof(T)>
73 return *
this = (*
this * other);
76 return *
this = (*
this / other);
79 return *
this = (*
this + other);
82 return *
this = (*
this - other);
85 return *
this = (*
this & other);
88 return *
this = (*
this | other);
91 return *
this = (*
this ^ other);
98 using Vec64 = Vec128<T, 8 /
sizeof(T)>;
100 template <
typename T>
101 using Vec32 = Vec128<T, 4 /
sizeof(T)>;
104 template <
typename T,
size_t N = 16 /
sizeof(T)>
113 template <
typename T,
size_t N>
122 using DFromV = decltype(detail::DeduceD()(V()));
125 using TFromV = TFromD<DFromV<V>>;
133 return static_cast<__v128_u
>(
v);
136 return static_cast<__v128_u
>(
v);
139 template <
typename T,
size_t N>
145 template <
typename T>
154 template <
typename T,
size_t N>
162 template <
typename T,
size_t N,
typename FromT>
164 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
171 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
175 template <
size_t N, HWY_IF_LE128(
float, N)>
186 template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
190 template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
195 template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
200 template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
206 template <
size_t N, HWY_IF_LE128(
int8_t, N)>
210 template <
size_t N, HWY_IF_LE128(
int16_t, N)>
214 template <
size_t N, HWY_IF_LE128(
int32_t, N)>
218 template <
size_t N, HWY_IF_LE128(
int64_t, N)>
223 template <
size_t N, HWY_IF_LE128(
float, N)>
232 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
240 template <
typename T,
size_t N,
typename T2>
241 Vec128<T, N>
Iota(
const Simd<T, N, 0>
d,
const T2 first) {
243 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
244 lanes[i] =
static_cast<T
>(first +
static_cast<T2
>(i));
246 return Load(
d, lanes);
457 template <
int kBits,
size_t N>
461 template <
int kBits,
size_t N>
465 template <
int kBits,
size_t N>
469 template <
int kBits,
size_t N>
473 template <
int kBits,
size_t N>
477 template <
int kBits,
size_t N>
483 template <
int kBits,
size_t N>
487 template <
int kBits,
size_t N>
491 template <
int kBits,
size_t N>
495 template <
int kBits,
size_t N>
499 template <
int kBits,
size_t N>
503 template <
int kBits,
size_t N>
509 template <
int kBits,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
516 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
519 template <
int kBits,
size_t N>
525 return shifted &
Set(d8, 0xFF >> kBits);
528 template <
int kBits,
size_t N>
533 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
534 return (shifted ^ shifted_sign) - shifted_sign;
538 template <
int kBits,
typename T,
size_t N>
540 constexpr
size_t kSizeInBits =
sizeof(T) * 8;
541 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
542 if (kBits == 0)
return v;
543 return Or(ShiftRight<kBits>(
v), ShiftLeft<kSizeInBits - kBits>(
v));
617 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
623 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
633 return shifted &
Set(d8, 0xFF >> bits);
641 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> bits));
642 return (shifted ^ shifted_sign) - shifted_sign;
664 HWY_API Vec128<uint64_t, N>
Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
666 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0));
667 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
668 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1));
669 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
671 return Vec128<uint64_t, N>{wasm_v128_load(min)};
688 HWY_API Vec128<int64_t, N>
Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
689 alignas(16) int64_t min[4];
690 min[0] =
HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
691 wasm_i64x2_extract_lane(b.raw, 0));
692 min[1] =
HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
693 wasm_i64x2_extract_lane(b.raw, 1));
694 return Vec128<int64_t, N>{wasm_v128_load(min)};
719 HWY_API Vec128<uint64_t, N>
Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
721 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0));
722 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
723 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1));
724 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
726 return Vec128<uint64_t, N>{wasm_v128_load(max)};
743 HWY_API Vec128<int64_t, N>
Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
744 alignas(16) int64_t max[2];
745 max[0] =
HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
746 wasm_i64x2_extract_lane(b.raw, 0));
747 max[1] =
HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
748 wasm_i64x2_extract_lane(b.raw, 1));
749 return Vec128<int64_t, N>{wasm_v128_load(max)};
787 const Vec128<uint16_t, N> b) {
789 const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
790 const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
791 const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
792 const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
793 const auto l = wasm_i32x4_mul(al, bl);
794 const auto h = wasm_i32x4_mul(ah, bh);
796 return Vec128<uint16_t, N>{
797 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
801 const Vec128<int16_t, N> b) {
803 const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
804 const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
805 const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
806 const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
807 const auto l = wasm_i32x4_mul(al, bl);
808 const auto h = wasm_i32x4_mul(ah, bh);
810 return Vec128<int16_t, N>{
811 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
816 Vec128<int16_t, N> b) {
820 const Vec128<uint16_t, N> lo =
BitCast(du,
Mul(a, b));
821 const Vec128<int16_t, N> hi =
MulHigh(a, b);
825 const Vec128<uint16_t, N> lo_top2 = ShiftRight<14>(lo);
827 const Vec128<uint16_t, N> rounding = ShiftRight<1>(
Add(lo_top2,
Set(du, 1)));
833 HWY_API Vec128<int64_t, (
N + 1) / 2>
MulEven(
const Vec128<int32_t, N> a,
834 const Vec128<int32_t, N> b) {
836 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
837 const auto ae = wasm_v128_and(a.raw, kEvenMask);
838 const auto be = wasm_v128_and(b.raw, kEvenMask);
839 return Vec128<int64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
842 HWY_API Vec128<uint64_t, (
N + 1) / 2>
MulEven(
const Vec128<uint32_t, N> a,
843 const Vec128<uint32_t, N> b) {
845 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
846 const auto ae = wasm_v128_and(a.raw, kEvenMask);
847 const auto be = wasm_v128_and(b.raw, kEvenMask);
848 return Vec128<uint64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
853 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
884 const Vec128<float, N> b) {
885 return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
891 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
898 const Vec128<float, N> b) {
907 const Vec128<float, N> x,
908 const Vec128<float, N> add) {
911 return mul * x + add;
917 const Vec128<float, N> x,
918 const Vec128<float, N> add) {
920 return add - mul * x;
926 const Vec128<float, N> x,
927 const Vec128<float, N> sub) {
930 return mul * x - sub;
936 const Vec128<float, N> x,
937 const Vec128<float, N> sub) {
939 return Neg(mul) * x - sub;
946 HWY_API Vec128<float, N>
Sqrt(
const Vec128<float, N>
v) {
947 return Vec128<float, N>{wasm_f32x4_sqrt(
v.raw)};
954 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
955 return one /
Sqrt(
v);
963 return Vec128<float, N>{wasm_f32x4_nearest(
v.raw)};
969 return Vec128<float, N>{wasm_f32x4_trunc(
v.raw)};
974 HWY_API Vec128<float, N>
Ceil(
const Vec128<float, N>
v) {
975 return Vec128<float, N>{wasm_f32x4_ceil(
v.raw)};
981 return Vec128<float, N>{wasm_f32x4_floor(
v.raw)};
985 template <
typename T,
size_t N>
990 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
992 const Simd<T, N, 0>
d;
1000 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1002 const Simd<T, N, 0>
d;
1009 const VFromD<decltype(di)> exp =
1018 template <
typename TFrom,
typename TTo,
size_t N>
1020 Mask128<TFrom, N> m) {
1021 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1022 return Mask128<TTo, N>{m.raw};
1025 template <
typename T,
size_t N>
1027 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1028 return (
v & bit) == bit;
1180 const auto a32 =
BitCast(d32, a);
1181 const auto b32 =
BitCast(d32, b);
1183 const auto m_gt = a32 > b32;
1186 const auto m_eq = a32 == b32;
1187 const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
1190 const auto gt =
Or(lo_gt, m_gt);
1201 template <
typename T,
size_t N>
1202 HWY_API Mask128<T, N>
operator<(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1222 template <
typename T,
size_t N>
1223 HWY_API Mask128<T, N>
FirstN(
const Simd<T, N, 0>
d,
size_t num) {
1232 template <
typename T,
size_t N>
1234 return Vec128<T, N>{wasm_v128_not(
v.raw)};
1239 template <
typename T,
size_t N>
1240 HWY_API Vec128<T, N>
And(Vec128<T, N> a, Vec128<T, N> b) {
1241 return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
1247 template <
typename T,
size_t N>
1248 HWY_API Vec128<T, N>
AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
1249 return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
1254 template <
typename T,
size_t N>
1255 HWY_API Vec128<T, N>
Or(Vec128<T, N> a, Vec128<T, N> b) {
1256 return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
1261 template <
typename T,
size_t N>
1262 HWY_API Vec128<T, N>
Xor(Vec128<T, N> a, Vec128<T, N> b) {
1263 return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
1268 template <
typename T,
size_t N>
1269 HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
1270 return Or(o1,
Or(o2, o3));
1275 template <
typename T,
size_t N>
1276 HWY_API Vec128<T, N>
OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
1277 return Or(o,
And(a1, a2));
1282 template <
typename T,
size_t N>
1290 template <
typename T,
size_t N>
1291 HWY_API Vec128<T, N>
operator&(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1295 template <
typename T,
size_t N>
1296 HWY_API Vec128<T, N>
operator|(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1300 template <
typename T,
size_t N>
1301 HWY_API Vec128<T, N>
operator^(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1307 template <
typename T,
size_t N>
1309 const Vec128<T, N> sign) {
1310 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
1315 template <
typename T,
size_t N>
1317 const Vec128<T, N> sign) {
1318 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
1324 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
1337 template <
typename T,
size_t N>
1339 return Mask128<T, N>{
v.raw};
1342 template <
typename T,
size_t N>
1344 return Vec128<T, N>{
v.raw};
1348 template <
typename T,
size_t N>
1351 return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
1355 template <
typename T,
size_t N>
1361 template <
typename T,
size_t N>
1366 template <
typename T,
size_t N>
1369 static_assert(IsSigned<T>(),
"Only works for signed/float");
1377 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1380 const auto zero =
Zero(
d);
1386 template <
typename T,
size_t N>
1387 HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
1391 template <
typename T,
size_t N>
1392 HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
1393 const Simd<T, N, 0>
d;
1397 template <
typename T,
size_t N>
1398 HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
1399 const Simd<T, N, 0>
d;
1403 template <
typename T,
size_t N>
1404 HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
1405 const Simd<T, N, 0>
d;
1409 template <
typename T,
size_t N>
1410 HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
1411 const Simd<T, N, 0>
d;
1425 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1432 test = ShiftLeft<12>(test);
1435 test = ShiftLeft<1>(test);
1439 test = ShiftLeft<1>(test);
1443 test = ShiftLeft<1>(test);
1450 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1457 test = ShiftLeft<27>(test);
1460 test = ShiftLeft<1>(test);
1464 test = ShiftLeft<1>(test);
1468 test = ShiftLeft<1>(test);
1472 test = ShiftLeft<1>(test);
1479 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1482 alignas(16) T lanes[2];
1483 alignas(16) T bits_lanes[2];
1485 Store(bits,
d, bits_lanes);
1486 lanes[0] <<= bits_lanes[0];
1487 lanes[1] <<= bits_lanes[1];
1488 return Load(
d, lanes);
1493 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1500 test = ShiftLeft<12>(test);
1503 test = ShiftLeft<1>(test);
1507 test = ShiftLeft<1>(test);
1511 test = ShiftLeft<1>(test);
1518 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1525 test = ShiftLeft<27>(test);
1528 test = ShiftLeft<1>(test);
1532 test = ShiftLeft<1>(test);
1536 test = ShiftLeft<1>(test);
1540 test = ShiftLeft<1>(test);
1551 template <
typename T>
1553 return Vec128<T>{wasm_v128_load(aligned)};
1556 template <
typename T,
size_t N>
1563 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1566 CopyBytes<sizeof(T) * N>(p, &
v);
1571 template <
typename T,
size_t N>
1577 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1584 template <
typename T>
1586 wasm_v128_store(aligned,
v.raw);
1590 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1592 CopyBytes<sizeof(T) * N>(&
v, p);
1597 *p = wasm_f32x4_extract_lane(
v.raw, 0);
1601 template <
typename T,
size_t N>
1606 template <
typename T,
size_t N>
1616 template <
typename T,
size_t N>
1619 wasm_v128_store(aligned,
v.raw);
1624 template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
1627 const Vec128<Offset, N> offset) {
1628 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1630 alignas(16) T lanes[
N];
1633 alignas(16) Offset offset_lanes[
N];
1634 Store(offset,
Rebind<Offset, decltype(
d)>(), offset_lanes);
1636 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
1637 for (
size_t i = 0; i <
N; ++i) {
1638 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1642 template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
1644 const Vec128<Index, N> index) {
1645 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1647 alignas(16) T lanes[
N];
1650 alignas(16) Index index_lanes[
N];
1651 Store(index,
Rebind<Index, decltype(
d)>(), index_lanes);
1653 for (
size_t i = 0; i <
N; ++i) {
1654 base[index_lanes[i]] = lanes[i];
1660 template <
typename T,
size_t N,
typename Offset>
1663 const Vec128<Offset, N> offset) {
1664 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1666 alignas(16) Offset offset_lanes[
N];
1667 Store(offset,
Rebind<Offset, decltype(
d)>(), offset_lanes);
1669 alignas(16) T lanes[
N];
1670 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
1671 for (
size_t i = 0; i <
N; ++i) {
1672 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1674 return Load(
d, lanes);
1677 template <
typename T,
size_t N,
typename Index>
1680 const Vec128<Index, N> index) {
1681 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1683 alignas(16) Index index_lanes[
N];
1684 Store(index,
Rebind<Index, decltype(
d)>(), index_lanes);
1686 alignas(16) T lanes[
N];
1687 for (
size_t i = 0; i <
N; ++i) {
1688 lanes[i] = base[index_lanes[i]];
1690 return Load(
d, lanes);
1699 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1701 return static_cast<T
>(wasm_i8x16_extract_lane(
v.raw, kLane));
1703 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1705 return static_cast<T
>(wasm_i16x8_extract_lane(
v.raw, kLane));
1707 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1709 return static_cast<T
>(wasm_i32x4_extract_lane(
v.raw, kLane));
1711 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1713 return static_cast<T
>(wasm_i64x2_extract_lane(
v.raw, kLane));
1716 template <
size_t kLane,
size_t N>
1718 return wasm_f32x4_extract_lane(
v.raw, kLane);
1726 template <
typename T>
1733 template <
typename T>
1735 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1736 if (__builtin_constant_p(i)) {
1739 return detail::ExtractLane<0>(
v);
1741 return detail::ExtractLane<1>(
v);
1745 alignas(16) T lanes[2];
1750 template <
typename T>
1752 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1753 if (__builtin_constant_p(i)) {
1756 return detail::ExtractLane<0>(
v);
1758 return detail::ExtractLane<1>(
v);
1760 return detail::ExtractLane<2>(
v);
1762 return detail::ExtractLane<3>(
v);
1766 alignas(16) T lanes[4];
1771 template <
typename T>
1773 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1774 if (__builtin_constant_p(i)) {
1777 return detail::ExtractLane<0>(
v);
1779 return detail::ExtractLane<1>(
v);
1781 return detail::ExtractLane<2>(
v);
1783 return detail::ExtractLane<3>(
v);
1785 return detail::ExtractLane<4>(
v);
1787 return detail::ExtractLane<5>(
v);
1789 return detail::ExtractLane<6>(
v);
1791 return detail::ExtractLane<7>(
v);
1795 alignas(16) T lanes[8];
1800 template <
typename T>
1802 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1803 if (__builtin_constant_p(i)) {
1806 return detail::ExtractLane<0>(
v);
1808 return detail::ExtractLane<1>(
v);
1810 return detail::ExtractLane<2>(
v);
1812 return detail::ExtractLane<3>(
v);
1814 return detail::ExtractLane<4>(
v);
1816 return detail::ExtractLane<5>(
v);
1818 return detail::ExtractLane<6>(
v);
1820 return detail::ExtractLane<7>(
v);
1822 return detail::ExtractLane<8>(
v);
1824 return detail::ExtractLane<9>(
v);
1826 return detail::ExtractLane<10>(
v);
1828 return detail::ExtractLane<11>(
v);
1830 return detail::ExtractLane<12>(
v);
1832 return detail::ExtractLane<13>(
v);
1834 return detail::ExtractLane<14>(
v);
1836 return detail::ExtractLane<15>(
v);
1840 alignas(16) T lanes[16];
1846 template <
typename T,
size_t N>
1848 return detail::ExtractLane<0>(
v);
1855 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1857 static_assert(kLane <
N,
"Lane index out of bounds");
1859 wasm_i8x16_replace_lane(
v.raw, kLane,
static_cast<int8_t
>(t))};
1862 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1864 static_assert(kLane <
N,
"Lane index out of bounds");
1866 wasm_i16x8_replace_lane(
v.raw, kLane,
static_cast<int16_t
>(t))};
1869 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1871 static_assert(kLane <
N,
"Lane index out of bounds");
1872 return Vec128<T, N>{
1873 wasm_i32x4_replace_lane(
v.raw, kLane,
static_cast<int32_t
>(t))};
1876 template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1878 static_assert(kLane <
N,
"Lane index out of bounds");
1879 return Vec128<T, N>{
1880 wasm_i64x2_replace_lane(
v.raw, kLane,
static_cast<int64_t
>(t))};
1883 template <
size_t kLane,
size_t N>
1885 static_assert(kLane <
N,
"Lane index out of bounds");
1889 template <
size_t kLane,
size_t N>
1891 static_assert(kLane < 2,
"Lane index out of bounds");
1900 template <
typename T>
1907 template <
typename T>
1909 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1910 if (__builtin_constant_p(i)) {
1913 return detail::InsertLane<0>(
v, t);
1915 return detail::InsertLane<1>(
v, t);
1920 alignas(16) T lanes[2];
1923 return Load(
d, lanes);
1926 template <
typename T>
1928 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1929 if (__builtin_constant_p(i)) {
1932 return detail::InsertLane<0>(
v, t);
1934 return detail::InsertLane<1>(
v, t);
1936 return detail::InsertLane<2>(
v, t);
1938 return detail::InsertLane<3>(
v, t);
1943 alignas(16) T lanes[4];
1946 return Load(
d, lanes);
1949 template <
typename T>
1951 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1952 if (__builtin_constant_p(i)) {
1955 return detail::InsertLane<0>(
v, t);
1957 return detail::InsertLane<1>(
v, t);
1959 return detail::InsertLane<2>(
v, t);
1961 return detail::InsertLane<3>(
v, t);
1963 return detail::InsertLane<4>(
v, t);
1965 return detail::InsertLane<5>(
v, t);
1967 return detail::InsertLane<6>(
v, t);
1969 return detail::InsertLane<7>(
v, t);
1974 alignas(16) T lanes[8];
1977 return Load(
d, lanes);
1980 template <
typename T>
1982 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1983 if (__builtin_constant_p(i)) {
1986 return detail::InsertLane<0>(
v, t);
1988 return detail::InsertLane<1>(
v, t);
1990 return detail::InsertLane<2>(
v, t);
1992 return detail::InsertLane<3>(
v, t);
1994 return detail::InsertLane<4>(
v, t);
1996 return detail::InsertLane<5>(
v, t);
1998 return detail::InsertLane<6>(
v, t);
2000 return detail::InsertLane<7>(
v, t);
2002 return detail::InsertLane<8>(
v, t);
2004 return detail::InsertLane<9>(
v, t);
2006 return detail::InsertLane<10>(
v, t);
2008 return detail::InsertLane<11>(
v, t);
2010 return detail::InsertLane<12>(
v, t);
2012 return detail::InsertLane<13>(
v, t);
2014 return detail::InsertLane<14>(
v, t);
2016 return detail::InsertLane<15>(
v, t);
2021 alignas(16) T lanes[16];
2024 return Load(
d, lanes);
2029 template <
typename T,
size_t N>
2032 return Vec128<T,
N / 2>{
v.raw};
2035 template <
typename T,
size_t N>
2043 template <
int kBytes,
typename T,
size_t N>
2045 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2046 const __i8x16 zero = wasm_i8x16_splat(0);
2052 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
2053 6, 7, 8, 9, 10, 11, 12, 13, 14)};
2056 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
2057 5, 6, 7, 8, 9, 10, 11, 12, 13)};
2060 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 0, 1, 2,
2061 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
2064 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 0, 1,
2065 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
2068 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 0,
2069 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
2072 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2073 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
2076 return Vec128<T, N>{wasm_i8x16_shuffle(
2077 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
2080 return Vec128<T, N>{wasm_i8x16_shuffle(
2081 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
2084 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2085 16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
2089 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2090 16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
2094 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2095 16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
2099 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2100 16, 16, 16, 16, 16, 16, 16, 0, 1,
2104 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2105 16, 16, 16, 16, 16, 16, 16, 16, 0,
2109 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2110 16, 16, 16, 16, 16, 16, 16, 16, 16,
2114 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2115 16, 16, 16, 16, 16, 16, 16, 16, 16,
2118 return Vec128<T, N>{zero};
2121 template <
int kBytes,
typename T,
size_t N>
2123 return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(),
v);
2128 template <
int kLanes,
typename T,
size_t N>
2134 template <
int kLanes,
typename T,
size_t N>
2136 return ShiftLeftLanes<kLanes>(
DFromV<decltype(
v)>(),
v);
2143 template <
int kBytes,
typename T,
size_t N>
2145 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2146 const __i8x16 zero = wasm_i8x16_splat(0);
2153 return wasm_i8x16_shuffle(
v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2154 12, 13, 14, 15, 16);
2157 return wasm_i8x16_shuffle(
v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2158 13, 14, 15, 16, 16);
2161 return wasm_i8x16_shuffle(
v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2162 13, 14, 15, 16, 16, 16);
2165 return wasm_i8x16_shuffle(
v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2166 14, 15, 16, 16, 16, 16);
2169 return wasm_i8x16_shuffle(
v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2170 15, 16, 16, 16, 16, 16);
2173 return wasm_i8x16_shuffle(
v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2174 16, 16, 16, 16, 16, 16);
2177 return wasm_i8x16_shuffle(
v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2178 16, 16, 16, 16, 16, 16, 16);
2181 return wasm_i8x16_shuffle(
v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2182 16, 16, 16, 16, 16, 16, 16);
2185 return wasm_i8x16_shuffle(
v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
2186 16, 16, 16, 16, 16, 16, 16);
2189 return wasm_i8x16_shuffle(
v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
2190 16, 16, 16, 16, 16, 16, 16);
2193 return wasm_i8x16_shuffle(
v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
2194 16, 16, 16, 16, 16, 16, 16);
2197 return wasm_i8x16_shuffle(
v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
2198 16, 16, 16, 16, 16, 16, 16);
2201 return wasm_i8x16_shuffle(
v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
2202 16, 16, 16, 16, 16, 16, 16);
2205 return wasm_i8x16_shuffle(
v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
2206 16, 16, 16, 16, 16, 16, 16);
2209 return wasm_i8x16_shuffle(
v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
2210 16, 16, 16, 16, 16, 16, 16);
2219 template <
int kBytes,
typename T,
size_t N>
2222 if (
N != 16 /
sizeof(T)) {
2223 const Vec128<T> vfull{
v.raw};
2226 return Vec128<T, N>{detail::ShrBytes<kBytes>(
v)};
2230 template <
int kLanes,
typename T,
size_t N>
2239 template <
typename T>
2241 return Vec64<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
2244 return Vec64<float>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
2248 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2255 return Vec128<T, (
N + 1) / 2>{upper.raw};
2260 template <
int kBytes,
typename T,
class V = Vec128<T>>
2262 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2268 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2269 11, 12, 13, 14, 15, 16)};
2272 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2273 11, 12, 13, 14, 15, 16, 17)};
2276 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2277 12, 13, 14, 15, 16, 17, 18)};
2280 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2281 13, 14, 15, 16, 17, 18, 19)};
2284 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2285 14, 15, 16, 17, 18, 19, 20)};
2288 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
2289 14, 15, 16, 17, 18, 19, 20, 21)};
2292 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
2293 15, 16, 17, 18, 19, 20, 21, 22)};
2296 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
2297 16, 17, 18, 19, 20, 21, 22, 23)};
2300 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
2301 17, 18, 19, 20, 21, 22, 23, 24)};
2304 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
2305 17, 18, 19, 20, 21, 22, 23, 24, 25)};
2308 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
2309 18, 19, 20, 21, 22, 23, 24, 25, 26)};
2312 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
2313 19, 20, 21, 22, 23, 24, 25, 26, 27)};
2316 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
2317 20, 21, 22, 23, 24, 25, 26, 27, 28)};
2320 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
2321 21, 22, 23, 24, 25, 26, 27, 28, 29)};
2324 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2325 22, 23, 24, 25, 26, 27, 28, 29, 30)};
2330 template <
int kBytes,
typename T,
size_t N,
HWY_IF_LE64(T,
N),
2331 class V = Vec128<T, N>>
2333 constexpr
size_t kSize =
N *
sizeof(T);
2334 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
2337 using V8 =
VFromD<decltype(d_full8)>;
2338 const V8 hi8{
BitCast(d8, hi).raw};
2347 template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2349 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2350 return Vec128<T, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, kLane, kLane, kLane,
2351 kLane, kLane, kLane, kLane, kLane)};
2354 template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2356 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2357 return Vec128<T, N>{
2358 wasm_i32x4_shuffle(
v.raw,
v.raw, kLane, kLane, kLane, kLane)};
2361 template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2363 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2364 return Vec128<T, N>{wasm_i64x2_shuffle(
v.raw,
v.raw, kLane, kLane)};
2371 template <
typename T,
size_t N,
typename TI,
size_t NI>
2373 const Vec128<TI, NI> from) {
2379 return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
2381 alignas(16) uint8_t control[16];
2382 alignas(16) uint8_t input[16];
2383 alignas(16) uint8_t output[16];
2384 wasm_v128_store(control, from.raw);
2385 wasm_v128_store(input, bytes.raw);
2386 for (
size_t i = 0; i < 16; ++i) {
2387 output[i] = control[i] < 16 ? input[control[i]] : 0;
2389 return Vec128<TI, NI>{wasm_v128_load(output)};
2393 template <
typename T,
size_t N,
typename TI,
size_t NI>
2395 const Vec128<TI, NI> from) {
2396 const Simd<TI, NI, 0>
d;
2399 Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
2414 template <
typename T,
size_t N>
2416 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2417 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2418 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 0, 3, 2)};
2424 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2426 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2428 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2429 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2431 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2433 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2435 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2437 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2439 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2440 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
2443 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2445 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2447 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2448 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2450 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2452 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2454 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2456 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2458 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2459 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
2462 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2464 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2466 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2467 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2469 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2471 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2473 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2475 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2477 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2478 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
2484 template <
typename T>
2486 static_assert(
sizeof(T) == 8,
"Only for 64-bit lanes");
2487 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2489 template <
typename T>
2491 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2492 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2496 template <
typename T>
2498 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2499 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 2, 3, 0)};
2503 template <
typename T>
2505 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2506 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 0, 1, 2)};
2510 template <
typename T>
2512 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2513 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 2, 1, 0)};
2519 template <
typename T,
size_t N>
2524 template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2526 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
2527 #if HWY_IS_DEBUG_BUILD
2528 const Rebind<TI, decltype(
d)> di;
2534 using V8 =
VFromD<decltype(d8)>;
2538 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
2539 if (
sizeof(T) == 4) {
2540 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2541 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2542 const V8 lane_indices =
2544 const V8 byte_indices =
2546 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2547 0, 1, 2, 3, 0, 1, 2, 3};
2550 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2551 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2552 const V8 lane_indices =
2554 const V8 byte_indices =
2556 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2557 0, 1, 2, 3, 4, 5, 6, 7};
2558 return Indices128<T, N>{
Add(byte_indices,
Load(d8, kByteOffsets)).raw};
2562 template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2564 const Rebind<TI, decltype(
d)> di;
2568 template <
typename T,
size_t N>
2570 using TI = MakeSigned<T>;
2572 const Rebind<TI, decltype(
d)> di;
2579 template <
typename T>
2585 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2587 return Vec128<T, 2>{
Shuffle2301(Vec128<T>{
v.raw}).raw};
2590 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2596 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2602 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2610 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2616 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2621 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2628 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2630 return BitCast(
d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, 3, 2,
2631 1, 0, 7, 6, 5, 4)});
2634 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2639 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2646 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2651 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2662 a.
raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2668 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2685 a.
raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2691 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2731 26, 11, 27, 12, 28, 13, 29, 14,
2738 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2755 26, 11, 27, 12, 28, 13, 29, 14,
2762 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2790 template <
typename T,
class V = Vec128<T>>
2796 template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
2798 const Half<decltype(
d)> d2;
2806 template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2810 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2815 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2825 template <
typename T,
size_t N>
2826 HWY_API Vec128<T, N>
Combine(Simd<T, N, 0>
d, Vec128<T, N / 2> hi_half,
2827 Vec128<T, N / 2> lo_half) {
2828 const Half<decltype(
d)> d2;
2832 const VU lo{
BitCast(du2, lo_half).raw};
2833 const VU hi{
BitCast(du2, hi_half).raw};
2839 template <
typename T,
size_t N>
2847 template <
typename T>
2852 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2854 const Vec128<T, N> lo) {
2855 const Half<decltype(
d)> d2;
2861 template <
typename T>
2866 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2868 const Vec128<T, N> lo) {
2869 const Half<decltype(
d)> d2;
2875 template <
typename T>
2878 return CombineShiftRightBytes<8>(
d, hi, lo);
2880 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2882 const Vec128<T, N> lo) {
2883 const Half<decltype(
d)> d2;
2888 template <
typename T,
size_t N>
2890 const Vec128<T, N> lo) {
2897 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2899 return Vec128<T>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 1, 3, 5, 7, 9, 11, 13, 15,
2900 17, 19, 21, 23, 25, 27, 29, 31)};
2904 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2908 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 1, 3, 5, 7, 17, 19, 21,
2909 23, 1, 3, 5, 7, 17, 19, 21, 23)};
2913 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2917 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
2918 19, 1, 3, 17, 19, 1, 3, 17, 19)};
2922 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2925 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
2929 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2933 return Vec128<T, 4>{
2934 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
2938 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2940 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2944 template <
typename T>
2953 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2955 return Vec128<T>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 0, 2, 4, 6, 8, 10, 12, 14,
2956 16, 18, 20, 22, 24, 26, 28, 30)};
2960 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2964 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 0, 2, 4, 6, 16, 18, 20,
2965 22, 0, 2, 4, 6, 16, 18, 20, 22)};
2969 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2973 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
2974 18, 0, 2, 16, 18, 0, 2, 16, 18)};
2978 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2981 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
2985 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2989 return Vec128<T, 4>{
2990 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
2994 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2996 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
3000 template <
typename T>
3008 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3010 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 0, 0, 2, 2)};
3013 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3020 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3022 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 1, 3, 3)};
3025 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3034 template <
typename T,
size_t N>
3039 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3040 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3043 template <
typename T,
size_t N>
3047 wasm_i16x8_shuffle(a.
raw, b.
raw, 8, 1, 10, 3, 12, 5, 14, 7)};
3049 template <
typename T,
size_t N>
3054 template <
typename T,
size_t N>
3062 template <
typename T,
size_t N>
3063 HWY_API Vec128<T, N>
OddEven(
const Vec128<T, N> a,
const Vec128<T, N> b) {
3073 template <
typename T,
size_t N>
3080 template <
typename T,
size_t N>
3088 template <
typename T>
3107 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
3118 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
3122 const Vec128<uint16_t, N>
v) {
3123 return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(
v.raw)};
3140 const Vec128<int8_t, N>
v) {
3141 return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(
v.raw)};
3145 const Vec128<int8_t, N>
v) {
3146 return Vec128<int32_t, N>{
3147 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(
v.raw))};
3151 const Vec128<int16_t, N>
v) {
3152 return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(
v.raw)};
3156 const Vec128<int32_t, N>
v) {
3157 return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(
v.raw)};
3168 const Vec128<float16_t, N>
v) {
3172 const auto bits16 =
PromoteTo(du32, Vec128<uint16_t, N>{
v.raw});
3173 const auto sign = ShiftRight<15>(bits16);
3174 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
3175 const auto mantissa = bits16 &
Set(du32, 0x3FF);
3176 const auto subnormal =
3178 Set(df32, 1.0f / 16384 / 1024));
3180 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
3181 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3182 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3183 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
3184 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3189 const Vec128<bfloat16_t, N>
v) {
3190 const Rebind<uint16_t, decltype(df32)> du16;
3212 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3214 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3226 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3238 const Vec128<double, N>
v) {
3239 return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(
v.raw)};
3244 const Vec128<float, N>
v) {
3246 const Rebind<uint32_t, decltype(du16)> du;
3248 const auto bits32 =
BitCast(du,
v);
3249 const auto sign = ShiftRight<31>(bits32);
3250 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
3251 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
3253 const auto k15 =
Set(di, 15);
3254 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
3255 const auto is_tiny = exp <
Set(di, -24);
3257 const auto is_subnormal = exp <
Set(di, -14);
3258 const auto biased_exp16 =
3260 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
3261 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3262 (mantissa32 >> (
Set(du, 13) + sub_exp));
3264 ShiftRight<13>(mantissa32));
3266 const auto sign16 = ShiftLeft<15>(sign);
3267 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3269 return Vec128<float16_t, N>{
DemoteTo(du16, bits16).raw};
3274 const Vec128<float, N>
v) {
3275 const Rebind<int32_t, decltype(dbf16)> di32;
3276 const Rebind<uint32_t, decltype(dbf16)> du32;
3277 const Rebind<uint16_t, decltype(dbf16)> du16;
3278 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
3284 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
3286 const Repartition<uint32_t, decltype(dbf16)> du32;
3287 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(
BitCast(du32, b));
3294 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3295 return Vec128<uint8_t, N>{
3296 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3323 const DFromV<decltype(
v)> du8;
3327 using VU16 =
VFromD<decltype(du16)>;
3329 const VU16 vFDB97531 = ShiftRight<8>(
BitCast(du16,
v));
3331 const VU16 sFE_DC_BA_98_76_54_32_10 =
Add(vFDB97531, vECA86420);
3333 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
3334 BitCast(du16, ShiftRight<16>(
BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
3335 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
3336 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
3337 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
3338 BitCast(du16, ShiftRight<32>(
BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
3339 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
3340 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
3341 return And(
BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70),
Set(du64, 0xFFFF));
3348 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
3353 const Vec128<T, N> vbits{wasm_i32x4_splat(
static_cast<int32_t
>(bits))};
3356 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
3357 1, 1, 1, 1, 1, 1, 1, 1};
3360 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
3361 1, 2, 4, 8, 16, 32, 64, 128};
3365 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3368 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3373 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3376 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
3381 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3384 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
3391 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
3394 uint64_t mask_bits = 0;
3404 template <
typename T>
3406 const Mask128<T> mask) {
3407 alignas(16) uint64_t lanes[2];
3408 wasm_v128_store(lanes, mask.raw);
3410 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3411 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
3412 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
3417 template <
typename T>
3420 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3421 return (
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0)) *
3427 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
3430 uint64_t bytes =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0));
3432 bytes &= (1ULL << (
N * 8)) - 1;
3433 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3434 return (bytes * kMagic) >> 56;
3437 template <
typename T,
size_t N>
3441 const __i16x8 zero = wasm_i16x8_splat(0);
3446 template <
typename T,
size_t N>
3449 const __i32x4 mask_i =
static_cast<__i32x4
>(mask.
raw);
3450 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
3451 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
3452 alignas(16) uint32_t lanes[4];
3453 wasm_v128_store(lanes, sliced_mask);
3454 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
3457 template <
typename T,
size_t N>
3460 const __i64x2 mask_i =
static_cast<__i64x2
>(mask.
raw);
3461 const __i64x2 slice = wasm_i64x2_make(1, 2);
3462 const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
3463 alignas(16) uint64_t lanes[2];
3464 wasm_v128_store(lanes, sliced_mask);
3465 return lanes[0] | lanes[1];
// Returns `bits` with any bit positions at or above lane count N cleared.
// For a full 16-byte vector every extracted mask bit is valid, so the input
// is returned unchanged; for partial vectors, bits from the undefined upper
// lanes are masked off. constexpr so it folds away at compile time.
// (Fix: the definition was truncated in this extraction — restored the
// missing closing brace so the function is well-formed.)
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t bits) {
  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
}
3478 (
N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
3479 : (
N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
3480 : (
N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
3481 : (
N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
3482 : (
N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
3483 : (
N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
3484 : (
N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
3485 : (
N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
3486 : (
N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
3487 : (
N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3489 : (
N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3491 : (
N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
3493 : (
N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
3495 : (
N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
3498 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
3500 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
3501 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
3504 template <
typename T,
size_t N>
3509 template <
typename T>
3514 template <
typename T>
3519 template <
typename T>
3521 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
3522 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
3523 alignas(16) uint64_t lanes[2];
3524 wasm_v128_store(lanes, shifted_bits);
3525 return PopCount(lanes[0] | lanes[1]);
3528 template <
typename T>
3530 alignas(16) int64_t lanes[2];
3531 wasm_v128_store(lanes, m.raw);
3532 return static_cast<size_t>(-(lanes[0] + lanes[1]));
3538 template <
typename T,
size_t N>
3540 const Mask128<T, N> mask, uint8_t* bits) {
3542 const size_t kNumBytes = (
N + 7) / 8;
3543 CopyBytes<kNumBytes>(&mask_bits, bits);
3547 template <
typename T,
size_t N>
3553 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3556 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3561 template <
typename T>
3567 return !wasm_i8x16_any_true(v8.raw);
3570 return (wasm_i64x2_extract_lane(m.
raw, 0) |
3571 wasm_i64x2_extract_lane(m.
raw, 1)) == 0;
3577 template <
typename T>
3579 return wasm_i8x16_all_true(m.
raw);
3581 template <
typename T>
3583 return wasm_i16x8_all_true(m.
raw);
3585 template <
typename T>
3587 return wasm_i32x4_all_true(m.
raw);
3589 template <
typename T>
3591 return wasm_i64x2_all_true(m.
raw);
3596 template <
typename T,
size_t N>
3603 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3606 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3610 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3611 HWY_API bool AllTrue(
const Simd<T, N, 0> ,
const Mask128<T, N> m) {
3613 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3614 return AllTrue(Full128<T>(), Mask128<T>{
Or(mask, m).raw});
3617 template <
typename T,
size_t N>
3619 const Mask128<T, N> mask) {
3628 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3632 const Rebind<uint8_t, decltype(
d)> d8;
3640 alignas(16) constexpr uint8_t table[256 * 8] = {
3642 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3643 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3644 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
3645 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3646 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
3647 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
3648 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
3649 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3650 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
3651 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
3652 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
3653 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
3654 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
3655 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
3656 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
3657 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3658 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
3659 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
3660 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
3661 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
3662 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
3663 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
3664 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
3665 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
3666 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
3667 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
3668 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
3669 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
3670 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
3671 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
3672 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
3673 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3674 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
3675 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
3676 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
3677 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
3678 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
3679 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
3680 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
3681 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
3682 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
3683 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
3684 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
3685 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
3686 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
3687 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
3688 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
3689 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
3690 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
3691 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
3692 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
3693 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
3694 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
3695 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
3696 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
3697 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
3698 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
3699 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
3700 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
3701 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
3702 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
3703 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
3704 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
3705 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3706 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
3707 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
3708 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
3709 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
3710 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
3711 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
3712 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
3713 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
3714 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
3715 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
3716 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
3717 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
3718 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
3719 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
3720 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
3721 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
3722 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
3723 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
3724 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
3725 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
3726 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
3727 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
3728 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
3729 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
3730 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
3731 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
3732 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
3733 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
3734 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
3735 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
3736 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
3737 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
3738 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
3739 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
3740 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
3741 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
3742 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
3743 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
3744 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
3745 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
3746 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
3747 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
3748 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
3749 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
3750 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
3751 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
3752 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
3753 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
3754 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
3755 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
3756 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
3757 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
3758 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
3759 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
3760 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
3761 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
3762 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
3763 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
3764 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
3765 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
3766 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
3767 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
3768 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
3769 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
3776 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3780 const Rebind<uint8_t, decltype(
d)> d8;
3788 alignas(16) constexpr uint8_t table[256 * 8] = {
3790 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
3791 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
3792 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
3793 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
3794 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
3795 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
3796 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
3797 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
3798 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
3799 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
3800 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
3801 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
3802 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
3803 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
3804 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
3805 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
3806 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
3807 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
3808 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
3809 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
3810 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
3811 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
3812 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
3813 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
3814 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
3815 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
3816 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
3817 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
3818 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
3819 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
3820 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
3821 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
3822 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
3823 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
3824 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
3825 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
3826 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
3827 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
3828 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
3829 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
3830 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
3831 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
3832 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
3833 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
3834 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
3835 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
3836 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
3837 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
3838 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
3839 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
3840 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
3841 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
3842 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
3843 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
3844 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
3845 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
3846 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
3847 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
3848 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
3849 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
3850 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
3851 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
3852 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
3853 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
3854 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
3855 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
3856 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
3857 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
3858 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
3859 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
3860 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
3861 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
3862 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
3863 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
3864 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
3865 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
3866 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
3867 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
3868 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
3869 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
3870 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
3871 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
3872 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
3873 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
3874 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
3875 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
3876 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
3877 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
3878 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
3879 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
3880 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
3881 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
3882 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
3883 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
3884 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
3885 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
3886 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
3887 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
3888 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
3889 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
3890 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
3891 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
3892 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
3893 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
3894 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
3895 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
3896 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
3897 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
3898 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
3899 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
3900 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
3901 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
3902 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
3903 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
3904 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
3905 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
3906 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
3907 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
3908 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
3909 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
3910 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
3911 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
3912 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
3913 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
3914 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
3915 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
3916 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
3917 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
3924 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3929 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
3931 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3932 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3933 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
3934 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3935 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
3936 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
3937 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
3938 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3939 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
3940 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
3941 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
3942 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
3943 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3944 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
3945 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
3946 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3947 const Simd<T, N, 0>
d;
3949 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
3952 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3957 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
3959 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
3960 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
3961 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
3962 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
3963 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
3964 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
3965 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
3966 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3967 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
3968 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
3969 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
3970 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
3971 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
3972 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
3974 const Simd<T, N, 0>
d;
3976 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
3979 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3984 alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
3986 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3987 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3988 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3989 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3991 const Simd<T, N, 0>
d;
3993 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
3996 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4001 alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
4003 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4004 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4005 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4006 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4008 const Simd<T, N, 0>
d;
4010 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
4016 template <
typename T,
size_t N>
4018 const auto idx = detail::IdxFromBits<T, N>(mask_bits);
4024 template <
typename T,
size_t N>
4026 const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
4034 template <
typename T>
4035 struct CompressIsPartition {
4040 template <
typename T>
4046 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4058 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4064 template <
typename T>
4070 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4075 const Vec128<T> maskL =
DupEven(m);
4076 const Vec128<T> maskH =
DupOdd(m);
4077 const Vec128<T> swap =
AndNot(maskH, maskL);
4082 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4086 if (
N < 16 /
sizeof(T)) {
4093 Mask128<uint64_t> ) {
4099 template <
typename T,
size_t N>
4102 uint64_t mask_bits = 0;
4103 constexpr
size_t kNumBytes = (
N + 7) / 8;
4104 CopyBytes<kNumBytes>(bits, &mask_bits);
4106 mask_bits &= (1ull <<
N) - 1;
4113 template <
typename T,
size_t N>
4123 template <
typename T,
size_t N>
4128 using TU =
TFromD<decltype(du)>;
4130 const size_t count =
PopCount(mask_bits);
4139 template <
typename T,
size_t N>
4143 uint64_t mask_bits = 0;
4144 constexpr
size_t kNumBytes = (
N + 7) / 8;
4145 CopyBytes<kNumBytes>(bits, &mask_bits);
4147 mask_bits &= (1ull <<
N) - 1;
4163 const Vec128<uint64_t> b) {
4164 alignas(16) uint64_t mul[2];
4166 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0)),
4167 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
4168 return Load(Full128<uint64_t>(), mul);
4172 const Vec128<uint64_t> b) {
4173 alignas(16) uint64_t mul[2];
4175 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1)),
4176 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
4177 return Load(Full128<uint64_t>(), mul);
4184 Vec128<bfloat16_t, 2 * N> a,
4185 Vec128<bfloat16_t, 2 * N> b,
4186 const Vec128<float, N> sum0,
4187 Vec128<float, N>& sum1) {
4190 const Vec128<uint16_t, 2 * N> zero =
Zero(du16);
4191 const Vec128<uint32_t, N> a0 =
ZipLower(du32, zero,
BitCast(du16, a));
4192 const Vec128<uint32_t, N> a1 =
ZipUpper(du32, zero,
BitCast(du16, a));
4193 const Vec128<uint32_t, N> b0 =
ZipLower(du32, zero,
BitCast(du16, b));
4194 const Vec128<uint32_t, N> b1 =
ZipUpper(du32, zero,
BitCast(du16, b));
4204 template <
typename T>
4209 template <
typename T>
4214 template <
typename T>
4216 const Vec128<T, 1>
v) {
4223 template <
typename T>
4228 template <
typename T>
4233 template <
typename T>
4235 const Vec128<T, 2> v10) {
4236 return Max(v10, Vec128<T, 2>{
Shuffle2301(Vec128<T>{v10.raw}).raw});
4240 template <
typename T>
4244 const Vec128<T> v31_20_31_20 = v3210 + v1032;
4246 return v20_31_20_31 + v31_20_31_20;
4248 template <
typename T>
4254 return Min(v20_31_20_31, v31_20_31_20);
4256 template <
typename T>
4258 const Vec128<T> v3210) {
4260 const Vec128<T> v31_20_31_20 =
Max(v3210, v1032);
4261 const Vec128<T> v20_31_20_31 =
Shuffle0321(v31_20_31_20);
4262 return Max(v20_31_20_31, v31_20_31_20);
4268 template <
typename T>
4274 template <
typename T>
4278 return Min(v10, v01);
4280 template <
typename T>
4282 const Vec128<T> v10) {
4284 return Max(v10, v01);
4288 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4293 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4296 return BitCast(
d,
Or(min, ShiftLeft<16>(min)));
4298 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4303 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4306 return BitCast(
d,
Or(min, ShiftLeft<16>(min)));
4312 template <
typename T,
size_t N>
4316 template <
typename T,
size_t N>
4320 template <
typename T,
size_t N>
4327 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4330 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"Use u64");
4344 const Mask128<T, N> eqHL =
Eq(a, b);
4350 const Vec128<T, N> ltLx =
DupEven(ltHL);
4351 const Vec128<T, N> outHx =
IfThenElse(eqHL, ltLx, ltHL);
4355 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4415 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
4419 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
4423 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
4428 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
4432 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
4437 HWY_API auto Le(V a, V b) -> decltype(a == b) {
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
detail::Raw128< T >::type raw
Definition: wasm_128-inl.h:106
Raw raw
Definition: arm_neon-inl.h:814
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: wasm_128-inl.h:84
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: wasm_128-inl.h:87
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: wasm_128-inl.h:75
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: wasm_128-inl.h:90
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: wasm_128-inl.h:72
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: wasm_128-inl.h:78
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: wasm_128-inl.h:81
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition: wasm_128-inl.h:2144
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2463
constexpr __i8x16 BytesAbove()
Definition: wasm_128-inl.h:3476
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition: wasm_128-inl.h:1700
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:131
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2444
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
constexpr HWY_API size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
Vec128< T, 4/sizeof(T)> Vec32
Definition: arm_neon-inl.h:800
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
Vec128< T, 8/sizeof(T)> Vec64
Definition: arm_neon-inl.h:797
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: arm_neon-inl.h:3883
__v128_u raw
Definition: wasm_128-inl.h:2521
Definition: ops/shared-inl.h:40
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition: wasm_128-inl.h:151
Definition: wasm_128-inl.h:146
HWY_INLINE __v128_u operator()(__v128_u v)
Definition: wasm_128-inl.h:147
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: wasm_128-inl.h:114
__f32x4 type
Definition: wasm_128-inl.h:60
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:56