#include <wasm_simd128.h>
// Compound-assignment operators of Vec256<T>; each forwards to the
// corresponding binary operator (signatures restored from the member list
// for this header).
HWY_INLINE Vec256& operator*=(const Vec256 other) { return *this = (*this * other); }
HWY_INLINE Vec256& operator/=(const Vec256 other) { return *this = (*this / other); }
HWY_INLINE Vec256& operator+=(const Vec256 other) { return *this = (*this + other); }
HWY_INLINE Vec256& operator-=(const Vec256 other) { return *this = (*this - other); }
HWY_INLINE Vec256& operator&=(const Vec256 other) { return *this = (*this & other); }
HWY_INLINE Vec256& operator|=(const Vec256 other) { return *this = (*this | other); }
HWY_INLINE Vec256& operator^=(const Vec256 other) { return *this = (*this ^ other); }
template <typename T, typename FromT>
  const Half<decltype(d)> dh;

template <typename T>

// Iota (fragment): fill an aligned lane buffer with first, first + 1, ... and Load it.
template <typename T, typename T2>
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
// Constant shifts for 8-bit lanes (fragments): shift as wider lanes, mask off
// bits that crossed byte boundaries, and restore the sign for signed right shifts.
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
      : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));

  return shifted & Set(d8, 0xFF >> kBits);

  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
// RotateRight (fragment):
template <int kBits, typename T>
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
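// --- Illustration (editor's sketch, not part of this header) ---
// The RotateRight fragment above composes two lane shifts. The same identity
// on a plain 32-bit integer, for reference:
#include <cstdint>

template <int kBits>
constexpr uint32_t RotateRight32(uint32_t x) {
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
  return (kBits == 0) ? x : ((x >> kBits) | (x << (32 - kBits)));
}

static_assert(RotateRight32<8>(0x11223344u) == 0x44112233u, "rotate example");
// --- End illustration ---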
// Runtime-count ("Same") shifts for 8-bit lanes (fragments): same masking idea
// as above, but with a run-time shift count 'bits'.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  return shifted & Set(d8, (0xFF << bits) & 0xFF);

  return shifted & Set(d8, 0xFF >> bits);

  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
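// --- Illustration (editor's sketch, not part of this header) ---
// The XOR/subtract pair in the signed case is the usual sign-extension fix-up
// after a logical shift; the scalar equivalent:
#include <cstdint>

inline int ShiftRightArithmetic8(int8_t x, int bits) {
  const int logical = static_cast<uint8_t>(x) >> bits;  // zero-extended shift
  const int shifted_sign = 0x80 >> bits;                // where the sign bit lands
  return (logical ^ shifted_sign) - shifted_sign;       // re-extends the sign
}
// --- End illustration ---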
// Min/Max for 64-bit lanes (fragments): no native wasm 64-bit min/max, so
// extract both lanes, compare as scalars, and rebuild the vector. The
// assignment targets below are inferred from the preceding declarations.
  alignas(32) float min[4];
  min[0] = HWY_MIN(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
  min[1] = HWY_MIN(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));

  alignas(32) float min[4];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));

  alignas(32) float max[4];
  max[0] = HWY_MAX(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
  max[1] = HWY_MAX(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));

  alignas(32) float max[4];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));
// MulHigh for 16-bit lanes (fragments): widen to 32 bits, multiply, then keep
// the upper 16 bits of each product.
  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  return Vec256<uint16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};

  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
// Even-lane masking (fragments, used by the 32-bit MulEven paths): clear the
// odd 32-bit lanes of both inputs before the full-width multiply.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);

  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
template <typename T, HWY_IF_FLOAT(T)>

// MulAdd / NegMulAdd / MulSub / NegMulSub and reciprocal square root
// (fragments): wasm has no FMA, so these are plain mul/add/sub.
  return mul * x + add;
  return add - mul * x;
  return mul * x - sub;
  return Neg(mul) * x - sub;

  return one / Sqrt(v);
template <typename T>

template <typename T, HWY_IF_FLOAT(T)>

template <typename T, HWY_IF_FLOAT(T)>
  const VFromD<decltype(di)> exp =

// RebindMask (fragment):
template <typename TFrom, typename TTo>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

// TestBit (fragment):
template <typename T>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
// 64-bit comparisons built from 32-bit compares (fragments): compare as
// 32-bit lanes and combine the per-half results.
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  const auto m_gt = a32 < b32;
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
  const auto lo_gt = And(m_eq, lo_in_hi);
  const auto gt = Or(lo_gt, m_gt);

// Unsigned comparisons (fragment): flip the sign bit and reuse signed compares.
template <typename T, HWY_IF_UNSIGNED(T)>
  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

// Or3 (fragment):
template <typename T>
  return Or(o1, Or(o2, o3));

// OrAnd (fragment):
template <typename T>
  return Or(o, And(a1, a2));

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

template <typename T>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

// ZeroIfNegative (fragment):
template <typename T, HWY_IF_FLOAT(T)>
  const auto zero = Zero(d);
  return IfThenElse(Mask256<T>{(v > zero).raw}, v, zero);
template <typename T>

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
// Per-lane variable shifts (fragments): repeatedly test one bit of the shift
// count and conditionally shift, first for 16-bit lanes, then 32-bit lanes.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  test = ShiftLeft<12>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  test = ShiftLeft<27>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  test = ShiftLeft<12>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  test = ShiftLeft<27>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
// Load / Store (fragments):
template <typename T>
  return Vec256<T>{wasm_v128_load(aligned)};

template <typename T>

template <typename T>

template <typename T>

template <typename T>
  wasm_v128_store(aligned, v.raw);

template <typename T>

template <typename T>

template <typename T>
  wasm_v128_store(aligned, v.raw);
// ScatterOffset (fragment): store lanes, then copy each to base + byte offset.
template <typename T, typename Offset>
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  alignas(32) T lanes[32 / sizeof(T)];
  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }

// ScatterIndex (fragment): indices are in units of T.
template <typename T, typename Index>
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  alignas(32) T lanes[32 / sizeof(T)];
  alignas(32) Index index_lanes[32 / sizeof(T)];
  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }

// GatherOffset (fragment): read each lane from base + byte offset, then Load.
template <typename T, typename Offset>
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  alignas(32) T lanes[32 / sizeof(T)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);

// GatherIndex (fragment):
template <typename T, typename Index>
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  alignas(32) Index index_lanes[32 / sizeof(T)];
  alignas(32) T lanes[32 / sizeof(T)];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
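// --- Illustration (editor's sketch, not part of this header) ---
// Per-lane view of the Gather fragments above: GatherOffset addresses memory
// in bytes, GatherIndex in units of T.
#include <cstdint>
#include <cstring>

template <typename T, typename Offset>
T GatherOneOffset(const T* base, Offset offset_bytes) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  T out;
  std::memcpy(&out, reinterpret_cast<const uint8_t*>(base) + offset_bytes,
              sizeof(T));
  return out;
}

template <typename T, typename Index>
T GatherOneIndex(const T* base, Index index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return base[index];  // index counts whole elements, not bytes
}
// --- End illustration ---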
template <typename T, size_t N>

template <typename T, size_t N>

// GetLane (fragments): extract lane 0 for each lane type.
  return wasm_i8x16_extract_lane(v.raw, 0);
  return wasm_i8x16_extract_lane(v.raw, 0);
  return wasm_i16x8_extract_lane(v.raw, 0);
  return wasm_i16x8_extract_lane(v.raw, 0);
  return wasm_i32x4_extract_lane(v.raw, 0);
  return wasm_i32x4_extract_lane(v.raw, 0);
  return wasm_i64x2_extract_lane(v.raw, 0);
  return wasm_i64x2_extract_lane(v.raw, 0);
  return wasm_f32x4_extract_lane(v.raw, 0);
template <typename T>

template <typename T>

// ShiftLeftBytes (fragments): one byte shuffle per kBytes value; index 16
// selects a zero byte. The last two argument lists, truncated in the listing,
// are completed from the obvious pattern.
template <int kBytes, typename T>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);

  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
                                      7, 8, 9, 10, 11, 12, 13, 14)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
                                      6, 7, 8, 9, 10, 11, 12, 13)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
                                      4, 5, 6, 7, 8, 9, 10, 11, 12)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
                                      3, 4, 5, 6, 7, 8, 9, 10, 11)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
                                      2, 3, 4, 5, 6, 7, 8, 9, 10)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 16, 16, 16, 16, 16, 16, 0, 1)};
  return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                      16, 16, 16, 16, 16, 16, 16, 16, 16, 0)};
template <int kBytes, typename T>

template <int kLanes, typename T>

template <int kLanes, typename T>
// detail::ShrBytes (fragments): byte shuffle right by kBytes, filling with
// zero bytes (index 16).
template <int kBytes, typename T>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);

  return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                            12, 13, 14, 15, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                            13, 14, 15, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                            13, 14, 15, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                            14, 15, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                            15, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                            16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                            16, 16, 16, 16, 16, 16, 16);
  return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
                            16, 16, 16, 16, 16, 16, 16);
template <int kBytes, typename T>
  return Vec256<T>{detail::ShrBytes<kBytes>(v)};

template <int kLanes, typename T>

// UpperHalf (fragment): move the upper 64 bits into the lower half.
template <typename T>
  return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
// CombineShiftRightBytes (fragments): one lo/hi byte shuffle per kBytes value.
template <int kBytes, typename T, class V = Vec256<T>>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");

  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                              11, 12, 13, 14, 15, 16)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                              11, 12, 13, 14, 15, 16, 17)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                              12, 13, 14, 15, 16, 17, 18)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                              13, 14, 15, 16, 17, 18, 19)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                              14, 15, 16, 17, 18, 19, 20)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
                              14, 15, 16, 17, 18, 19, 20, 21)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
                              15, 16, 17, 18, 19, 20, 21, 22)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
                              16, 17, 18, 19, 20, 21, 22, 23)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
                              17, 18, 19, 20, 21, 22, 23, 24)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
                              17, 18, 19, 20, 21, 22, 23, 24, 25)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
                              18, 19, 20, 21, 22, 23, 24, 25, 26)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
                              19, 20, 21, 22, 23, 24, 25, 26, 27)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
                              20, 21, 22, 23, 24, 25, 26, 27, 28)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
                              21, 22, 23, 24, 25, 26, 27, 28, 29)};
  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
                              22, 23, 24, 25, 26, 27, 28, 29, 30)};
// Broadcast<kLane> (fragments): splat one lane via shuffles, per lane size.
// Some shuffle calls are continuations whose leading parts were not captured.
template <int kLane>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};

template <int kLane>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};

template <int kLane>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
      kLane, kLane, kLane, kLane, kLane)};

template <int kLane>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};

template <int kLane>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
// TableLookupBytes (fragment): store both vectors, then index 'input' by each
// control byte; out-of-range control bytes select zero.
template <typename T, typename TI>
  alignas(32) uint8_t control[16];
  alignas(32) uint8_t input[16];
  alignas(32) uint8_t output[16];
  wasm_v128_store(control, from.raw);
  wasm_v128_store(input, bytes.raw);
  for (size_t i = 0; i < 16; ++i) {
    output[i] = control[i] < 16 ? input[control[i]] : 0;
  }
template <typename T, typename TI>

// 32-bit lane swaps (fragments):
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
template <typename T>

template <typename T, typename TI>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");

template <typename T, typename TI>
  const Rebind<TI, decltype(d)> di;

template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>

template <typename T>

template <typename T>

template <typename T>
// InterleaveLower / InterleaveUpper (fragments); the leading parts of some
// shuffle argument lists were not captured.
      3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};

      19, 4, 20, 5, 21, 6, 22, 7, 23)};
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};

template <typename T, class V = Vec256<T>>

      11, 27, 12, 28, 13, 29, 14, 30, 15,
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};

      11, 27, 12, 28, 13, 29, 14, 30, 15,
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
template <typename T, class V = Vec256<T>>

template <typename T, class DW = RepartitionToWide<Full256<T>>>

template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>

template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>

template <typename T>
  const Half<decltype(d)> d2;
  const VU lo{BitCast(du2, lo_half).raw};
  const VU hi{BitCast(du2, hi_half).raw};
template <typename T>

template <typename T>

template <typename T>

template <typename T>
  return CombineShiftRightBytes<8>(d, hi, lo);

template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T>

template <typename T>

// OddEven (fragment): byte mask that selects the even-index bytes.
template <typename T>
  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
// OddEven for 16-bit lanes (fragment):
template <typename T>
  return Vec256<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

// PromoteTo (fragments): zero-/sign-extend 8-bit lanes to 32 bits.
      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
      wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
// PromoteTo float32 from float16 (fragments): split the half into sign,
// exponent and mantissa, then rebuild the float32 bit pattern; subnormals are
// handled separately. The 'subnormal' expression is only partially captured.
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
      Set(df32, 1.0f / 16384 / 1024));
  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);

  const Rebind<uint16_t, decltype(df32)> du16;

// DemoteTo uint8/int8 (fragments): narrow 32-bit to 16-bit, then to 8-bit.
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};

  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
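// --- Illustration (editor's sketch, not part of this header) ---
// Scalar model of the float16 -> float32 promotion above: rebuild the float32
// bit pattern from the half's sign, exponent and mantissa.
#include <cstdint>
#include <cstring>

inline float F32FromF16Bits(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  uint32_t bits32;
  if (biased_exp == 0) {  // zero or subnormal: value is mantissa * 2^-24
    const float subnormal =
        static_cast<float>(mantissa) * (1.0f / 16384 / 1024);
    std::memcpy(&bits32, &subnormal, sizeof(bits32));
  } else {  // normal: re-bias the exponent from 15 to 127
    bits32 = ((biased_exp + 127 - 15) << 23) | (mantissa << (23 - 10));
  }
  bits32 |= sign << 31;
  float out;
  std::memcpy(&out, &bits32, sizeof(out));
  return out;
}
// --- End illustration ---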
// DemoteTo float16 from float32 (fragments): clamp the exponent, build the
// subnormal mantissa when needed, and reassemble sign/exponent/mantissa.
// Two statements are only partially captured.
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);

  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
      ShiftRight<13>(mantissa32));

  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;

// DemoteTo bfloat16 (fragments): a bfloat16 is the upper 16 bits of the
// float32 representation.
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));

  const Repartition<uint32_t, decltype(dbf16)> du32;

  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
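// --- Illustration (editor's sketch, not part of this header) ---
// The bfloat16 fragments rely on bfloat16 being the upper 16 bits of a
// float32; the scalar equivalent of that truncating demotion and promotion:
#include <cstdint>
#include <cstring>

inline uint16_t BF16BitsFromF32(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // truncate (no rounding)
}

inline float F32FromBF16Bits(uint16_t bf16_bits) {
  const uint32_t bits = static_cast<uint32_t>(bf16_bits) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
// --- End illustration ---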
// LoadMaskBits (fragments): broadcast the bit pattern, then test each lane
// against its bit, per lane size.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const Vec256<T> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
  alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             1, 1, 1, 1, 1, 1, 1, 1};
  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(32) constexpr uint64_t kBit[8] = {1, 2};
// BitsFromMask (fragments): pack the per-lane masks into an integer bitfield.
template <typename T>
  uint64_t mask_bits = 0;

// 8-bit lanes: multiply each 64-bit half (bytes of 0x00/0xFF) by a magic
// constant so one bit per byte collects in the top byte of the product.
template <typename T>
    const Mask128<T> mask) {
  alignas(32) uint64_t lanes[2];
  wasm_v128_store(lanes, mask.raw);
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;

template <typename T>
  const __i16x8 zero = wasm_i16x8_splat(0);

// 32-bit lanes: AND with {1, 2, 4, 8} and OR the surviving lane values.
template <typename T>
  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
  alignas(32) uint32_t lanes[4];
  wasm_v128_store(lanes, sliced_mask);
  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
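// --- Illustration (editor's sketch, not part of this header) ---
// Why the kMagic multiply works: each mask byte is 0x00 or 0xFF, and the
// multiplication deposits one bit per byte into the top byte of the 64-bit
// product, i.e. an 8-bit "movemask".
#include <cstdint>

inline uint64_t BitsFromMaskBytes64(uint64_t mask_bytes) {  // bytes are 0x00/0xFF
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (mask_bytes * kMagic) >> 56;  // bit i set iff byte i was 0xFF
}
// Example: mask_bytes 0x00FF00FF00FF00FF -> 0x55 (bits 0, 2, 4, 6 set).
// --- End illustration ---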
// detail::BytesAbove<N> (fragment): constant whose first N bytes are clear and
// whose remaining bytes are all-ones; used to build FirstN masks. Several
// truncated argument lists and the N == 11 / N == 13 selectors are completed
// here from the obvious pattern.
  return (N == 0)    ? wasm_i32x4_make(-1, -1, -1, -1)
         : (N == 4)  ? wasm_i32x4_make(0, -1, -1, -1)
         : (N == 8)  ? wasm_i32x4_make(0, 0, -1, -1)
         : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
         : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
         : (N == 2)  ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
         : (N == 6)  ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
         : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
         : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
         : (N == 1)  ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       -1, -1, -1, -1, -1, -1)
         : (N == 3)  ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
                                       -1, -1, -1, -1, -1, -1)
         : (N == 5)  ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
                                       -1, -1, -1, -1, -1)
         : (N == 7)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
                                       -1, -1, -1, -1, -1)
         : (N == 9)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1,
                                       -1, -1, -1, -1)
         : (N == 11)
             ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
         : (N == 13)
             ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
             : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
// CountTrue helpers (fragments):
template <typename T>

template <typename T>

template <typename T>

template <typename T>
  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
  alignas(32) uint64_t lanes[2];
  wasm_v128_store(lanes, shifted_bits);
  return PopCount(lanes[0] | lanes[1]);
// StoreMaskBits (fragment):
template <typename T>
  const size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);

// AllFalse (fragments):
template <typename T>

template <typename T>
  return !wasm_i8x16_any_true(v8.raw);

  return (wasm_i64x2_extract_lane(m.raw, 0) |
          wasm_i64x2_extract_lane(m.raw, 1)) == 0;

// AllTrue (fragments):
template <typename T>
  return wasm_i8x16_all_true(m.raw);

template <typename T>
  return wasm_i16x8_all_true(m.raw);

template <typename T>
  return wasm_i32x4_all_true(m.raw);
template <typename T>

template <typename T>

template <typename T>
  const Rebind<uint8_t, decltype(d)> d8;
// detail::Idx16x8FromBits (fragment): lookup table of byte indices for each of
// the 256 possible 8-lane masks, used by Compress for 16-bit lanes.
  alignas(32) constexpr uint8_t table[256 * 8] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
      0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
      0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
      0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
      6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
      0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
      0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
      2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
      0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
      0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
      0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
      0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
      6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
      8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
      0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
      4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
      10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
      0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
      0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
      0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
      4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
      0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
      0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
      2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
      10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
      0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
      0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
      0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
      0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
      0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
      0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
      6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
      12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
      0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
      0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
      0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
      8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
      0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
      0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
      2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
      8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
      12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
      0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
      0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
      10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
      12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
      0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
      4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
      6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
      0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
      0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
      0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
      4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
      12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
      0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
      2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
      0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
      0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
      0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
      0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
      14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
      0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
      0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
      8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
      14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
      0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
      0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
      0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
      6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
      14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
      0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
      2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
      14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
      0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
      0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
      0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
      6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
      10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
      0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
      4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
      8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
      0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
      0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
      0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
      4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
      0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
      0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
      2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
      14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
      0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
      0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
      0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
      12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
      14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
      0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
      6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
      8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
      14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
      0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
      0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
      10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
      14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
      0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
      2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
      10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
      12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
      0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
      0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
      8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
      10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
      0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
      4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
      6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
// detail::Idx32x4FromBits (fragment): for each 4-bit mask, 16 byte indices
// that move the selected 32-bit lanes to the front.
template <typename T>
  alignas(32) constexpr uint8_t packed_array[16 * 16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
      4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3,
      8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
      0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
      12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
      0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
      4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
      0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64

// detail::Idx64x2FromBits (fragment): same idea for 64-bit lanes.
template <typename T>
  alignas(32) constexpr uint8_t packed_array[4 * 16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
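// --- Illustration (editor's sketch, not part of this header) ---
// What Compress computes, per lane: the mask bits select which lanes survive
// and they are packed toward lane 0 (the tables above encode this selection
// as byte shuffle indices).
#include <cstddef>
#include <cstdint>

inline size_t CompressLanes32(const uint32_t lanes[4], unsigned mask_bits,
                              uint32_t out[4]) {
  size_t count = 0;
  for (size_t i = 0; i < 4; ++i) {
    if (mask_bits & (1u << i)) out[count++] = lanes[i];
  }
  return count;  // number of lanes written, as reported by CompressStore
}
// --- End illustration ---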
// Compress (fragments): build shuffle indices from the mask bits, then apply
// them with a byte shuffle.
template <typename T>
    const uint64_t mask_bits) {
  const auto idx = detail::Idx16x8FromBits<T>(mask_bits);

template <typename T>
    const uint64_t mask_bits) {
  const auto idx = detail::Idx32x4FromBits<T>(mask_bits);

#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
template <typename T>
    const uint64_t mask_bits) {
  const auto idx = detail::Idx64x2FromBits<uint64_t>(mask_bits);

template <typename T>
struct CompressIsPartition {

template <typename T>

template <typename T>

// CompressBits (fragment): load (N + 7) / 8 mask bytes, then Compress.
template <typename T>
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  mask_bits &= (1ull << N) - 1;

template <typename T>

// CompressStore / CompressBitsStore (fragments):
template <typename T>
  using TU = TFromD<decltype(du)>;
  const size_t count = PopCount(mask_bits);

template <typename T>
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  mask_bits &= (1ull << N) - 1;
// MulEven / MulOdd for 64-bit lanes (fragments): full 64x64 -> 128-bit
// products of lane 0 (even) or lane 1 (odd) via Mul128. The mul[0] =
// assignment targets are inferred from the surrounding declarations.
  alignas(32) uint64_t mul[2];
  mul[0] = Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
                  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);

  alignas(32) uint64_t mul[2];
  mul[0] = Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
                  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
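// --- Illustration (editor's sketch, not part of this header) ---
// Mul128 (declared in base.h) returns the low 64 bits of the full product and
// writes the high 64 bits through 'upper'; a portable scalar equivalent:
#include <cstdint>

inline uint64_t Mul128Sketch(uint64_t a, uint64_t b, uint64_t* upper) {
  const uint64_t lo_lo = (a & 0xFFFFFFFFu) * (b & 0xFFFFFFFFu);
  const uint64_t hi_lo = (a >> 32) * (b & 0xFFFFFFFFu);
  const uint64_t lo_hi = (a & 0xFFFFFFFFu) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFFu) + lo_hi;
  *upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  return (cross << 32) | (lo_lo & 0xFFFFFFFFu);
}
// --- End illustration ---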
// SumOfLanes / MinOfLanes / MaxOfLanes (fragments): combine a vector with
// lane-swapped copies of itself until every lane holds the result.
template <typename T>
  const Vec256<T> v31_20_31_20 = v3210 + v1032;
  return v20_31_20_31 + v31_20_31_20;

template <typename T>
  return Min(v20_31_20_31, v31_20_31_20);

template <typename T>
  return Max(v20_31_20_31, v31_20_31_20);

template <typename T>

template <typename T>
  return Min(v10, v01);

template <typename T>
  return Max(v10, v01);

// 16-bit MinOfLanes / MaxOfLanes (fragments): promote the odd lanes to 32 bits.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const auto odd = ShiftRight<16>(BitCast(d32, v));

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const auto odd = ShiftRight<16>(BitCast(d32, v));
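// --- Illustration (editor's sketch, not part of this header) ---
// The shuffle-based reductions above follow a butterfly pattern: combining a
// vector with its lane-swapped copies leaves the result in every lane.
inline void SumOfLanes4Sketch(const float v[4], float out[4]) {
  float partial[4];
  for (int i = 0; i < 4; ++i) partial[i] = v[i] + v[i ^ 2];         // swapped halves
  for (int i = 0; i < 4; ++i) out[i] = partial[i] + partial[i ^ 1];  // then pairs
}
// --- End illustration ---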
template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>