#include <immintrin.h>

#if HWY_COMPILER_CLANGCL
#include <avxintrin.h>
#include <avx2intrin.h>
#include <bmi2intrin.h>
#include <f16cintrin.h>
#include <fmaintrin.h>
#include <smmintrin.h>

#include <sanitizer/msan_interface.h>
    return *this = (*this * other);
    return *this = (*this / other);
    return *this = (*this + other);
    return *this = (*this - other);
    return *this = (*this & other);
    return *this = (*this | other);
    return *this = (*this ^ other);
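  // These are the bodies of Vec256's compound-assignment operators; each
  // simply forwards to the corresponding non-member binary operator and
  // assigns the result back to *this.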
#if HWY_TARGET <= HWY_AVX3
template <size_t size>
template <typename T>
template <typename T>
  return _mm256_castpd_si256(v);
template <typename T>
template <typename T>
template <typename T>
template <typename T, typename FromT>

template <typename T>
  return Vec256<T>{_mm256_setzero_si256()};
  return Vec256<float>{_mm256_setzero_ps()};
HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /*tag*/, const uint8_t t) {
  return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))};
HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /*tag*/, const uint16_t t) {
  return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))};
HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /*tag*/, const uint32_t t) {
  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /*tag*/, const uint64_t t) {
  return Vec256<uint64_t>{
      _mm256_set1_epi64x(static_cast<long long>(t))};
HWY_API Vec256<int8_t> Set(Full256<int8_t> /*tag*/, const int8_t t) {
  return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))};
HWY_API Vec256<int16_t> Set(Full256<int16_t> /*tag*/, const int16_t t) {
  return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))};
HWY_API Vec256<int32_t> Set(Full256<int32_t> /*tag*/, const int32_t t) {
  return Vec256<int32_t>{_mm256_set1_epi32(t)};
HWY_API Vec256<int64_t> Set(Full256<int64_t> /*tag*/, const int64_t t) {
  return Vec256<int64_t>{
      _mm256_set1_epi64x(static_cast<long long>(t))};
HWY_API Vec256<float> Set(Full256<float> /*tag*/, const float t) {
  return Vec256<float>{_mm256_set1_ps(t)};
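// Set broadcasts a scalar to all lanes via _mm256_set1_*; the static_casts are
// only there because the intrinsics take signed argument types (char, short,
// long long), so unsigned inputs would otherwise trigger conversion warnings.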
template <typename T>
  return Vec256<T>{_mm256_undefined_si256()};

template <typename T>
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};

template <typename T>
HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};

template <typename T>
HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};

template <typename T>
HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
template <typename T>
#if HWY_TARGET <= HWY_AVX3
  const __m256i vu = BitCast(Full256<TU>(), v).raw;
  Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
  return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
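// Not: on AVX-512 (HWY_AVX3), _mm256_ternarylogic_epi32 with truth table 0x55
// computes the bitwise NOT of its (replicated) input in a single instruction;
// the AVX2 path instead XORs with an all-ones vector, which is equivalent.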
template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
  return Or(o1, Or(o2, o3));

template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
  return Or(o, And(a1, a2));

template <typename T>
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
template <typename T>
template <typename T>
template <typename T>

#if HWY_TARGET == HWY_AVX3_DL
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#define HWY_NATIVE_POPCNT

template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>
HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
#if HWY_TARGET <= HWY_AVX3
  const Rebind<MakeUnsigned<T>, decltype(d)> du;
  const __m256i out = _mm256_ternarylogic_epi32(
template <typename T>
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T, HWY_IF_FLOAT(T)>

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS

template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
  constexpr size_t N = 32 / sizeof(T);
template <typename T>
  return Mask256<T>{v.raw};
template <typename T>
  return Vec256<T>{v.raw};
template <typename T>
  return Vec256<T>{v.raw};

template <typename T>
                             const Vec256<T> no) {
  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
                                    const Vec256<float> yes,
                                    const Vec256<float> no) {
  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
                                     const Vec256<double> yes,
                                     const Vec256<double> no) {
  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
template <typename T>
template <typename T>
template <typename T, HWY_IF_FLOAT(T)>
  const auto zero = Zero(Full256<T>());
template <typename T>

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
#if HWY_TARGET <= HWY_AVX3

template <typename TFrom, typename TTo>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask256<TTo>{m.raw};

template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};

  return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
  return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
  return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
  return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Vec256<T>{_mm256_movm_epi16(v.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_movm_epi32(v.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Vec256<T>{_mm256_movm_epi64(v.raw)};
  return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};

template <typename T>
template <typename TFrom, typename TTo>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
template <typename T>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
                                 const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
                                  const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};

template <typename T, HWY_IF_NOT_FLOAT(T)>
                                 const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
                                  const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};

#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0

#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
  using i8x32 = signed char __attribute__((__vector_size__(32)));
  return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
                                              reinterpret_cast<i8x32>(b.raw))};
  return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
                                   const Vec256<int16_t> b) {
  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
                                   const Vec256<int32_t> b) {
  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
                                   const Vec256<int64_t> b) {
  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};

template <typename T, HWY_IF_UNSIGNED(T)>
  const Full256<T> du;
  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);

HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
                                  const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
                                   const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
template <typename T>
template <typename T>

HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
  const Full256<uint64_t> du;
  const Full256<int64_t> di;
  const auto msb = Set(du, 1ull << 63);
HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
  return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};

HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
  const Full256<uint64_t> du;
  const Full256<int64_t> di;
  const auto msb = Set(du, 1ull << 63);
HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
  return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};

template <typename T>
#if HWY_TARGET <= HWY_AVX3
  constexpr size_t N = 32 / sizeof(T);
  const uint64_t all = (1ull << N) - 1;
  const uint32_t all = static_cast<uint32_t>((1ull << N) - 1);
      (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n)));
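// FirstN: _bzhi_u32(all, n) clears all bits at index >= n, yielding the mask of
// the first n lanes. BZHI only consumes the low 8 bits of its index operand, so
// the (n > 255) guard keeps very large counts from wrapping around.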
                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};

                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};

  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};

                                      const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
                                       const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
                                     const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
                                      const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};

                                      const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
                                       const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
                                     const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
                                      const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};

                                       const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
                                        const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
#if HWY_COMPILER_MSVC
  const auto zero = Zero(Full256<int8_t>());
  return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
  return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
  return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
  return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
HWY_API Vec256<float> Abs(const Vec256<float> v) {
  const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
  return v & BitCast(Full256<float>(), mask);

HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
  return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
  return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};

HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
  return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};

HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
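// MulEven multiplies the even-indexed 32-bit lanes: _mm256_mul_epi32 and
// _mm256_mul_epu32 read only lanes 0/2/4/6 and return the full 64-bit products.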
template <int kBits>
  return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
template <int kBits>
  return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
template <int kBits>
template <int kBits>
  return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
template <int kBits>
  return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
template <int kBits>
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
  const Full256<T> d8;
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));

template <int kBits>
  return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
template <int kBits>
  return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
template <int kBits>
template <int kBits>
  const Full256<uint8_t> d8;
  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
template <int kBits>
  return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
template <int kBits>
  return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
template <int kBits>
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
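// AVX2 has no 8-bit shifts, so the byte variants shift as 16-bit lanes and then
// mask off bits shifted in from the neighboring byte. For the arithmetic right
// shift, (shifted ^ shifted_sign) - shifted_sign re-extends the sign bit that
// the logical 16-bit shift discarded.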
template <int kBits>
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  if (kBits == 0) return v;
template <int kBits>
  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  if (kBits == 0) return v;

  return ShiftRight<15>(v);
  return ShiftRight<31>(v);
#if HWY_TARGET == HWY_AVX2
template <int kBits>
#if HWY_TARGET <= HWY_AVX3
  return right | sign;
HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
  const auto zero = Zero(Full256<int64_t>());

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  static_assert(IsSigned<T>(), "Only works for signed/float");
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
  static_assert(IsSigned<T>(), "Only works for signed/float");

  return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const Full256<T> d8;
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));

  return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
  const Full256<uint8_t> d8;
  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
  return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
#if HWY_TARGET <= HWY_AVX3
  return right | sign;
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted_sign =
      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
  return (shifted ^ shifted_sign) - shifted_sign;

template <typename T, HWY_IF_FLOAT(T)>
template <typename T, HWY_IF_NOT_FLOAT(T)>
  return Zero(Full256<T>()) - v;
HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
  return Vec256<float>{_mm256_rcp_ps(v.raw)};
HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {

HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> add) {
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x + add;
  return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x + add;
HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> add) {
#ifdef HWY_DISABLE_BMI2_FMA
  return add - mul * x;
  return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
#ifdef HWY_DISABLE_BMI2_FMA
  return add - mul * x;
HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> sub) {
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x - sub;
  return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x - sub;
HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> sub) {
#ifdef HWY_DISABLE_BMI2_FMA
  return Neg(mul * x) - sub;
  return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
#ifdef HWY_DISABLE_BMI2_FMA
  return Neg(mul * x) - sub;

  return Vec256<float>{_mm256_sqrt_ps(v.raw)};
  return Vec256<float>{_mm256_rsqrt_ps(v.raw)};

  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_FLOAT(T)>
template <typename T, HWY_IF_FLOAT(T)>
  const VFromD<decltype(di)> exp =

template <typename T>
      _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
template <typename T>
  return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};

template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
                             const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
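// AVX2 only provides 32/64-bit maskload, so 8/16-bit lanes fall back to an
// unmasked LoadU followed by IfThenElseZero. Note that this fallback reads all
// 32 bytes, including lanes the mask excludes.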
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  auto pi = reinterpret_cast<const int*>(p);
  return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  auto pi = reinterpret_cast<const long long*>(p);
  return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
  const Vec256<int32_t> mi =
  return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
  const Vec256<int64_t> mi =
  return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};

template <typename T>
#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
  const __m128i v128 = LoadU(Full128<T>(), p).raw;
      _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
  return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
      _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
  return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
      _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
      _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
template <typename T>
  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
  _mm256_store_ps(aligned, v.raw);
  _mm256_store_pd(aligned, v.raw);
template <typename T>
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
  _mm256_storeu_ps(p, v.raw);
  _mm256_storeu_pd(p, v.raw);

#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  _mm256_mask_storeu_epi8(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  _mm256_mask_storeu_epi16(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  _mm256_mask_storeu_epi32(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  _mm256_mask_storeu_epi64(p, m.raw, v.raw);
  _mm256_mask_storeu_ps(p, m.raw, v.raw);
  _mm256_mask_storeu_pd(p, m.raw, v.raw);
template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                          T* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  alignas(32) TU buf[32 / sizeof(T)];
  alignas(32) TU mask[32 / sizeof(T)];
  Store(BitCast(du, v), du, buf);
  Store(BitCast(du, VecFromMask(d, m)), du, mask);
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    if (mask[i]) {
      CopyBytes<sizeof(T)>(buf + i, p + i);
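// Scalar fallback for BlendedStore of 8/16-bit lanes: both the vector and the
// mask are spilled to aligned buffers, then only the selected lanes are copied
// to the destination one element at a time.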
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  auto pi = reinterpret_cast<int*>(p);
  _mm256_maskstore_epi32(pi, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  auto pi = reinterpret_cast<long long*>(p);
  _mm256_maskstore_epi64(pi, m.raw, v.raw);
  const Vec256<int32_t> mi =
  _mm256_maskstore_ps(p, mi.raw, v.raw);
  const Vec256<int64_t> mi =
  _mm256_maskstore_pd(p, mi.raw, v.raw);

template <typename T>
  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
  _mm256_stream_ps(aligned, v.raw);
  _mm256_stream_pd(aligned, v.raw);
#if HWY_TARGET <= HWY_AVX3

template <typename T>
  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
template <typename T>
  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
template <typename T>
  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
template <typename T>
  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);

template <typename T, typename Offset>
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
template <typename T, typename Offset>
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);
  alignas(32) T lanes[N];
  alignas(32) Offset offset_lanes[N];
  Store(offset, Full256<Offset>(), offset_lanes);
  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);

template <typename T, typename Index>
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);
  alignas(32) T lanes[N];
  alignas(32) Index index_lanes[N];
  Store(index, Full256<Index>(), index_lanes);
  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
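// Without AVX-512 scatter instructions, ScatterOffset/ScatterIndex spill the
// vector (and the offsets/indices) to stack arrays and write each lane with a
// scalar store.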
template <typename T>
  return Vec256<T>{_mm256_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
template <typename T>
  return Vec256<T>{_mm256_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), index.raw, 4)};
template <typename T>
  return Vec256<T>{_mm256_i64gather_epi64(
template <typename T>
  return Vec256<T>{_mm256_i64gather_epi64(

template <typename T, typename Offset>
                               const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
                              const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
template <typename T>
  return Vec128<T>{_mm256_castsi256_si128(v.raw)};
template <typename T>
template <typename T>
  return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};

template <typename T>
  alignas(32) T lanes[32 / sizeof(T)];
template <typename T>
  alignas(64) T lanes[64 / sizeof(T)];
  return Load(d, lanes);
template <typename T>

#if !defined(HWY_HAVE_ZEXT)
#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \
    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
    (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC >= 1000)
#define HWY_HAVE_ZEXT 1
#define HWY_HAVE_ZEXT 0

template <typename T>
  return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};

template <typename T>
  return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
template <int kBytes, typename T>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
template <int kBytes, typename T>
  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
template <int kLanes, typename T>
template <int kLanes, typename T>
  return ShiftLeftLanes<kLanes>(Full256<T>(), v);

template <int kBytes, typename T>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};
template <int kLanes, typename T>

template <int kBytes, typename T, class V = Vec256<T>>
  return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(

template <int kLane>
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
    const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
    return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
        _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
    return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
template <int kLane>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
template <int kLane>
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
    const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
    return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
        _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
    return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
template <int kLane>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
template <int kLane>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
template <typename T>
template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 4)>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
      AllTrue(di, Lt(vec, Set(di, static_cast<TI>(32 / sizeof(T))))));

template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 8)>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
  const Rebind<TI, decltype(d)> di;
#if HWY_IS_DEBUG_BUILD
      AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(32 / sizeof(T))))));
#if HWY_TARGET <= HWY_AVX3
  return Indices256<T>{idx64.raw};
  const Vec256<TI> dup =
      BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
  const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
  return Indices256<T>{idx32.raw};

template <typename T, typename TI>
  const Rebind<TI, decltype(d)> di;
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
#if HWY_TARGET <= HWY_AVX3

template <typename T>
  return Vec256<T>{_mm256_permute2x128_si256(v.raw, v.raw, 0x01)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                                7, 6, 5, 4, 3, 2, 1, 0};
  const Vec256<int16_t> idx = Load(di, kReverse);
      _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
  return BitCast(d, RotateRight<16>(rev32));

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
                                                 11, 10, 9, 8, 15, 14, 13, 12};
      _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
                                                 15, 14, 13, 12, 11, 10, 9, 8};
      _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
                                       const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
                                        const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
                                        const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
                                        const Vec256<uint64_t> b) {
  return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
                                      const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
                                       const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
                                       const Vec256<int32_t> b) {
  return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
                                       const Vec256<int64_t> b) {
  return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
                                     const Vec256<float> b) {
  return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};

                                       const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
                                        const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
                                        const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
                                        const Vec256<uint64_t> b) {
  return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
                                      const Vec256<int8_t> b) {
  return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
                                       const Vec256<int16_t> b) {
  return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
                                       const Vec256<int32_t> b) {
  return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
                                       const Vec256<int64_t> b) {
  return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
                                     const Vec256<float> b) {
  return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
template <typename T, class V = Vec256<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>

template <typename T>
                                const Vec256<T> lo) {
  const Half<decltype(d)> d2;
  return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
  const Half<decltype(d)> d2;
  const Half<decltype(d)> d2;

template <typename T>
                               const Vec256<T> lo) {
  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};

template <typename T>
                               const Vec256<T> lo) {
  return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};

template <typename T>
                               const Vec256<T> lo) {
  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
  alignas(32) constexpr uint8_t kIdx[32] = {
      1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
      33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
  return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi8(
                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
  const Vec256<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec256<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr uint16_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
                                             17, 19, 21, 23, 25, 27, 29, 31};
  return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi16(
                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
  const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
  const Vec256<float> v3131{_mm256_shuffle_ps(
      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
                                            _MM_SHUFFLE(3, 1, 2, 0))};

#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
                                       __mmask8{0xFF}, hi.raw)};
      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
                        BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
  const Vec256<double> v31{
      _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
      _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};

#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
                                    __mmask8{0xFF}, hi.raw)};
      _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
  alignas(64) constexpr uint8_t kIdx[32] = {
      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
      32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi8(
                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
  const Vec256<uint16_t> mask = Set(dw, 0x00FF);
  const Vec256<uint16_t> uH = And(BitCast(dw, hi), mask);
  const Vec256<uint16_t> uL = And(BitCast(dw, lo), mask);
  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint16_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                                             16, 18, 20, 22, 24, 26, 28, 30};
  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi16(
                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
  const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
  const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
  const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
  const Vec256<float> v2020{_mm256_shuffle_ps(
      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
                                            _MM_SHUFFLE(3, 1, 2, 0))};

#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
                                       __mmask8{0xFF}, hi.raw)};
      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
                        BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
  const Vec256<double> v20{
      _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};

#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
                                    __mmask8{0xFF}, hi.raw)};
      _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T>
                              const Vec256<T> b) {
  const Full256<uint8_t> d8;
  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
template <typename T>
                              const Vec256<T> b) {
  return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
template <typename T>
                              const Vec256<T> b) {
  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
template <typename T>
                              const Vec256<T> b) {
  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};

template <typename T>
HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
template <typename T>
template <typename T>

template <typename T, typename TI>
                                      const Vec256<TI> from) {
  return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
template <typename T, typename TI, size_t NI>
template <typename T, size_t N, typename TI>

#if HWY_TARGET > HWY_AVX3

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Rebind<float, decltype(dw)> df;
  const auto zero = Zero(d);
  const auto upper = exp + Set(d, 0x3F80);
  const auto f0 = ZipLower(dw, zero, upper);
  const auto f1 = ZipUpper(dw, zero, upper);
  const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
  const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
  return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
#if HWY_TARGET <= HWY_AVX3
template <typename T, HWY_IF_SIGNED(T)>
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3

                                const Vec256<uint64_t> b) {
  const DFromV<decltype(a)> du64;
  const auto maskL = Set(du64, 0xFFFFFFFFULL);
  const auto a32 = BitCast(du32, a);
  const auto b32 = BitCast(du32, b);
  const auto aLbL = MulEven(a32, b32);
  const auto w3 = aLbL & maskL;
  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
  const auto w2 = t2 & maskL;
  const auto w1 = ShiftRight<32>(t2);
  const auto t = MulEven(a32, bH) + w2;
  const auto k = ShiftRight<32>(t);
  const auto mulH = MulEven(aH, bH) + w1 + k;
  const auto mulL = ShiftLeft<32>(t) + w3;

                               const Vec256<uint64_t> b) {
  const DFromV<decltype(a)> du64;
  const auto maskL = Set(du64, 0xFFFFFFFFULL);
  const auto a32 = BitCast(du32, a);
  const auto b32 = BitCast(du32, b);
  const auto aLbL = MulEven(a32, b32);
  const auto w3 = aLbL & maskL;
  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
  const auto w2 = t2 & maskL;
  const auto w1 = ShiftRight<32>(t2);
  const auto t = MulEven(a32, bH) + w2;
  const auto k = ShiftRight<32>(t);
  const auto mulH = MulEven(aH, bH) + w1 + k;
  const auto mulL = ShiftLeft<32>(t) + w3;
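// 64x64-bit multiplication is synthesized from 32x32->64-bit MulEven products
// using the schoolbook decomposition: with a = aH*2^32 + aL and
// b = bH*2^32 + bL, the partial products are accumulated into mulH:mulL, the
// high and low 64-bit halves of the 128-bit result.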
                                              Vec256<bfloat16_t> a,
                                              Vec256<bfloat16_t> b,
                                              const Vec256<float> sum0,
                                              Vec256<float>& sum1) {
  const Vec256<uint16_t> zero = Zero(du16);
                                   Vec128<uint8_t> v) {
  return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)};
                                  Vec128<uint8_t> v) {
  return Vec256<int16_t>{_mm256_cvtepu8_epi16(v.raw)};
                                   Vec128<uint16_t> v) {
  return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)};
                                  Vec128<uint16_t> v) {
  return Vec256<int32_t>{_mm256_cvtepu16_epi32(v.raw)};
  return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)};
                                  Vec128<int16_t> v) {
  return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)};

                                    const Vec256<int32_t> v) {
  const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
  return Vec128<uint16_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
                                   const Vec256<int32_t> v) {
  const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
  return Vec128<int16_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};

  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
  const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
  const __m128i u16 = _mm256_castsi256_si128(u16_concat);
  const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));

                                  const Vec256<int16_t> v) {
  const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
  return Vec128<uint8_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};

  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
  const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
  const __m128i i16 = _mm256_castsi256_si128(i16_concat);

                                 const Vec256<int16_t> v) {
  const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
  return Vec128<int8_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
                                     const Vec256<float> v) {
#ifdef HWY_DISABLE_F16C
  const Rebind<uint32_t, decltype(df16)> du;
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);
  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
                                     ShiftRight<13>(mantissa32));
  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
  return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
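// When F16C is unavailable (HWY_DISABLE_F16C), float->float16 demotion is done
// in integer arithmetic: extract sign/exponent/mantissa, clamp the exponent,
// and assemble either a subnormal or a normal half-precision encoding;
// otherwise a single _mm256_cvtps_ph suffices.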
                                       const Vec256<float> v) {
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
  const Repartition<uint32_t, decltype(dbf16)> du32;

                                   const Vec256<double> v) {
  return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};

  const Full256<uint32_t> d32;
  alignas(32) static constexpr uint32_t k8From32[8] = {
      0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
  return BitCast(Full64<uint8_t>(), pair);

                                 const Vec256<int32_t> v) {
  return Vec256<float>{_mm256_cvtepi32_ps(v.raw)};
#if HWY_TARGET <= HWY_AVX3
  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
  const auto k52 = Set(d32, 0x43300000);
  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
  return (v_upper - k84_63_52) + v_lower;
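// int64 -> double without AVX-512 uses the classic magic-number trick: the
// upper and lower 32-bit halves are embedded in the mantissas of large
// "magic" doubles (0x45300000'80000000 / 0x43300000), and subtracting the
// bias constants leaves the exactly converted value.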
#if HWY_TARGET <= HWY_AVX3
  using VI = decltype(Zero(di));
  const VI k0 = Zero(di);
  const VI k1 = Set(di, 1);
  const VI k51 = Set(di, 51);
  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
  const VI exp = biased_exp - Set(di, 0x3FF);
  const auto in_range = exp < Set(di, 63);
  const VI shift_mnt = Max(k51 - exp, k0);
  const VI shift_int = Max(exp - k51, k0);
  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
  const VI shifted = int52 << shift_int;
  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
  const VI magnitude = IfThenElse(in_range, restored, limit);
  return (magnitude ^ sign_mask) - sign_mask;
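// double -> int64 without a native conversion: the exponent is extracted from
// the bit pattern, the implicit-1 mantissa is shifted into integer position
// (shift_mnt / shift_int), out-of-range inputs are clamped to LimitsMax, and
// the sign is reapplied via (magnitude ^ sign_mask) - sign_mask.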
  const Full256<int32_t> di;

                                 const Vec128<float16_t> v) {
#ifdef HWY_DISABLE_F16C
  const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw});
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
                Set(df32, 1.0f / 16384 / 1024));
  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
  return Vec256<float>{_mm256_cvtph_ps(v.raw)};

                                 const Vec128<bfloat16_t> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
#if !defined(HWY_DISABLE_PCLMUL_AES)

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#define HWY_NATIVE_AES

#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;
#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;
#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;
#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;

template <typename T, typename T2>
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  return Load(d, lanes);
4225 #if HWY_TARGET <= HWY_AVX3
4230 template <
typename T>
4233 constexpr
size_t N = 32 /
sizeof(T);
4234 constexpr
size_t kNumBytes = (
N + 7) / 8;
4236 uint64_t mask_bits = 0;
4237 CopyBytes<kNumBytes>(bits, &mask_bits);
4240 mask_bits &= (1ull <<
N) - 1;
// ------------------------------ StoreMaskBits

// `bits` points to at least 8 writable bytes.
template <typename T>
HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
                             uint8_t* bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  CopyBytes<kNumBytes>(&mask.raw, bits);

  // Non-full byte: clear the undefined upper bits.
  if (N < 8) {
    const int mask = static_cast<int>((1ull << N) - 1);
    bits[0] = static_cast<uint8_t>(bits[0] & mask);
  }
  return kNumBytes;
}
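As a usage sketch of the two ops above (hypothetical helper, written as per-target code the way Highway *-inl.h users usually do):

// Compares eight int32 lanes and serializes the 8 result bits into one byte;
// LoadMaskBits can later restore the mask from that byte.
HWY_API size_t StoreLessThanMask(const int32_t* HWY_RESTRICT a,
                                 const int32_t* HWY_RESTRICT b,
                                 uint8_t* HWY_RESTRICT bits) {
  const Full256<int32_t> d;
  const auto mask = LoadU(d, a) < LoadU(d, b);  // 8 lanes -> 8 mask bits
  return StoreMaskBits(d, mask, bits);          // writes (8 + 7) / 8 = 1 byte
}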
// ------------------------------ Mask testing

template <typename T>
HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
  return PopCount(static_cast<uint64_t>(mask.raw));
}

template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
                               const Mask256<T> mask) {
  return mask.raw
             ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(mask.raw))
             : -1;
}
namespace detail {

template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
  // Only the lower 4 bits of the 8-bit mask register are valid.
  return (uint64_t{mask.raw} & 0xF) == 0;
}

}  // namespace detail

template <typename T>
HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
}

namespace detail {

template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFFFFFu;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFu;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFu;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
  return mask.raw == 0xFu;
}

}  // namespace detail

template <typename T>
HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
}
// ------------------------------ Compress

// 32-bit lanes: native AVX-512 compress.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
  return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
}

// 64-bit lanes: a table of nibble-packed lane indices plus a permute. The
// rejected lanes follow the accepted ones, so the result is a partition.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
  alignas(16) constexpr uint64_t packed_array[16] = {
      0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
      0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
      0x00001032, 0x00001320, 0x00000321, 0x00003210};

  // Lane i shifts its own 4-bit index field to the bottom; the permute
  // ignores the upper bits of each lane.
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du64;
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
  return TableLookupLanes(v, indices);
}

// ------------------------------ CompressNot

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
  return Compress(v, Not(mask));
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
  // Same as above, with the table generated for the negated mask.
  alignas(16) constexpr uint64_t packed_array[16] = {
      0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
      0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
      0x00003210, 0x00003201, 0x00003210, 0x00003210};

  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du64;
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
  return TableLookupLanes(v, indices);
}

// ------------------------------ CompressBlocksNot

HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  return CompressNot(v, mask);
}

// ------------------------------ CompressBits

template <typename T>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(Full256<T>(), bits));
}
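A usage sketch of the resulting op (hypothetical helper; assumes the four uint64 lanes of this target):

// Moves the lanes selected by (v != 0) to the front. Because Compress is a
// partition for 8-byte lanes, the rejected lanes follow the accepted ones.
HWY_API Vec256<uint64_t> FrontLoadNonZero(Vec256<uint64_t> v) {
  const Full256<uint64_t> d;
  return Compress(v, v != Zero(d));
}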
// ------------------------------ CompressStore

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> d,
                             T* HWY_RESTRICT unaligned) {
  const Rebind<uint16_t, decltype(d)> du;
  const auto vu = BitCast(du, v);  // (required for float16_t inputs)

  const uint64_t mask_bits{mask.raw};

#if HWY_TARGET == HWY_AVX3_DL  // VBMI2 provides a native 16-bit compressstore.
  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
#else
  // Without VBMI2, compress each 128-bit half via a byte-shuffle table and
  // store the halves back to back.
  const Half<decltype(du)> duh;
  const auto vL = LowerHalf(duh, vu);
  const auto vH = UpperHalf(duh, vu);

  const uint64_t mask_bitsL = mask_bits & 0xFF;
  const uint64_t mask_bitsH = mask_bits >> 8;

  const auto idxL = detail::IndicesForCompress16(mask_bitsL);
  const auto idxH = detail::IndicesForCompress16(mask_bitsH);

  const Half<decltype(d)> dh;
  StoreU(BitCast(dh, TableLookupBytes(vL, idxL)), dh, unaligned);
  StoreU(BitCast(dh, TableLookupBytes(vH, idxH)), dh,
         unaligned + PopCount(mask_bitsL));
#endif  // HWY_TARGET == HWY_AVX3_DL
  return PopCount(mask_bits);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask,
                             Full256<T> /* tag */,
                             T* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  // Workaround: MSAN does not regard the masked store as initializing memory.
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(T));
#endif
  return count;
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask,
                             Full256<T> /* tag */,
                             T* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(T));
#endif
  return count;
}

HWY_API size_t CompressStore(Vec256<float> v, Mask256<float> mask,
                             Full256<float> /* tag */,
                             float* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(float));
#endif
  return count;
}

HWY_API size_t CompressStore(Vec256<double> v, Mask256<double> mask,
                             Full256<double> /* tag */,
                             double* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(double));
#endif
  return count;
}

// ------------------------------ CompressBlendedStore

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  // The native (masked) compressed store already avoids overwriting.
  return CompressStore(v, m, d, unaligned);
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2: 16-bit compressstore is also native.
  return CompressStore(v, m, d, unaligned);
#else
  const size_t count = CountTrue(d, m);
  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(T));
#endif
  return count;
#endif
}

// ------------------------------ CompressBitsStore

template <typename T>
HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
                                 Full256<T> d, T* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}
#else  // AVX2

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

// 8-bit lanes: replicate the 32 mask bits so that each byte holds its own
// governing bit, then test that bit.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint32_t, decltype(d)> du32;
  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  const Repartition<uint64_t, decltype(d)> du64;
  alignas(32) constexpr uint64_t kRep8[4] = {
      0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
      0x0303030303030303ull};
  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));

  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) constexpr uint16_t kBit[16] = {
      1, 2, 4, 8, 16, 32, 64, 128,
      0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
}

}  // namespace detail
// `bits` points to at least 8 readable bytes, not all of which need be valid.
template <typename T>
HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
                                const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::LoadMaskBits256(d, mask_bits);
}
// ------------------------------ BitsFromMask

namespace detail {

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
  const Full256<T> d;
  const Full256<uint8_t> d8;
  const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
  // Avoid sign-extension: the intrinsic returns int.
  return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
#if HWY_ARCH_X86_64
  const Full256<T> d;
  const Full256<uint8_t> d8;
  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
  const uint64_t sign_bits8 = BitsFromMask(mask8);
  // Keep only the upper (sign) bit of each u16 lane.
  return _pext_u64(sign_bits8, 0xAAAAAAAAull);
#else
  // 32-bit builds lack _pext_u64: pack to i8, then movemask.
  const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
  // packs operates per 128-bit block; permute the valid qwords together.
  const auto compressed =
      _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
  return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
#endif  // HWY_ARCH_X86_64
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
  const Full256<T> d;
  const Full256<float> df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
  return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
  const Full256<T> d;
  const Full256<double> df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
  return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
}

}  // namespace detail
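For illustration, the effect of the PEXT constant above on a scalar value (standalone sketch; requires BMI2, as does the vector path):

#include <cstdint>
#include <immintrin.h>

// The byte mask contributes two identical bits per 16-bit lane; PEXT with
// 0xAAAAAAAA keeps only the odd (upper-byte) bits, one bit per u16 lane.
static uint64_t OneBitPerU16Lane(uint32_t byte_sign_bits) {
  return _pext_u64(byte_sign_bits, 0xAAAAAAAAull);
}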
// ------------------------------ StoreMaskBits

// `bits` points to at least 8 writable bytes.
template <typename T>
HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
                             uint8_t* bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  const uint64_t mask_bits = detail::BitsFromMask(mask);
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}

// ------------------------------ Mask testing

// All of these are implemented via detail::BitsFromMask. 16-bit lanes also
// have specializations of AllFalse/AllTrue/CountTrue that first narrow the
// mask to a byte mask so the cheaper 8-bit BitsFromMask path can be used.

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
  // Cheaper than PTEST, which is 2 uops / 3 cycle latency.
  return detail::BitsFromMask(mask) == 0;
}

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
  constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
  return detail::BitsFromMask(mask) == kAllBits;
}

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
  return PopCount(detail::BitsFromMask(mask));
}

template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
                               const Mask256<T> mask) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  return mask_bits
             ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(mask_bits))
             : -1;
}
// ------------------------------ Compress, CompressBits

namespace detail {

// Produces the permutation indices that move the mask=true lanes of a
// 32-bit-lane vector to the front. With 8 lanes there are 256 possible masks;
// each table entry packs eight 4-bit lane indices into one uint32_t.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
                                                uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> d32;
  alignas(16) constexpr uint32_t packed_array[256] = {
4740 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
4741 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
4742 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
4743 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
4744 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
4745 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
4746 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
4747 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
4748 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
4749 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
4750 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
4751 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
4752 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
4753 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
4754 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
4755 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
4756 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
4757 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
4758 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
4759 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
4760 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
4761 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
4762 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
4763 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
4764 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
4765 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
4766 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
4767 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
4768 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
4769 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
4770 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
4771 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
4772 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
4773 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
4774 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
4775 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
4776 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
4777 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
4778 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
4779 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
4780 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
4781 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
4782 0x10765432, 0x17654320, 0x07654321, 0x76543210};
  // Expand the packed entry: lane i shifts its own 4-bit field to the bottom.
  // No masking is needed because the permute ignores bits 3..31 of each lane.
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
}
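A scalar illustration of how one packed_array entry expands into eight lane indices (standalone sketch):

#include <cstdint>
#include <cstdio>

// Mirrors the vector code: lane i shifts the shared 32-bit entry right by 4*i
// and the permute reads only the low 3 bits of each resulting lane.
static void PrintIndices(uint32_t packed_entry) {
  for (int lane = 0; lane < 8; ++lane) {
    std::printf("%u ", (packed_entry >> (4 * lane)) & 0x7u);
  }
  std::printf("\n");
}
// Example: packed_array[2] == 0x76543201 (only bit 1 set in the mask) prints
// "1 0 2 3 4 5 6 7": lane 1 moves to the front, the remaining lanes follow.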
// 64-bit lanes: only 16 possible masks, so store all eight u32 indices (pairs
// of which select one 64-bit lane) per entry.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
                                                uint64_t mask_bits) {
  const Repartition<uint32_t, decltype(d)> d32;
  alignas(32) constexpr uint32_t u32_indices[128] = {
4803 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
4804 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
4805 2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
4806 0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
4807 0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
4808 2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
}
// As above, but for CompressNot: the table is generated for the negated mask.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
                                                   uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> d32;
  alignas(16) constexpr uint32_t packed_array[256] = {
4823 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
4824 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
4825 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
4826 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
4827 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
4828 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
4829 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
4830 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
4831 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
4832 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
4833 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
4834 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
4835 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
4836 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
4837 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
4838 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
4839 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
4840 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
4841 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
4842 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
4843 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
4844 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
4845 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
4846 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
4847 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
4848 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
4849 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
4850 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
4851 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
4852 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
4853 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
4854 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
4855 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
4856 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
4857 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
4858 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
4859 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
4860 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
4861 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
4862 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
4863 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
4864 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
4865 0x76543210, 0x76543201, 0x76543210, 0x76543210};
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
                                                   uint64_t mask_bits) {
  const Repartition<uint32_t, decltype(d)> d32;
  alignas(32) constexpr uint32_t u32_indices[128] = {
4886 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
4887 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
4888 0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
4889 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
4890 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
4891 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const Full256<T> d;
  const Repartition<uint32_t, decltype(d)> du32;
  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
  const auto indices = IndicesFromBits(d, mask_bits);
  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}

// 16-bit lanes: a 2^16-entry table is infeasible, so compress each 128-bit
// half independently and splice the results back together.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto vu16 = BitCast(du, v);  // (required for float16_t)
  const Half<decltype(du)> duh;
  const auto half0 = LowerHalf(duh, vu16);
  const auto half1 = UpperHalf(duh, vu16);

  const uint64_t mask_bits0 = mask_bits & 0xFF;
  const uint64_t mask_bits1 = mask_bits >> 8;
  const auto idx0 = detail::IndicesForCompress16(mask_bits0);
  const auto idx1 = detail::IndicesForCompress16(mask_bits1);
  const auto compressed0 = TableLookupBytes(half0, idx0);
  const auto compressed1 = TableLookupBytes(half1, idx1);

  // Lay out the mask=true lanes from left to right: lower half first, then
  // the upper half starting right after the num_true0 valid lanes.
  alignas(32) uint16_t all_true[16] = {};
  const size_t num_true0 = PopCount(mask_bits0);
  Store(compressed0, duh, all_true);
  StoreU(compressed1, duh, all_true + num_true0);

  // Lay out the mask=false lanes from right to left so the combined result is
  // a partition of the input.
  alignas(32) uint16_t all_false[16] = {};
  const size_t num_true1 = PopCount(mask_bits1);
  Store(compressed1, duh, all_false + 8);
  StoreU(compressed0, duh, all_false + num_true1);

  const auto mask = FirstN(du, num_true0 + num_true1);
  return BitCast(d,
                 IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
  const Full256<T> d;
  const Repartition<uint32_t, decltype(d)> du32;
  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
  const auto indices = IndicesFromNotBits(d, mask_bits);
  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}

// 16-bit lanes: negate the (16 valid) mask bits and reuse Compress above.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
  return Compress(v, mask_bits ^ 0xFFFFull);
}

}  // namespace detail

template <typename T>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
  return detail::Compress(v, detail::BitsFromMask(m));
}

template <typename T>
HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
  return detail::CompressNot(v, detail::BitsFromMask(m));
}

HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  return CompressNot(v, mask);
}

// ------------------------------ CompressBits

template <typename T>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::Compress(v, mask_bits);
}
// ------------------------------ CompressStore, CompressBlendedStore

template <typename T>
HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                             T* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  // MSAN workaround: mark the written region as initialized.
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(T));
#endif
  return count;
}

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(T));
#endif
  return count;
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  const Vec256<T> compressed = detail::Compress(v, mask_bits);

#if HWY_MEM_OPS_MIGHT_FAULT
  // There is no native 16-bit masked store on AVX2; if a partial store might
  // fault, go through a stack buffer and copy only `count` lanes.
  alignas(32) T buf[16];
  Store(compressed, d, buf);
  memcpy(unaligned, buf, count * sizeof(T));
#else
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(T));
#endif
#endif  // HWY_MEM_OPS_MIGHT_FAULT
  return count;
}

// ------------------------------ CompressBitsStore

template <typename T>
HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
                                 Full256<T> d, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }
  const size_t count = PopCount(mask_bits);

  StoreU(detail::Compress(v, mask_bits), d, unaligned);
#if HWY_IS_MSAN
  __msan_unpoison(unaligned, count * sizeof(T));
#endif
  return count;
}

#endif  // HWY_TARGET <= HWY_AVX3
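A usage sketch for the store variants (hypothetical helper; per-target code as usual):

// Appends the non-negative elements of `in` to `out` and returns how many
// were written. CompressBlendedStore never writes past the valid lanes.
HWY_API size_t CopyNonNegative(const float* HWY_RESTRICT in, size_t count,
                               float* HWY_RESTRICT out) {
  const Full256<float> d;
  const size_t N = Lanes(d);  // 8 floats per vector
  size_t written = 0;
  size_t i = 0;
  for (; i + N <= count; i += N) {
    const auto v = LoadU(d, in + i);
    written += CompressBlendedStore(v, v >= Zero(d), d, out + written);
  }
  for (; i < count; ++i) {  // scalar tail
    if (in[i] >= 0.0f) out[written++] = in[i];
  }
  return written;
}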
// ------------------------------ LoadInterleaved3/4, StoreInterleaved2/3/4

namespace detail {

// The generic LoadInterleaved*/StoreInterleaved* in generic_ops-inl.h call
// these block-wise helpers. LoadTransposedBlocks3/4 load three resp. four
// unaligned vectors and regroup their 128-bit blocks (LoadU plus the Concat*
// ops); each begins with
//   constexpr size_t N = 32 / sizeof(T);
// to compute the lane stride between the loads.

// StoreTransposedBlocks2: blocks A1 A0 and B1 B0 are written as B0 A0, B1 A1.
template <typename T>
HWY_API void StoreTransposedBlocks2(const Vec256<T> A, const Vec256<T> B,
                                    const Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  const auto out0 = ConcatLowerLower(d, B, A);
  const auto out1 = ConcatUpperUpper(d, B, A);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
}

// StoreTransposedBlocks3: written as B0 A0, A1 C0, C1 B1.
template <typename T>
HWY_API void StoreTransposedBlocks3(const Vec256<T> A, const Vec256<T> B,
                                    const Vec256<T> C, const Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  const auto out0 = ConcatLowerLower(d, B, A);
  const auto out1 = ConcatUpperLower(d, A, C);
  const auto out2 = ConcatUpperUpper(d, C, B);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);
}

// StoreTransposedBlocks4: written as B0 A0, D0 C0, B1 A1, D1 C1.
template <typename T>
HWY_API void StoreTransposedBlocks4(const Vec256<T> A, const Vec256<T> B,
                                    const Vec256<T> C, const Vec256<T> D,
                                    const Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  const auto out0 = ConcatLowerLower(d, B, A);
  const auto out1 = ConcatLowerLower(d, D, C);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  const auto out2 = ConcatUpperUpper(d, B, A);
  const auto out3 = ConcatUpperUpper(d, D, C);
  StoreU(out2, d, unaligned + 2 * N);
  StoreU(out3, d, unaligned + 3 * N);
}

}  // namespace detail
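These block helpers back the public (Load|Store)Interleaved ops in generic_ops-inl.h; a usage sketch of StoreInterleaved3 (hypothetical helper):

// Packs separate R, G, B planes into interleaved RGB triples.
HWY_API void PlanarToRGB(const uint8_t* HWY_RESTRICT r,
                         const uint8_t* HWY_RESTRICT g,
                         const uint8_t* HWY_RESTRICT b, size_t count,
                         uint8_t* HWY_RESTRICT rgb) {
  const Full256<uint8_t> d;
  const size_t N = Lanes(d);  // 32 bytes per vector
  size_t i = 0;
  for (; i + N <= count; i += N) {
    StoreInterleaved3(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i), d,
                      rgb + 3 * i);
  }
  for (; i < count; ++i) {  // scalar tail
    rgb[3 * i + 0] = r[i];
    rgb[3 * i + 1] = g[i];
    rgb[3 * i + 2] = b[i];
  }
}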
// ------------------------------ Reductions

namespace detail {

// Returns the sum/min/max of all lanes, broadcast into each lane. The callers
// below have already combined the two 128-bit blocks, so only an in-block
// reduction remains (same logic as the 128-bit implementation).

template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = v3210 + v1032;
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Min(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Max(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}

template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return v10 + v01;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Max(v10, v01);
}
// u16/i16: reduce pairs of 16-bit lanes via the 32-bit reductions, then
// broadcast the result back into both halves of each 32-bit lane.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
  const Repartition<int32_t, Full256<T>> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
  // Also broadcast into the odd lanes.
  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
  const Repartition<int32_t, Full256<T>> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
  return BitCast(Full256<T>(), Or(max, ShiftLeft<16>(max)));
}

}  // namespace detail

// Supported for {uif}{32,64} and {ui}16. Returns the broadcast result.
template <typename T>
HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);  // swap the two blocks
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
}
template <typename T>
HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
}
template <typename T>
HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
}
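Finally, a usage sketch of the reductions (hypothetical helper):

// Sums `count` floats. SumOfLanes broadcasts the total across all lanes, so a
// single GetLane extracts it after the loop.
HWY_API float SumArray(const float* HWY_RESTRICT p, size_t count) {
  const Full256<float> d;
  const size_t N = Lanes(d);
  auto sum = Zero(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    sum = sum + LoadU(d, p + i);
  }
  float total = GetLane(SumOfLanes(d, sum));
  for (; i < count; ++i) total += p[i];  // scalar tail
  return total;
}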