x86_256-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
17 // compiling for that target.
18 // External include guard in highway.h - see comment there.
19 
20 // WARNING: most operations do not cross 128-bit block boundaries. In
21 // particular, "Broadcast", pack and zip behavior may be surprising.
22 
23 // Must come before HWY_COMPILER_CLANGCL
24 #include <immintrin.h> // AVX2+
25 
26 #include "hwy/base.h"
27 
28 #if HWY_COMPILER_CLANGCL
29 // Including <immintrin.h> should be enough, but Clang's headers helpfully skip
30 // including these headers when _MSC_VER is defined, like when using clang-cl.
31 // Include these directly here.
32 #include <avxintrin.h>
33 // avxintrin defines __m256i and must come before avx2intrin.
34 #include <avx2intrin.h>
35 #include <bmi2intrin.h> // _pext_u64
36 #include <f16cintrin.h>
37 #include <fmaintrin.h>
38 #include <smmintrin.h>
39 #endif // HWY_COMPILER_CLANGCL
40 
41 #include <stddef.h>
42 #include <stdint.h>
43 
44 #if HWY_IS_MSAN
45 #include <sanitizer/msan_interface.h>
46 #endif
47 
48 // For half-width vectors. Already includes base.h and shared-inl.h.
49 #include "hwy/ops/x86_128-inl.h"
50 
51 HWY_BEFORE_NAMESPACE();
52 namespace hwy {
53 namespace HWY_NAMESPACE {
54 namespace detail {
55 
56 template <typename T>
57 struct Raw256 {
58  using type = __m256i;
59 };
60 template <>
61 struct Raw256<float> {
62  using type = __m256;
63 };
64 template <>
65 struct Raw256<double> {
66  using type = __m256d;
67 };
68 
69 } // namespace detail
70 
71 template <typename T>
72 class Vec256 {
73  using Raw = typename detail::Raw256<T>::type;
74 
75  public:
76  // Compound assignment. Only usable if there is a corresponding non-member
77  // binary operator overload. For example, only f32 and f64 support division.
78  HWY_INLINE Vec256& operator*=(const Vec256 other) {
79  return *this = (*this * other);
80  }
81  HWY_INLINE Vec256& operator/=(const Vec256 other) {
82  return *this = (*this / other);
83  }
84  HWY_INLINE Vec256& operator+=(const Vec256 other) {
85  return *this = (*this + other);
86  }
87  HWY_INLINE Vec256& operator-=(const Vec256 other) {
88  return *this = (*this - other);
89  }
90  HWY_INLINE Vec256& operator&=(const Vec256 other) {
91  return *this = (*this & other);
92  }
93  HWY_INLINE Vec256& operator|=(const Vec256 other) {
94  return *this = (*this | other);
95  }
96  HWY_INLINE Vec256& operator^=(const Vec256 other) {
97  return *this = (*this ^ other);
98  }
99 
100  Raw raw;
101 };
102 
103 #if HWY_TARGET <= HWY_AVX3
104 
105 namespace detail {
106 
107 // Template arg: sizeof(lane type)
108 template <size_t size>
109 struct RawMask256 {};
110 template <>
111 struct RawMask256<1> {
112  using type = __mmask32;
113 };
114 template <>
115 struct RawMask256<2> {
116  using type = __mmask16;
117 };
118 template <>
119 struct RawMask256<4> {
120  using type = __mmask8;
121 };
122 template <>
123 struct RawMask256<8> {
124  using type = __mmask8;
125 };
126 
127 } // namespace detail
128 
129 template <typename T>
130 struct Mask256 {
131  using Raw = typename detail::RawMask256<sizeof(T)>::type;
132 
133  static Mask256<T> FromBits(uint64_t mask_bits) {
134  return Mask256<T>{static_cast<Raw>(mask_bits)};
135  }
136 
137  Raw raw;
138 };
139 
140 #else // AVX2
141 
142 // FF..FF or 0.
143 template <typename T>
144 struct Mask256 {
145  typename detail::Raw256<T>::type raw;
146 };
147 
148 #endif // HWY_TARGET <= HWY_AVX3
149 
150 // ------------------------------ BitCast
151 
152 namespace detail {
153 
154 HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
155 HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); }
156 HWY_INLINE __m256i BitCastToInteger(__m256d v) {
157  return _mm256_castpd_si256(v);
158 }
159 
160 template <typename T>
161 HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
162  return Vec256<uint8_t>{BitCastToInteger(v.raw)};
163 }
164 
165 // Cannot rely on function overloading because return types differ.
166 template <typename T>
167 struct BitCastFromInteger256 {
168  HWY_INLINE __m256i operator()(__m256i v) { return v; }
169 };
170 template <>
171 struct BitCastFromInteger256<float> {
172  HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); }
173 };
174 template <>
175 struct BitCastFromInteger256<double> {
176  HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); }
177 };
178 
179 template <typename T>
180 HWY_INLINE Vec256<T> BitCastFromByte(Full256<T> /* tag */, Vec256<uint8_t> v) {
181  return Vec256<T>{BitCastFromInteger256<T>()(v.raw)};
182 }
183 
184 } // namespace detail
185 
186 template <typename T, typename FromT>
187 HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
188  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
189 }
190 
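// Example (illustrative sketch, not part of the original header; assumes it
// runs inside a function compiled for this target after including
// hwy/highway.h): BitCast reinterprets the 256 vector bits as another lane
// type of the same total width, e.g. to inspect float bit patterns.
//
//   const Full256<float> df;
//   const Full256<uint32_t> du;
//   const Vec256<float> vf = Set(df, -0.0f);        // lane pattern 0x80000000
//   const Vec256<uint32_t> bits = BitCast(du, vf);  // same bits, u32 lanes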
191 // ------------------------------ Set
192 
193 // Returns an all-zero vector.
194 template <typename T>
195 HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
196  return Vec256<T>{_mm256_setzero_si256()};
197 }
198 HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
199  return Vec256<float>{_mm256_setzero_ps()};
200 }
201 HWY_API Vec256<double> Zero(Full256<double> /* tag */) {
202  return Vec256<double>{_mm256_setzero_pd()};
203 }
204 
205 // Returns a vector with all lanes set to "t".
206 HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
207  return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
208 }
209 HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
210  return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
211 }
212 HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
213  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
214 }
215 HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
216  return Vec256<uint64_t>{
217  _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
218 }
219 HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
220  return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
221 }
222 HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
223  return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
224 }
225 HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
226  return Vec256<int32_t>{_mm256_set1_epi32(t)};
227 }
228 HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
229  return Vec256<int64_t>{
230  _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
231 }
232 HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
233  return Vec256<float>{_mm256_set1_ps(t)};
234 }
235 HWY_API Vec256<double> Set(Full256<double> /* tag */, const double t) {
236  return Vec256<double>{_mm256_set1_pd(t)};
237 }
238 
239 HWY_DIAGNOSTICS(push)
240 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
241 
242 // Returns a vector with uninitialized elements.
243 template <typename T>
244 HWY_API Vec256<T> Undefined(Full256<T> /* tag */) {
245  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
246  // generate an XOR instruction.
247  return Vec256<T>{_mm256_undefined_si256()};
248 }
249 HWY_API Vec256<float> Undefined(Full256<float> /* tag */) {
250  return Vec256<float>{_mm256_undefined_ps()};
251 }
252 HWY_API Vec256<double> Undefined(Full256<double> /* tag */) {
253  return Vec256<double>{_mm256_undefined_pd()};
254 }
255 
256 HWY_DIAGNOSTICS(pop)
257 
258 // ================================================== LOGICAL
259 
260 // ------------------------------ And
261 
262 template <typename T>
263 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
264  return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
265 }
266 
267 HWY_API Vec256<float> And(const Vec256<float> a, const Vec256<float> b) {
268  return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
269 }
270 HWY_API Vec256<double> And(const Vec256<double> a, const Vec256<double> b) {
271  return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
272 }
273 
274 // ------------------------------ AndNot
275 
276 // Returns ~not_mask & mask.
277 template <typename T>
278 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
279  return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
280 }
281 HWY_API Vec256<float> AndNot(const Vec256<float> not_mask,
282  const Vec256<float> mask) {
283  return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
284 }
285 HWY_API Vec256<double> AndNot(const Vec256<double> not_mask,
286  const Vec256<double> mask) {
287  return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)};
288 }
289 
290 // ------------------------------ Or
291 
292 template <typename T>
293 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
294  return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
295 }
296 
297 HWY_API Vec256<float> Or(const Vec256<float> a, const Vec256<float> b) {
298  return Vec256<float>{_mm256_or_ps(a.raw, b.raw)};
299 }
300 HWY_API Vec256<double> Or(const Vec256<double> a, const Vec256<double> b) {
301  return Vec256<double>{_mm256_or_pd(a.raw, b.raw)};
302 }
303 
304 // ------------------------------ Xor
305 
306 template <typename T>
307 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
308  return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
309 }
310 
311 HWY_API Vec256<float> Xor(const Vec256<float> a, const Vec256<float> b) {
312  return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)};
313 }
314 HWY_API Vec256<double> Xor(const Vec256<double> a, const Vec256<double> b) {
315  return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)};
316 }
317 
318 // ------------------------------ Not
319 
320 template <typename T>
321 HWY_API Vec256<T> Not(const Vec256<T> v) {
322  using TU = MakeUnsigned<T>;
323 #if HWY_TARGET <= HWY_AVX3
324  const __m256i vu = BitCast(Full256<TU>(), v).raw;
325  return BitCast(Full256<T>(),
326  Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
327 #else
328  return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
329 #endif
330 }
331 
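// Note on the 0x55 immediate above (explanatory addition, not original text):
// the ternary-logic immediate is a truth table indexed by (A<<2)|(B<<1)|C.
// With all three operands equal to v, only rows 0b000 and 0b111 are used:
//   v_bit = 0 -> index 0 -> bit 0 of 0x55 (0b01010101) = 1
//   v_bit = 1 -> index 7 -> bit 7 of 0x55               = 0
// so the output is the bitwise NOT of v.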
332 // ------------------------------ Or3
333 
334 template <typename T>
335 HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
336 #if HWY_TARGET <= HWY_AVX3
337  const Full256<T> d;
338  const RebindToUnsigned<decltype(d)> du;
339  using VU = VFromD<decltype(du)>;
340  const __m256i ret = _mm256_ternarylogic_epi64(
341  BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
342  return BitCast(d, VU{ret});
343 #else
344  return Or(o1, Or(o2, o3));
345 #endif
346 }
347 
348 // ------------------------------ OrAnd
349 
350 template <typename T>
351 HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
352 #if HWY_TARGET <= HWY_AVX3
353  const Full256<T> d;
354  const RebindToUnsigned<decltype(d)> du;
355  using VU = VFromD<decltype(du)>;
356  const __m256i ret = _mm256_ternarylogic_epi64(
357  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
358  return BitCast(d, VU{ret});
359 #else
360  return Or(o, And(a1, a2));
361 #endif
362 }
363 
364 // ------------------------------ IfVecThenElse
365 
366 template <typename T>
367 HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
368 #if HWY_TARGET <= HWY_AVX3
369  const Full256<T> d;
370  const RebindToUnsigned<decltype(d)> du;
371  using VU = VFromD<decltype(du)>;
372  return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
373  BitCast(du, yes).raw,
374  BitCast(du, no).raw, 0xCA)});
375 #else
376  return IfThenElse(MaskFromVec(mask), yes, no);
377 #endif
378 }
379 
380 // ------------------------------ Operator overloads (internal-only if float)
381 
382 template <typename T>
383 HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
384  return And(a, b);
385 }
386 
387 template <typename T>
388 HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
389  return Or(a, b);
390 }
391 
392 template <typename T>
393 HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
394  return Xor(a, b);
395 }
396 
397 // ------------------------------ PopulationCount
398 
399 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
400 #if HWY_TARGET == HWY_AVX3_DL
401 
402 #ifdef HWY_NATIVE_POPCNT
403 #undef HWY_NATIVE_POPCNT
404 #else
405 #define HWY_NATIVE_POPCNT
406 #endif
407 
408 namespace detail {
409 
410 template <typename T>
411 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
412  return Vec256<T>{_mm256_popcnt_epi8(v.raw)};
413 }
414 template <typename T>
415 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
416  return Vec256<T>{_mm256_popcnt_epi16(v.raw)};
417 }
418 template <typename T>
419 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
420  return Vec256<T>{_mm256_popcnt_epi32(v.raw)};
421 }
422 template <typename T>
423 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
424  return Vec256<T>{_mm256_popcnt_epi64(v.raw)};
425 }
426 
427 } // namespace detail
428 
429 template <typename T>
430 HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
431  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
432 }
433 
434 #endif // HWY_TARGET == HWY_AVX3_DL
435 
436 // ================================================== SIGN
437 
438 // ------------------------------ CopySign
439 
440 template <typename T>
441 HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
442  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
443 
444  const Full256<T> d;
445  const auto msb = SignBit(d);
446 
447 #if HWY_TARGET <= HWY_AVX3
448  const Rebind<MakeUnsigned<T>, decltype(d)> du;
449  // Truth table for msb, magn, sign | bitwise msb ? sign : magn
450  // 0 0 0 | 0
451  // 0 0 1 | 0
452  // 0 1 0 | 1
453  // 0 1 1 | 1
454  // 1 0 0 | 0
455  // 1 0 1 | 1
456  // 1 1 0 | 0
457  // 1 1 1 | 1
458  // The lane size does not matter because we are not using predication.
459  const __m256i out = _mm256_ternarylogic_epi32(
460  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
461  return BitCast(d, decltype(Zero(du)){out});
462 #else
463  return Or(AndNot(msb, magn), And(msb, sign));
464 #endif
465 }
466 
467 template <typename T>
468 HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
469 #if HWY_TARGET <= HWY_AVX3
470  // AVX3 can also handle abs < 0, so no extra action needed.
471  return CopySign(abs, sign);
472 #else
473  return Or(abs, And(SignBit(Full256<T>()), sign));
474 #endif
475 }
476 
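// Example (illustrative sketch, not part of the original header; assumes it
// runs inside a function compiled for this target): CopySign combines the
// magnitude of one vector with the sign of another.
//
//   const Full256<float> d;
//   const Vec256<float> magn = Set(d, 2.0f);
//   const Vec256<float> sign = Set(d, -1.0f);
//   const Vec256<float> r = CopySign(magn, sign);  // every lane is -2.0f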
477 // ================================================== MASK
478 
479 #if HWY_TARGET <= HWY_AVX3
480 
481 // ------------------------------ IfThenElse
482 
483 // Returns mask ? b : a.
484 
485 namespace detail {
486 
487 // Templates for signed/unsigned integer of a particular size.
488 template <typename T>
489 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
490  Vec256<T> yes, Vec256<T> no) {
491  return Vec256<T>{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
492 }
493 template <typename T>
494 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
495  Vec256<T> yes, Vec256<T> no) {
496  return Vec256<T>{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
497 }
498 template <typename T>
499 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
500  Vec256<T> yes, Vec256<T> no) {
501  return Vec256<T>{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
502 }
503 template <typename T>
504 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
505  Vec256<T> yes, Vec256<T> no) {
506  return Vec256<T>{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
507 }
508 
509 } // namespace detail
510 
511 template <typename T>
512 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
513  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
514 }
515 HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
516  Vec256<float> no) {
517  return Vec256<float>{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)};
518 }
519 HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
520  Vec256<double> no) {
521  return Vec256<double>{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)};
522 }
523 
524 namespace detail {
525 
526 template <typename T>
527 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
528  Vec256<T> yes) {
529  return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)};
530 }
531 template <typename T>
532 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
533  Vec256<T> yes) {
534  return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)};
535 }
536 template <typename T>
537 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
538  Vec256<T> yes) {
539  return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)};
540 }
541 template <typename T>
542 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
543  Vec256<T> yes) {
544  return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)};
545 }
546 
547 } // namespace detail
548 
549 template <typename T>
550 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
551  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
552 }
553 HWY_API Vec256<float> IfThenElseZero(Mask256<float> mask, Vec256<float> yes) {
554  return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)};
555 }
556 HWY_API Vec256<double> IfThenElseZero(Mask256<double> mask,
557  Vec256<double> yes) {
558  return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)};
559 }
560 
561 namespace detail {
562 
563 template <typename T>
564 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
565  Vec256<T> no) {
566  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
567  return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
568 }
569 template <typename T>
570 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
571  Vec256<T> no) {
572  return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
573 }
574 template <typename T>
575 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
576  Vec256<T> no) {
577  return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
578 }
579 template <typename T>
580 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
581  Vec256<T> no) {
582  return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
583 }
584 
585 } // namespace detail
586 
587 template <typename T>
588 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
589  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
590 }
591 HWY_API Vec256<float> IfThenZeroElse(Mask256<float> mask, Vec256<float> no) {
592  return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
593 }
594 HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
595  return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
596 }
597 
598 template <typename T, HWY_IF_FLOAT(T)>
599 HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
600  // AVX3 MaskFromVec only looks at the MSB
601  return IfThenZeroElse(MaskFromVec(v), v);
602 }
603 
604 // ------------------------------ Mask logical
605 
606 namespace detail {
607 
608 template <typename T>
609 HWY_INLINE Mask256<T> And(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
610  const Mask256<T> b) {
611 #if HWY_COMPILER_HAS_MASK_INTRINSICS
612  return Mask256<T>{_kand_mask32(a.raw, b.raw)};
613 #else
614  return Mask256<T>{static_cast<__mmask32>(a.raw & b.raw)};
615 #endif
616 }
617 template <typename T>
618 HWY_INLINE Mask256<T> And(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
619  const Mask256<T> b) {
620 #if HWY_COMPILER_HAS_MASK_INTRINSICS
621  return Mask256<T>{_kand_mask16(a.raw, b.raw)};
622 #else
623  return Mask256<T>{static_cast<__mmask16>(a.raw & b.raw)};
624 #endif
625 }
626 template <typename T>
627 HWY_INLINE Mask256<T> And(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
628  const Mask256<T> b) {
629 #if HWY_COMPILER_HAS_MASK_INTRINSICS
630  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
631 #else
632  return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
633 #endif
634 }
635 template <typename T>
636 HWY_INLINE Mask256<T> And(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
637  const Mask256<T> b) {
638 #if HWY_COMPILER_HAS_MASK_INTRINSICS
639  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
640 #else
641  return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
642 #endif
643 }
644 
645 template <typename T>
646 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
647  const Mask256<T> b) {
648 #if HWY_COMPILER_HAS_MASK_INTRINSICS
649  return Mask256<T>{_kandn_mask32(a.raw, b.raw)};
650 #else
651  return Mask256<T>{static_cast<__mmask32>(~a.raw & b.raw)};
652 #endif
653 }
654 template <typename T>
655 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
656  const Mask256<T> b) {
657 #if HWY_COMPILER_HAS_MASK_INTRINSICS
658  return Mask256<T>{_kandn_mask16(a.raw, b.raw)};
659 #else
660  return Mask256<T>{static_cast<__mmask16>(~a.raw & b.raw)};
661 #endif
662 }
663 template <typename T>
664 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
665  const Mask256<T> b) {
666 #if HWY_COMPILER_HAS_MASK_INTRINSICS
667  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
668 #else
669  return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
670 #endif
671 }
672 template <typename T>
673 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
674  const Mask256<T> b) {
675 #if HWY_COMPILER_HAS_MASK_INTRINSICS
676  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
677 #else
678  return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
679 #endif
680 }
681 
682 template <typename T>
683 HWY_INLINE Mask256<T> Or(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
684  const Mask256<T> b) {
685 #if HWY_COMPILER_HAS_MASK_INTRINSICS
686  return Mask256<T>{_kor_mask32(a.raw, b.raw)};
687 #else
688  return Mask256<T>{static_cast<__mmask32>(a.raw | b.raw)};
689 #endif
690 }
691 template <typename T>
692 HWY_INLINE Mask256<T> Or(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
693  const Mask256<T> b) {
694 #if HWY_COMPILER_HAS_MASK_INTRINSICS
695  return Mask256<T>{_kor_mask16(a.raw, b.raw)};
696 #else
697  return Mask256<T>{static_cast<__mmask16>(a.raw | b.raw)};
698 #endif
699 }
700 template <typename T>
701 HWY_INLINE Mask256<T> Or(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
702  const Mask256<T> b) {
703 #if HWY_COMPILER_HAS_MASK_INTRINSICS
704  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
705 #else
706  return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
707 #endif
708 }
709 template <typename T>
710 HWY_INLINE Mask256<T> Or(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
711  const Mask256<T> b) {
712 #if HWY_COMPILER_HAS_MASK_INTRINSICS
713  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
714 #else
715  return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
716 #endif
717 }
718 
719 template <typename T>
720 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
721  const Mask256<T> b) {
722 #if HWY_COMPILER_HAS_MASK_INTRINSICS
723  return Mask256<T>{_kxor_mask32(a.raw, b.raw)};
724 #else
725  return Mask256<T>{static_cast<__mmask32>(a.raw ^ b.raw)};
726 #endif
727 }
728 template <typename T>
729 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
730  const Mask256<T> b) {
731 #if HWY_COMPILER_HAS_MASK_INTRINSICS
732  return Mask256<T>{_kxor_mask16(a.raw, b.raw)};
733 #else
734  return Mask256<T>{static_cast<__mmask16>(a.raw ^ b.raw)};
735 #endif
736 }
737 template <typename T>
738 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
739  const Mask256<T> b) {
740 #if HWY_COMPILER_HAS_MASK_INTRINSICS
741  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
742 #else
743  return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
744 #endif
745 }
746 template <typename T>
747 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
748  const Mask256<T> b) {
749 #if HWY_COMPILER_HAS_MASK_INTRINSICS
750  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
751 #else
752  return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
753 #endif
754 }
755 
756 } // namespace detail
757 
758 template <typename T>
759 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
760  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
761 }
762 
763 template <typename T>
764 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
765  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
766 }
767 
768 template <typename T>
769 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
770  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
771 }
772 
773 template <typename T>
774 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
775  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
776 }
777 
778 template <typename T>
779 HWY_API Mask256<T> Not(const Mask256<T> m) {
780  // Flip only the valid bits.
781  constexpr size_t N = 32 / sizeof(T);
782  return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
783 }
784 
785 #else // AVX2
786 
787 // ------------------------------ Mask
788 
789 // Mask and Vec are the same (true = FF..FF).
790 template <typename T>
791 HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
792  return Mask256<T>{v.raw};
793 }
794 
795 template <typename T>
796 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
797  return Vec256<T>{v.raw};
798 }
799 
800 template <typename T>
801 HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
802  return Vec256<T>{v.raw};
803 }
804 
805 // ------------------------------ IfThenElse
806 
807 // mask ? yes : no
808 template <typename T>
809 HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
810  const Vec256<T> no) {
811  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
812 }
813 HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
814  const Vec256<float> yes,
815  const Vec256<float> no) {
816  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
817 }
818 HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
819  const Vec256<double> yes,
820  const Vec256<double> no) {
821  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
822 }
823 
824 // mask ? yes : 0
825 template <typename T>
826 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
827  return yes & VecFromMask(Full256<T>(), mask);
828 }
829 
830 // mask ? 0 : no
831 template <typename T>
832 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
833  return AndNot(VecFromMask(Full256<T>(), mask), no);
834 }
835 
836 template <typename T, HWY_IF_FLOAT(T)>
837 HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
838  const auto zero = Zero(Full256<T>());
839  // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
840  return IfThenElse(MaskFromVec(v), zero, v);
841 }
842 
843 // ------------------------------ Mask logical
844 
845 template <typename T>
846 HWY_API Mask256<T> Not(const Mask256<T> m) {
847  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
848 }
849 
850 template <typename T>
851 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
852  const Full256<T> d;
853  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
854 }
855 
856 template <typename T>
857 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
858  const Full256<T> d;
859  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
860 }
861 
862 template <typename T>
863 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
864  const Full256<T> d;
865  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
866 }
867 
868 template <typename T>
869 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
870  const Full256<T> d;
871  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
872 }
873 
874 #endif // HWY_TARGET <= HWY_AVX3
875 
876 // ================================================== COMPARE
877 
878 #if HWY_TARGET <= HWY_AVX3
879 
880 // Comparisons set a mask bit to 1 if the condition is true, else 0.
881 
882 template <typename TFrom, typename TTo>
883 HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
884  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
885  return Mask256<TTo>{m.raw};
886 }
887 
888 namespace detail {
889 
890 template <typename T>
891 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec256<T> v,
892  const Vec256<T> bit) {
893  return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)};
894 }
895 template <typename T>
896 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec256<T> v,
897  const Vec256<T> bit) {
898  return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)};
899 }
900 template <typename T>
901 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec256<T> v,
902  const Vec256<T> bit) {
903  return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)};
904 }
905 template <typename T>
906 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec256<T> v,
907  const Vec256<T> bit) {
908  return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)};
909 }
910 
911 } // namespace detail
912 
913 template <typename T>
914 HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
915  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
916  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
917 }
918 
919 // ------------------------------ Equality
920 
921 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
922 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
923  return Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
924 }
925 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
926 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
927  return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
928 }
929 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
930 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
931  return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
932 }
933 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
934 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
935  return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
936 }
937 
938 HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
939  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
940 }
941 
942 HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
943  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
944 }
945 
946 // ------------------------------ Inequality
947 
948 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
949 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
950  return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
951 }
952 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
953 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
954  return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
955 }
956 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
957 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
958  return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
959 }
960 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
961 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
962  return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
963 }
964 
965 HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
966  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
967 }
968 
969 HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
970  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
971 }
972 
973 // ------------------------------ Strict inequality
974 
975 HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
976  return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
977 }
978 HWY_API Mask256<int16_t> operator>(Vec256<int16_t> a, Vec256<int16_t> b) {
979  return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
980 }
981 HWY_API Mask256<int32_t> operator>(Vec256<int32_t> a, Vec256<int32_t> b) {
982  return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
983 }
984 HWY_API Mask256<int64_t> operator>(Vec256<int64_t> a, Vec256<int64_t> b) {
985  return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
986 }
987 
988 HWY_API Mask256<uint8_t> operator>(Vec256<uint8_t> a, Vec256<uint8_t> b) {
989  return Mask256<uint8_t>{_mm256_cmpgt_epu8_mask(a.raw, b.raw)};
990 }
991 HWY_API Mask256<uint16_t> operator>(const Vec256<uint16_t> a,
992  const Vec256<uint16_t> b) {
993  return Mask256<uint16_t>{_mm256_cmpgt_epu16_mask(a.raw, b.raw)};
994 }
995 HWY_API Mask256<uint32_t> operator>(const Vec256<uint32_t> a,
996  const Vec256<uint32_t> b) {
997  return Mask256<uint32_t>{_mm256_cmpgt_epu32_mask(a.raw, b.raw)};
998 }
999 HWY_API Mask256<uint64_t> operator>(const Vec256<uint64_t> a,
1000  const Vec256<uint64_t> b) {
1001  return Mask256<uint64_t>{_mm256_cmpgt_epu64_mask(a.raw, b.raw)};
1002 }
1003 
1004 HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
1005  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1006 }
1007 HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
1008  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1009 }
1010 
1011 // ------------------------------ Weak inequality
1012 
1013 HWY_API Mask256<float> operator>=(Vec256<float> a, Vec256<float> b) {
1014  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1015 }
1016 HWY_API Mask256<double> operator>=(Vec256<double> a, Vec256<double> b) {
1017  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1018 }
1019 
1020 // ------------------------------ Mask
1021 
1022 namespace detail {
1023 
1024 template <typename T>
1025 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256<T> v) {
1026  return Mask256<T>{_mm256_movepi8_mask(v.raw)};
1027 }
1028 template <typename T>
1029 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256<T> v) {
1030  return Mask256<T>{_mm256_movepi16_mask(v.raw)};
1031 }
1032 template <typename T>
1033 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256<T> v) {
1034  return Mask256<T>{_mm256_movepi32_mask(v.raw)};
1035 }
1036 template <typename T>
1037 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256<T> v) {
1038  return Mask256<T>{_mm256_movepi64_mask(v.raw)};
1039 }
1040 
1041 } // namespace detail
1042 
1043 template <typename T>
1044 HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
1045  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1046 }
1047 // There do not seem to be native floating-point versions of these instructions.
1048 HWY_API Mask256<float> MaskFromVec(const Vec256<float> v) {
1049  return Mask256<float>{MaskFromVec(BitCast(Full256<int32_t>(), v)).raw};
1050 }
1051 HWY_API Mask256<double> MaskFromVec(const Vec256<double> v) {
1052  return Mask256<double>{MaskFromVec(BitCast(Full256<int64_t>(), v)).raw};
1053 }
1054 
1055 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1056 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1057  return Vec256<T>{_mm256_movm_epi8(v.raw)};
1058 }
1059 
1060 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1061 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1062  return Vec256<T>{_mm256_movm_epi16(v.raw)};
1063 }
1064 
1065 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1066 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1067  return Vec256<T>{_mm256_movm_epi32(v.raw)};
1068 }
1069 
1070 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1071 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1072  return Vec256<T>{_mm256_movm_epi64(v.raw)};
1073 }
1074 
1075 HWY_API Vec256<float> VecFromMask(const Mask256<float> v) {
1076  return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
1077 }
1078 
1079 HWY_API Vec256<double> VecFromMask(const Mask256<double> v) {
1080  return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
1081 }
1082 
1083 template <typename T>
1084 HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
1085  return VecFromMask(v);
1086 }
1087 
1088 #else // AVX2
1089 
1090 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1091 
1092 template <typename TFrom, typename TTo>
1093 HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
1094  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1095  return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
1096 }
1097 
1098 template <typename T>
1099 HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
1100  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1101  return (v & bit) == bit;
1102 }
1103 
1104 // ------------------------------ Equality
1105 
1106 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1107 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1108  return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
1109 }
1110 
1111 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1112 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1113  return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
1114 }
1115 
1116 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1117 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1118  return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
1119 }
1120 
1121 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1122 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1123  return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
1124 }
1125 
1126 HWY_API Mask256<float> operator==(const Vec256<float> a,
1127  const Vec256<float> b) {
1128  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
1129 }
1130 
1131 HWY_API Mask256<double> operator==(const Vec256<double> a,
1132  const Vec256<double> b) {
1133  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
1134 }
1135 
1136 // ------------------------------ Inequality
1137 
1138 template <typename T, HWY_IF_NOT_FLOAT(T)>
1139 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1140  return Not(a == b);
1141 }
1142 
1143 HWY_API Mask256<float> operator!=(const Vec256<float> a,
1144  const Vec256<float> b) {
1145  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
1146 }
1147 HWY_API Mask256<double> operator!=(const Vec256<double> a,
1148  const Vec256<double> b) {
1149  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
1150 }
1151 
1152 // ------------------------------ Strict inequality
1153 
1154 // Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
1155 // to perform an unsigned comparison instead of the intended signed. Workaround
1156 // is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
1157 #if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
1158 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1159 #else
1160 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1161 #endif
1162 
1163 HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
1164 #if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1165  using i8x32 = signed char __attribute__((__vector_size__(32)));
1166  return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
1167  reinterpret_cast<i8x32>(b.raw))};
1168 #else
1169  return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
1170 #endif
1171 }
1172 HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
1173  const Vec256<int16_t> b) {
1174  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
1175 }
1176 HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
1177  const Vec256<int32_t> b) {
1178  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
1179 }
1180 HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
1181  const Vec256<int64_t> b) {
1182  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
1183 }
1184 
1185 template <typename T, HWY_IF_UNSIGNED(T)>
1186 HWY_API Mask256<T> operator>(const Vec256<T> a, const Vec256<T> b) {
1187  const Full256<T> du;
1188  const RebindToSigned<decltype(du)> di;
1189  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1190  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1191 }
1192 
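// Worked example for the MSB-flip trick above (explanatory addition, not
// original text): XOR-ing both operands with the sign-bit constant maps the
// unsigned range onto the signed range while preserving order, so the signed
// compare yields the unsigned result. For uint8_t, a=255 (0xFF) and b=1:
// 0xFF ^ 0x80 = 0x7F (+127) and 0x01 ^ 0x80 = 0x81 (-127) as int8_t, and
// +127 > -127 agrees with 255 > 1.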
1193 HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
1194  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
1195 }
1196 HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
1197  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
1198 }
1199 
1200 // ------------------------------ Weak inequality
1201 
1202 HWY_API Mask256<float> operator>=(const Vec256<float> a,
1203  const Vec256<float> b) {
1204  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
1205 }
1206 HWY_API Mask256<double> operator>=(const Vec256<double> a,
1207  const Vec256<double> b) {
1208  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
1209 }
1210 
1211 #endif // HWY_TARGET <= HWY_AVX3
1212 
1213 // ------------------------------ Reversed comparisons
1214 
1215 template <typename T>
1216 HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
1217  return b > a;
1218 }
1219 
1220 template <typename T>
1221 HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
1222  return b >= a;
1223 }
1224 
1225 // ------------------------------ Min (Gt, IfThenElse)
1226 
1227 // Unsigned
1228 HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1229  return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
1230 }
1231 HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
1232  const Vec256<uint16_t> b) {
1233  return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
1234 }
1235 HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
1236  const Vec256<uint32_t> b) {
1237  return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
1238 }
1239 HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
1240  const Vec256<uint64_t> b) {
1241 #if HWY_TARGET <= HWY_AVX3
1242  return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
1243 #else
1244  const Full256<uint64_t> du;
1245  const Full256<int64_t> di;
1246  const auto msb = Set(du, 1ull << 63);
1247  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1248  return IfThenElse(gt, b, a);
1249 #endif
1250 }
1251 
1252 // Signed
1253 HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1254  return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
1255 }
1256 HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1257  return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
1258 }
1259 HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1260  return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
1261 }
1262 HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1263 #if HWY_TARGET <= HWY_AVX3
1264  return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
1265 #else
1266  return IfThenElse(a < b, a, b);
1267 #endif
1268 }
1269 
1270 // Float
1271 HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
1272  return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
1273 }
1274 HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
1275  return Vec256<double>{_mm256_min_pd(a.raw, b.raw)};
1276 }
1277 
1278 // ------------------------------ Max (Gt, IfThenElse)
1279 
1280 // Unsigned
1281 HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1282  return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
1283 }
1284 HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
1285  const Vec256<uint16_t> b) {
1286  return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
1287 }
1288 HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
1289  const Vec256<uint32_t> b) {
1290  return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
1291 }
1292 HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
1293  const Vec256<uint64_t> b) {
1294 #if HWY_TARGET <= HWY_AVX3
1295  return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
1296 #else
1297  const Full256<uint64_t> du;
1298  const Full256<int64_t> di;
1299  const auto msb = Set(du, 1ull << 63);
1300  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1301  return IfThenElse(gt, a, b);
1302 #endif
1303 }
1304 
1305 // Signed
1306 HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1307  return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
1308 }
1309 HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1310  return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
1311 }
1312 HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1313  return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
1314 }
1315 HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1316 #if HWY_TARGET <= HWY_AVX3
1317  return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
1318 #else
1319  return IfThenElse(a < b, b, a);
1320 #endif
1321 }
1322 
1323 // Float
1324 HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
1325  return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
1326 }
1327 HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
1328  return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
1329 }
1330 
1331 // ------------------------------ FirstN (Iota, Lt)
1332 
1333 template <typename T>
1334 HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
1335 #if HWY_TARGET <= HWY_AVX3
1336  (void)d;
1337  constexpr size_t N = 32 / sizeof(T);
1338 #if HWY_ARCH_X86_64
1339  const uint64_t all = (1ull << N) - 1;
1340  // BZHI only looks at the lower 8 bits of n!
1341  return Mask256<T>::FromBits((n > 255) ? all : _bzhi_u64(all, n));
1342 #else
1343  const uint32_t all = static_cast<uint32_t>((1ull << N) - 1);
1344  // BZHI only looks at the lower 8 bits of n!
1345  return Mask256<T>::FromBits(
1346  (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n)));
1347 #endif // HWY_ARCH_X86_64
1348 #else
1349  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1350  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
1351 #endif
1352 }
1353 
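// Example (illustrative sketch, not part of the original header; assumes it
// runs inside a function compiled for this target): FirstN is typically used
// for loop remainders, keeping only the first `count` lanes.
//
//   const Full256<float> d;
//   const Vec256<float> v = Set(d, 1.0f);
//   const size_t count = 3;
//   const Mask256<float> m = FirstN(d, count);
//   const Vec256<float> masked = IfThenElseZero(m, v);  // lanes 3..7 are 0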
1354 // ================================================== ARITHMETIC
1355 
1356 // ------------------------------ Addition
1357 
1358 // Unsigned
1359 HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
1360  const Vec256<uint8_t> b) {
1361  return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
1362 }
1363 HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
1364  const Vec256<uint16_t> b) {
1365  return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
1366 }
1367 HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
1368  const Vec256<uint32_t> b) {
1369  return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
1370 }
1371 HWY_API Vec256<uint64_t> operator+(const Vec256<uint64_t> a,
1372  const Vec256<uint64_t> b) {
1373  return Vec256<uint64_t>{_mm256_add_epi64(a.raw, b.raw)};
1374 }
1375 
1376 // Signed
1377 HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
1378  const Vec256<int8_t> b) {
1379  return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
1380 }
1381 HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
1382  const Vec256<int16_t> b) {
1383  return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
1384 }
1385 HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
1386  const Vec256<int32_t> b) {
1387  return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
1388 }
1389 HWY_API Vec256<int64_t> operator+(const Vec256<int64_t> a,
1390  const Vec256<int64_t> b) {
1391  return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)};
1392 }
1393 
1394 // Float
1395 HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
1396  return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
1397 }
1398 HWY_API Vec256<double> operator+(const Vec256<double> a,
1399  const Vec256<double> b) {
1400  return Vec256<double>{_mm256_add_pd(a.raw, b.raw)};
1401 }
1402 
1403 // ------------------------------ Subtraction
1404 
1405 // Unsigned
1406 HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
1407  const Vec256<uint8_t> b) {
1408  return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1409 }
1410 HWY_API Vec256<uint16_t> operator-(const Vec256<uint16_t> a,
1411  const Vec256<uint16_t> b) {
1412  return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1413 }
1414 HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
1415  const Vec256<uint32_t> b) {
1416  return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1417 }
1418 HWY_API Vec256<uint64_t> operator-(const Vec256<uint64_t> a,
1419  const Vec256<uint64_t> b) {
1420  return Vec256<uint64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1421 }
1422 
1423 // Signed
1424 HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
1425  const Vec256<int8_t> b) {
1426  return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1427 }
1428 HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
1429  const Vec256<int16_t> b) {
1430  return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1431 }
1432 HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
1433  const Vec256<int32_t> b) {
1434  return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1435 }
1436 HWY_API Vec256<int64_t> operator-(const Vec256<int64_t> a,
1437  const Vec256<int64_t> b) {
1438  return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1439 }
1440 
1441 // Float
1442 HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
1443  return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
1444 }
1445 HWY_API Vec256<double> operator-(const Vec256<double> a,
1446  const Vec256<double> b) {
1447  return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
1448 }
1449 
1450 // ------------------------------ SumsOf8
1451 HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
1452  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
1453 }
1454 
1455 // ------------------------------ SaturatedAdd
1456 
1457 // Returns a + b clamped to the destination range.
1458 
1459 // Unsigned
1460 HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
1461  const Vec256<uint8_t> b) {
1462  return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
1463 }
1464 HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
1465  const Vec256<uint16_t> b) {
1466  return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
1467 }
1468 
1469 // Signed
1470 HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
1471  const Vec256<int8_t> b) {
1472  return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
1473 }
1474 HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
1475  const Vec256<int16_t> b) {
1476  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
1477 }
1478 
1479 // ------------------------------ SaturatedSub
1480 
1481 // Returns a - b clamped to the destination range.
1482 
1483 // Unsigned
1484 HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
1485  const Vec256<uint8_t> b) {
1486  return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
1487 }
1488 HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
1489  const Vec256<uint16_t> b) {
1490  return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
1491 }
1492 
1493 // Signed
1494 HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
1495  const Vec256<int8_t> b) {
1496  return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
1497 }
1498 HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
1499  const Vec256<int16_t> b) {
1500  return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
1501 }
1502 
1503 // ------------------------------ Average
1504 
1505 // Returns (a + b + 1) / 2
1506 
1507 // Unsigned
1508 HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
1509  const Vec256<uint8_t> b) {
1510  return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
1511 }
1512 HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
1513  const Vec256<uint16_t> b) {
1514  return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
1515 }
1516 
1517 // ------------------------------ Abs (Sub)
1518 
1519 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1520 HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
1521 #if HWY_COMPILER_MSVC
1522  // Workaround for incorrect codegen? (wrong result)
1523  const auto zero = Zero(Full256<int8_t>());
1524  return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
1525 #else
1526  return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
1527 #endif
1528 }
1529 HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
1530  return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
1531 }
1532 HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
1533  return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
1534 }
1535 // i64 is implemented after BroadcastSignBit.
1536 
1537 HWY_API Vec256<float> Abs(const Vec256<float> v) {
1538  const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
1539  return v & BitCast(Full256<float>(), mask);
1540 }
1541 HWY_API Vec256<double> Abs(const Vec256<double> v) {
1542  const Vec256<int64_t> mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
1543  return v & BitCast(Full256<double>(), mask);
1544 }
1545 
1546 // ------------------------------ Integer multiplication
1547 
1548 // Unsigned
1549 HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1550  return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1551 }
1552 HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1553  return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1554 }
1555 
1556 // Signed
1557 HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
1558  return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1559 }
1560 HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
1561  return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1562 }
1563 
1564 // Returns the upper 16 bits of a * b in each lane.
1565 HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1566  return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
1567 }
1568 HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
1569  return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
1570 }
1571 
1572 HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
1573  return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};
1574 }
1575 
1576 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
1577 // even and the upper half into its odd neighbor lane.
1578 HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
1579  return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
1580 }
1581 HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1582  return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
1583 }
1584 
1585 // ------------------------------ ShiftLeft
1586 
1587 template <int kBits>
1588 HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
1589  return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
1590 }
1591 
1592 template <int kBits>
1593 HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
1594  return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
1595 }
1596 
1597 template <int kBits>
1598 HWY_API Vec256<uint64_t> ShiftLeft(const Vec256<uint64_t> v) {
1599  return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)};
1600 }
1601 
1602 template <int kBits>
1603 HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
1604  return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
1605 }
1606 
1607 template <int kBits>
1608 HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
1609  return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
1610 }
1611 
1612 template <int kBits>
1613 HWY_API Vec256<int64_t> ShiftLeft(const Vec256<int64_t> v) {
1614  return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
1615 }
1616 
1617 template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
1618 HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
1619  const Full256<T> d8;
1620  const RepartitionToWide<decltype(d8)> d16;
1621  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
1622  return kBits == 1
1623  ? (v + v)
1624  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1625 }
1626 
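// Worked example for the 8-bit shift above (explanatory addition, not
// original text): x86 has no 8-bit shift, so lanes are shifted as 16-bit
// pairs and then masked. For kBits=2, a byte 0xC1 becomes 0x04 within its
// 16-bit pair while its top two bits spill into the low bits of the
// neighboring byte; AND-ing every byte with (0xFF << 2) & 0xFF = 0xFC clears
// exactly those spilled bits, giving the same result as an independent
// per-byte shift. kBits=1 instead uses v + v, which cannot spill.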
1627 // ------------------------------ ShiftRight
1628 
1629 template <int kBits>
1630 HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
1631  return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
1632 }
1633 
1634 template <int kBits>
1635 HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
1636  return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
1637 }
1638 
1639 template <int kBits>
1640 HWY_API Vec256<uint64_t> ShiftRight(const Vec256<uint64_t> v) {
1641  return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)};
1642 }
1643 
1644 template <int kBits>
1645 HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
1646  const Full256<uint8_t> d8;
1647  // Use raw instead of BitCast to support N=1.
1648  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
1649  return shifted & Set(d8, 0xFF >> kBits);
1650 }
1651 
1652 template <int kBits>
1653 HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
1654  return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
1655 }
1656 
1657 template <int kBits>
1658 HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
1659  return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
1660 }
1661 
1662 template <int kBits>
1663 HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
1664  const Full256<int8_t> di;
1665  const Full256<uint8_t> du;
1666  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1667  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1668  return (shifted ^ shifted_sign) - shifted_sign;
1669 }
1670 
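// Worked example for the signed 8-bit shift above (explanatory addition, not
// original text): lanes are shifted as unsigned, then the sign is restored
// via (x ^ s) - s with s = 0x80 >> kBits. For kBits=2 and lane -128 (0x80),
// the unsigned shift gives 0x20; 0x20 ^ 0x20 = 0x00 and 0x00 - 0x20 = 0xE0,
// i.e. -32, the arithmetic-shift result. For non-negative lanes that bit is
// clear, so the XOR sets it and the subtraction removes it again, leaving the
// plain logical shift.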
1671 // i64 is implemented after BroadcastSignBit.
1672 
1673 // ------------------------------ RotateRight
1674 
1675 template <int kBits>
1676 HWY_API Vec256<uint32_t> RotateRight(const Vec256<uint32_t> v) {
1677  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1678 #if HWY_TARGET <= HWY_AVX3
1679  return Vec256<uint32_t>{_mm256_ror_epi32(v.raw, kBits)};
1680 #else
1681  if (kBits == 0) return v;
1682  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
1683 #endif
1684 }
1685 
1686 template <int kBits>
1687 HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
1688  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1689 #if HWY_TARGET <= HWY_AVX3
1690  return Vec256<uint64_t>{_mm256_ror_epi64(v.raw, kBits)};
1691 #else
1692  if (kBits == 0) return v;
1693  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
1694 #endif
1695 }
1696 
1697 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1698 
1699 HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
1700  return VecFromMask(v < Zero(Full256<int8_t>()));
1701 }
1702 
1703 HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
1704  return ShiftRight<15>(v);
1705 }
1706 
1707 HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
1708  return ShiftRight<31>(v);
1709 }
1710 
1711 HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
1712 #if HWY_TARGET == HWY_AVX2
1713  return VecFromMask(v < Zero(Full256<int64_t>()));
1714 #else
1715  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
1716 #endif
1717 }
1718 
1719 template <int kBits>
1720 HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
1721 #if HWY_TARGET <= HWY_AVX3
1722  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, kBits)};
1723 #else
1724  const Full256<int64_t> di;
1725  const Full256<uint64_t> du;
1726  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1727  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
1728  return right | sign;
1729 #endif
1730 }
1731 
1732 HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
1733 #if HWY_TARGET <= HWY_AVX3
1734  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
1735 #else
1736  const auto zero = Zero(Full256<int64_t>());
1737  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1738 #endif
1739 }
1740 
1741 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1742 HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
1743  Vec256<int8_t> no) {
1744  // int8: AVX2 IfThenElse only looks at the MSB.
1745  return IfThenElse(MaskFromVec(v), yes, no);
1746 }
1747 
1748 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1749 HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1750  static_assert(IsSigned<T>(), "Only works for signed/float");
1751  const Full256<T> d;
1752  const RebindToSigned<decltype(d)> di;
1753 
1754  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
1755  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
1756  return IfThenElse(MaskFromVec(v), yes, no);
1757 }
1758 
1759 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
1760 HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1761  static_assert(IsSigned<T>(), "Only works for signed/float");
1762  const Full256<T> d;
1763  const RebindToFloat<decltype(d)> df;
1764 
1765  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
1766  const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
1767  return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
1768 }
1769 
1770 // ------------------------------ ShiftLeftSame
1771 
1772 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
1773  const int bits) {
1774  return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1775 }
1776 HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
1777  const int bits) {
1778  return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1779 }
1780 HWY_API Vec256<uint64_t> ShiftLeftSame(const Vec256<uint64_t> v,
1781  const int bits) {
1782  return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1783 }
1784 
1785 HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
1786  return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1787 }
1788 
1789 HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
1790  return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1791 }
1792 
1793 HWY_API Vec256<int64_t> ShiftLeftSame(const Vec256<int64_t> v, const int bits) {
1794  return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1795 }
1796 
1797 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1798 HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
1799  const Full256<T> d8;
1800  const RepartitionToWide<decltype(d8)> d16;
1801  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
1802  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
1803 }
1804 
1805 // ------------------------------ ShiftRightSame (BroadcastSignBit)
1806 
1807 HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
1808  const int bits) {
1809  return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1810 }
1811 HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
1812  const int bits) {
1813  return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1814 }
1815 HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
1816  const int bits) {
1817  return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1818 }
1819 
1820 HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
1821  const Full256<uint8_t> d8;
1822  const RepartitionToWide<decltype(d8)> d16;
1823  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
1824  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
1825 }
1826 
1827 HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
1828  const int bits) {
1829  return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1830 }
1831 
1832 HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
1833  const int bits) {
1834  return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1835 }
1836 HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
1837  const int bits) {
1838 #if HWY_TARGET <= HWY_AVX3
1839  return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1840 #else
1841  const Full256<int64_t> di;
1842  const Full256<uint64_t> du;
1843  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1844  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
1845  return right | sign;
1846 #endif
1847 }
1848 
1849 HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
1850  const Full256<int8_t> di;
1851  const Full256<uint8_t> du;
1852  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1853  const auto shifted_sign =
1854  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1855  return (shifted ^ shifted_sign) - shifted_sign;
1856 }
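
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). The XOR/subtract trick above sign-extends the
// logically shifted value: for a lane holding -128 (0x80) and bits == 2, the
// logical shift yields 0x20, XOR with 0x80 >> 2 == 0x20 clears it, and
// subtracting 0x20 gives -32, the arithmetic-shift result.
inline Vec256<int8_t> ExampleArithmeticShiftRightI8(Vec256<int8_t> v,
                                                    int bits) {
  return ShiftRightSame(v, bits);  // e.g. lane -128 with bits == 2 -> -32
}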
1857 
1858 // ------------------------------ Neg (Xor, Sub)
1859 
1860 template <typename T, HWY_IF_FLOAT(T)>
1861 HWY_API Vec256<T> Neg(const Vec256<T> v) {
1862  return Xor(v, SignBit(Full256<T>()));
1863 }
1864 
1865 template <typename T, HWY_IF_NOT_FLOAT(T)>
1866 HWY_API Vec256<T> Neg(const Vec256<T> v) {
1867  return Zero(Full256<T>()) - v;
1868 }
1869 
1870 // ------------------------------ Floating-point mul / div
1871 
1872 HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
1873  return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
1874 }
1875 HWY_API Vec256<double> operator*(const Vec256<double> a,
1876  const Vec256<double> b) {
1877  return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
1878 }
1879 
1880 HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
1881  return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
1882 }
1883 HWY_API Vec256<double> operator/(const Vec256<double> a,
1884  const Vec256<double> b) {
1885  return Vec256<double>{_mm256_div_pd(a.raw, b.raw)};
1886 }
1887 
1888 // Approximate reciprocal
1889 HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
1890  return Vec256<float>{_mm256_rcp_ps(v.raw)};
1891 }
1892 
1893 // Absolute value of difference.
1894 HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
1895  return Abs(a - b);
1896 }
1897 
1898 // ------------------------------ Floating-point multiply-add variants
1899 
1900 // Returns mul * x + add
1901 HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
1902  const Vec256<float> add) {
1903 #ifdef HWY_DISABLE_BMI2_FMA
1904  return mul * x + add;
1905 #else
1906  return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
1907 #endif
1908 }
1909 HWY_API Vec256<double> MulAdd(const Vec256<double> mul, const Vec256<double> x,
1910  const Vec256<double> add) {
1911 #ifdef HWY_DISABLE_BMI2_FMA
1912  return mul * x + add;
1913 #else
1914  return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
1915 #endif
1916 }
1917 
1918 // Returns add - mul * x
1919 HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
1920  const Vec256<float> add) {
1921 #ifdef HWY_DISABLE_BMI2_FMA
1922  return add - mul * x;
1923 #else
1924  return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
1925 #endif
1926 }
1927 HWY_API Vec256<double> NegMulAdd(const Vec256<double> mul,
1928  const Vec256<double> x,
1929  const Vec256<double> add) {
1930 #ifdef HWY_DISABLE_BMI2_FMA
1931  return add - mul * x;
1932 #else
1933  return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
1934 #endif
1935 }
1936 
1937 // Returns mul * x - sub
1938 HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
1939  const Vec256<float> sub) {
1940 #ifdef HWY_DISABLE_BMI2_FMA
1941  return mul * x - sub;
1942 #else
1943  return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
1944 #endif
1945 }
1946 HWY_API Vec256<double> MulSub(const Vec256<double> mul, const Vec256<double> x,
1947  const Vec256<double> sub) {
1948 #ifdef HWY_DISABLE_BMI2_FMA
1949  return mul * x - sub;
1950 #else
1951  return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
1952 #endif
1953 }
1954 
1955 // Returns -mul * x - sub
1956 HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
1957  const Vec256<float> sub) {
1958 #ifdef HWY_DISABLE_BMI2_FMA
1959  return Neg(mul * x) - sub;
1960 #else
1961  return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1962 #endif
1963 }
1964 HWY_API Vec256<double> NegMulSub(const Vec256<double> mul,
1965  const Vec256<double> x,
1966  const Vec256<double> sub) {
1967 #ifdef HWY_DISABLE_BMI2_FMA
1968  return Neg(mul * x) - sub;
1969 #else
1970  return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1971 #endif
1972 }
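
// Usage sketch (illustrative only, not part of the original header; names and
// coefficients are placeholders). MulAdd lowers to a single FMA when
// HWY_DISABLE_BMI2_FMA is not defined, so one Horner step per coefficient
// evaluates a polynomial: here c2*x^2 + c1*x + c0.
inline Vec256<float> ExampleHornerStep(Vec256<float> x) {
  const Full256<float> d;
  const Vec256<float> c2 = Set(d, 0.5f);
  const Vec256<float> c1 = Set(d, 1.0f);
  const Vec256<float> c0 = Set(d, 2.0f);
  return MulAdd(MulAdd(c2, x, c1), x, c0);
}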
1973 
1974 // ------------------------------ Floating-point square root
1975 
1976 // Full precision square root
1977 HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
1978  return Vec256<float>{_mm256_sqrt_ps(v.raw)};
1979 }
1980 HWY_API Vec256<double> Sqrt(const Vec256<double> v) {
1981  return Vec256<double>{_mm256_sqrt_pd(v.raw)};
1982 }
1983 
1984 // Approximate reciprocal square root
1985 HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
1986  return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
1987 }
1988 
1989 // ------------------------------ Floating-point rounding
1990 
1991 // Toward nearest integer, tie to even
1992 HWY_API Vec256<float> Round(const Vec256<float> v) {
1993  return Vec256<float>{
1994  _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1995 }
1996 HWY_API Vec256<double> Round(const Vec256<double> v) {
1997  return Vec256<double>{
1998  _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1999 }
2000 
2001 // Toward zero, aka truncate
2002 HWY_API Vec256<float> Trunc(const Vec256<float> v) {
2003  return Vec256<float>{
2004  _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2005 }
2006 HWY_API Vec256<double> Trunc(const Vec256<double> v) {
2007  return Vec256<double>{
2008  _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2009 }
2010 
2011 // Toward +infinity, aka ceiling
2012 HWY_API Vec256<float> Ceil(const Vec256<float> v) {
2013  return Vec256<float>{
2014  _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2015 }
2016 HWY_API Vec256<double> Ceil(const Vec256<double> v) {
2017  return Vec256<double>{
2018  _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2019 }
2020 
2021 // Toward -infinity, aka floor
2022 HWY_API Vec256<float> Floor(const Vec256<float> v) {
2023  return Vec256<float>{
2024  _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2025 }
2026 HWY_API Vec256<double> Floor(const Vec256<double> v) {
2027  return Vec256<double>{
2028  _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2029 }
2030 
2031 // ------------------------------ Floating-point classification
2032 
2033 HWY_API Mask256<float> IsNaN(const Vec256<float> v) {
2034 #if HWY_TARGET <= HWY_AVX3
2035  return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x81)};
2036 #else
2037  return Mask256<float>{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)};
2038 #endif
2039 }
2040 HWY_API Mask256<double> IsNaN(const Vec256<double> v) {
2041 #if HWY_TARGET <= HWY_AVX3
2042  return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x81)};
2043 #else
2044  return Mask256<double>{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)};
2045 #endif
2046 }
2047 
2048 #if HWY_TARGET <= HWY_AVX3
2049 
2050 HWY_API Mask256<float> IsInf(const Vec256<float> v) {
2051  return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x18)};
2052 }
2053 HWY_API Mask256<double> IsInf(const Vec256<double> v) {
2054  return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x18)};
2055 }
2056 
2057 HWY_API Mask256<float> IsFinite(const Vec256<float> v) {
2058  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
2059  // and negate the mask.
2060  return Not(Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x99)});
2061 }
2062 HWY_API Mask256<double> IsFinite(const Vec256<double> v) {
2063  return Not(Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x99)});
2064 }
2065 
2066 #else
2067 
2068 template <typename T, HWY_IF_FLOAT(T)>
2069 HWY_API Mask256<T> IsInf(const Vec256<T> v) {
2070  const Full256<T> d;
2071  const RebindToSigned<decltype(d)> di;
2072  const VFromD<decltype(di)> vi = BitCast(di, v);
2073  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
2074  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
2075 }
2076 
2077 // Returns whether normal/subnormal/zero.
2078 template <typename T, HWY_IF_FLOAT(T)>
2079 HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
2080  const Full256<T> d;
2081  const RebindToUnsigned<decltype(d)> du;
2082  const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
2083  const VFromD<decltype(du)> vu = BitCast(du, v);
2084  // Shift left to clear the sign bit, then right so we can compare with the
2085  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
2086  // negative and non-negative floats would be greater). MSVC seems to generate
2087  // incorrect code if we instead add vu + vu.
2088  const VFromD<decltype(di)> exp =
2089  BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
2090  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
2091 }
2092 
2093 #endif // HWY_TARGET <= HWY_AVX3
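
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). IsFinite pairs naturally with IfThenElseZero (defined
// earlier in this file) to replace NaN and +/-infinity lanes with zero.
inline Vec256<float> ExampleZeroNonFinite(Vec256<float> v) {
  return IfThenElseZero(IsFinite(v), v);
}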
2094 
2095 // ================================================== MEMORY
2096 
2097 // ------------------------------ Load
2098 
2099 template <typename T>
2100 HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
2101  return Vec256<T>{
2102  _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
2103 }
2104 HWY_API Vec256<float> Load(Full256<float> /* tag */,
2105  const float* HWY_RESTRICT aligned) {
2106  return Vec256<float>{_mm256_load_ps(aligned)};
2107 }
2108 HWY_API Vec256<double> Load(Full256<double> /* tag */,
2109  const double* HWY_RESTRICT aligned) {
2110  return Vec256<double>{_mm256_load_pd(aligned)};
2111 }
2112 
2113 template <typename T>
2114 HWY_API Vec256<T> LoadU(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2115  return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
2116 }
2117 HWY_API Vec256<float> LoadU(Full256<float> /* tag */,
2118  const float* HWY_RESTRICT p) {
2119  return Vec256<float>{_mm256_loadu_ps(p)};
2120 }
2121 HWY_API Vec256<double> LoadU(Full256<double> /* tag */,
2122  const double* HWY_RESTRICT p) {
2123  return Vec256<double>{_mm256_loadu_pd(p)};
2124 }
2125 
2126 // ------------------------------ MaskedLoad
2127 
2128 #if HWY_TARGET <= HWY_AVX3
2129 
2130 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2131 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2132  const T* HWY_RESTRICT p) {
2133  return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, p)};
2134 }
2135 
2136 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2137 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2138  const T* HWY_RESTRICT p) {
2139  return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
2140 }
2141 
2142 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2143 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2144  const T* HWY_RESTRICT p) {
2145  return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
2146 }
2147 
2148 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2149 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2150  const T* HWY_RESTRICT p) {
2151  return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};
2152 }
2153 
2154 HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> /* tag */,
2155  const float* HWY_RESTRICT p) {
2156  return Vec256<float>{_mm256_maskz_loadu_ps(m.raw, p)};
2157 }
2158 
2159 HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> /* tag */,
2160  const double* HWY_RESTRICT p) {
2161  return Vec256<double>{_mm256_maskz_loadu_pd(m.raw, p)};
2162 }
2163 
2164 #else // AVX2
2165 
2166 // There is no maskload_epi8/16, so blend instead.
2167 template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2168 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
2169  const T* HWY_RESTRICT p) {
2170  return IfThenElseZero(m, LoadU(d, p));
2171 }
2172 
2173 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2174 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2175  const T* HWY_RESTRICT p) {
2176  auto pi = reinterpret_cast<const int*>(p); // NOLINT
2177  return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
2178 }
2179 
2180 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2181 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2182  const T* HWY_RESTRICT p) {
2183  auto pi = reinterpret_cast<const long long*>(p); // NOLINT
2184  return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
2185 }
2186 
2187 HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> d,
2188  const float* HWY_RESTRICT p) {
2189  const Vec256<int32_t> mi =
2190  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2191  return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
2192 }
2193 
2194 HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> d,
2195  const double* HWY_RESTRICT p) {
2196  const Vec256<int64_t> mi =
2197  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2198  return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
2199 }
2200 
2201 #endif
2202 
2203 // ------------------------------ LoadDup128
2204 
2205 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
2206 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
2207 template <typename T>
2208 HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2209 #if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
2210  // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
2211  // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
2212  // upper half undefined) is fine because we're overwriting that anyway.
2213  // This workaround seems in turn to generate incorrect code in MSVC 2022
2214  // (19.31), so use broadcastsi128 there.
2215  const __m128i v128 = LoadU(Full128<T>(), p).raw;
2216  return Vec256<T>{
2217  _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
2218 #else
2219  return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
2220 #endif
2221 }
2222 HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
2223  const float* const HWY_RESTRICT p) {
2224 #if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
2225  const __m128 v128 = LoadU(Full128<float>(), p).raw;
2226  return Vec256<float>{
2227  _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
2228 #else
2229  return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
2230 #endif
2231 }
2232 HWY_API Vec256<double> LoadDup128(Full256<double> /* tag */,
2233  const double* const HWY_RESTRICT p) {
2234 #if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
2235  const __m128d v128 = LoadU(Full128<double>(), p).raw;
2236  return Vec256<double>{
2237  _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
2238 #else
2239  return Vec256<double>{
2240  _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
2241 #endif
2242 }
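
// Usage sketch (illustrative only, not part of the original header; the helper
// name and constants are hypothetical). A 4-lane pattern such as per-channel
// RGBA gains is loaded once and repeated in both 128-bit halves.
inline Vec256<float> ExampleBroadcastRgbaGains() {
  alignas(16) static constexpr float kGains[4] = {1.0f, 0.5f, 0.25f, 1.0f};
  return LoadDup128(Full256<float>(), kGains);
}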
2243 
2244 // ------------------------------ Store
2245 
2246 template <typename T>
2247 HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
2248  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2249 }
2250 HWY_API void Store(const Vec256<float> v, Full256<float> /* tag */,
2251  float* HWY_RESTRICT aligned) {
2252  _mm256_store_ps(aligned, v.raw);
2253 }
2254 HWY_API void Store(const Vec256<double> v, Full256<double> /* tag */,
2255  double* HWY_RESTRICT aligned) {
2256  _mm256_store_pd(aligned, v.raw);
2257 }
2258 
2259 template <typename T>
2260 HWY_API void StoreU(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT p) {
2261  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
2262 }
2263 HWY_API void StoreU(const Vec256<float> v, Full256<float> /* tag */,
2264  float* HWY_RESTRICT p) {
2265  _mm256_storeu_ps(p, v.raw);
2266 }
2267 HWY_API void StoreU(const Vec256<double> v, Full256<double> /* tag */,
2268  double* HWY_RESTRICT p) {
2269  _mm256_storeu_pd(p, v.raw);
2270 }
2271 
2272 // ------------------------------ BlendedStore
2273 
2274 #if HWY_TARGET <= HWY_AVX3
2275 
2276 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2277 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2278  T* HWY_RESTRICT p) {
2279  _mm256_mask_storeu_epi8(p, m.raw, v.raw);
2280 }
2281 
2282 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2283 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2284  T* HWY_RESTRICT p) {
2285  _mm256_mask_storeu_epi16(p, m.raw, v.raw);
2286 }
2287 
2288 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2289 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2290  T* HWY_RESTRICT p) {
2291  _mm256_mask_storeu_epi32(p, m.raw, v.raw);
2292 }
2293 
2294 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2295 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2296  T* HWY_RESTRICT p) {
2297  _mm256_mask_storeu_epi64(p, m.raw, v.raw);
2298 }
2299 
2300 HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m,
2301  Full256<float> /* tag */, float* HWY_RESTRICT p) {
2302  _mm256_mask_storeu_ps(p, m.raw, v.raw);
2303 }
2304 
2305 HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
2306  Full256<double> /* tag */, double* HWY_RESTRICT p) {
2307  _mm256_mask_storeu_pd(p, m.raw, v.raw);
2308 }
2309 
2310 #else // AVX2
2311 
2312 // Intel SDM says "No AC# reported for any mask bit combinations". However, AMD
2313 // allows AC# if "Alignment checking enabled and: 256-bit memory operand not
2314 // 32-byte aligned". Fortunately AC# is not enabled by default and requires both
2315 // OS support (CR0) and the application to set rflags.AC. We assume these remain
2316 // disabled because x86/x64 code and compiler output often contain misaligned
2317 // scalar accesses, which would also fault.
2318 //
2319 // Caveat: these are slow on AMD Jaguar/Bulldozer.
2320 
2321 template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2322 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2323  T* HWY_RESTRICT p) {
2324  // There is no maskload_epi8/16. Blending is also unsafe because loading a
2325  // full vector that crosses the array end causes asan faults. Resort to scalar
2326  // code; the caller should instead use memcpy, assuming m is FirstN(d, n).
2327  const RebindToUnsigned<decltype(d)> du;
2328  using TU = TFromD<decltype(du)>;
2329  alignas(32) TU buf[32 / sizeof(T)];
2330  alignas(32) TU mask[32 / sizeof(T)];
2331  Store(BitCast(du, v), du, buf);
2332  Store(BitCast(du, VecFromMask(d, m)), du, mask);
2333  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
2334  if (mask[i]) {
2335  CopyBytes<sizeof(T)>(buf + i, p + i);
2336  }
2337  }
2338 }
2339 
2340 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2341 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2342  T* HWY_RESTRICT p) {
2343  auto pi = reinterpret_cast<int*>(p); // NOLINT
2344  _mm256_maskstore_epi32(pi, m.raw, v.raw);
2345 }
2346 
2347 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2348 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2349  T* HWY_RESTRICT p) {
2350  auto pi = reinterpret_cast<long long*>(p); // NOLINT
2351  _mm256_maskstore_epi64(pi, m.raw, v.raw);
2352 }
2353 
2354 HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, Full256<float> d,
2355  float* HWY_RESTRICT p) {
2356  const Vec256<int32_t> mi =
2357  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2358  _mm256_maskstore_ps(p, mi.raw, v.raw);
2359 }
2360 
2361 HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
2362  Full256<double> d, double* HWY_RESTRICT p) {
2363  const Vec256<int64_t> mi =
2364  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2365  _mm256_maskstore_pd(p, mi.raw, v.raw);
2366 }
2367 
2368 #endif
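
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). Storing only the first `count` lanes via FirstN
// (defined earlier in this file) avoids writing past the end of an array.
inline void ExampleStoreTail(Vec256<float> v, size_t count,
                             float* HWY_RESTRICT out) {
  const Full256<float> d;
  BlendedStore(v, FirstN(d, count), d, out);
}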
2369 
2370 // ------------------------------ Non-temporal stores
2371 
2372 template <typename T>
2373 HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
2374  T* HWY_RESTRICT aligned) {
2375  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2376 }
2377 HWY_API void Stream(const Vec256<float> v, Full256<float> /* tag */,
2378  float* HWY_RESTRICT aligned) {
2379  _mm256_stream_ps(aligned, v.raw);
2380 }
2381 HWY_API void Stream(const Vec256<double> v, Full256<double> /* tag */,
2382  double* HWY_RESTRICT aligned) {
2383  _mm256_stream_pd(aligned, v.raw);
2384 }
2385 
2386 // ------------------------------ Scatter
2387 
2388 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2389 HWY_DIAGNOSTICS(push)
2390 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2391 
2392 #if HWY_TARGET <= HWY_AVX3
2393 namespace detail {
2394 
2395 template <typename T>
2396 HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256<T> v,
2397  Full256<T> /* tag */, T* HWY_RESTRICT base,
2398  const Vec256<int32_t> offset) {
2399  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
2400 }
2401 template <typename T>
2402 HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256<T> v,
2403  Full256<T> /* tag */, T* HWY_RESTRICT base,
2404  const Vec256<int32_t> index) {
2405  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
2406 }
2407 
2408 template <typename T>
2409 HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256<T> v,
2410  Full256<T> /* tag */, T* HWY_RESTRICT base,
2411  const Vec256<int64_t> offset) {
2412  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
2413 }
2414 template <typename T>
2415 HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256<T> v,
2416  Full256<T> /* tag */, T* HWY_RESTRICT base,
2417  const Vec256<int64_t> index) {
2418  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
2419 }
2420 
2421 } // namespace detail
2422 
2423 template <typename T, typename Offset>
2424 HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2425  const Vec256<Offset> offset) {
2426  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2427  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2428 }
2429 template <typename T, typename Index>
2430 HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2431  const Vec256<Index> index) {
2432  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2433  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2434 }
2435 
2436 HWY_API void ScatterOffset(Vec256<float> v, Full256<float> /* tag */,
2437  float* HWY_RESTRICT base,
2438  const Vec256<int32_t> offset) {
2439  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
2440 }
2441 HWY_API void ScatterIndex(Vec256<float> v, Full256<float> /* tag */,
2442  float* HWY_RESTRICT base,
2443  const Vec256<int32_t> index) {
2444  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
2445 }
2446 
2447 HWY_API void ScatterOffset(Vec256<double> v, Full256<double> /* tag */,
2448  double* HWY_RESTRICT base,
2449  const Vec256<int64_t> offset) {
2450  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
2451 }
2452 HWY_API void ScatterIndex(Vec256<double> v, Full256<double> /* tag */,
2453  double* HWY_RESTRICT base,
2454  const Vec256<int64_t> index) {
2455  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
2456 }
2457 
2458 #else
2459 
2460 template <typename T, typename Offset>
2461 HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2462  const Vec256<Offset> offset) {
2463  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2464 
2465  constexpr size_t N = 32 / sizeof(T);
2466  alignas(32) T lanes[N];
2467  Store(v, d, lanes);
2468 
2469  alignas(32) Offset offset_lanes[N];
2470  Store(offset, Full256<Offset>(), offset_lanes);
2471 
2472  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2473  for (size_t i = 0; i < N; ++i) {
2474  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2475  }
2476 }
2477 
2478 template <typename T, typename Index>
2479 HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2480  const Vec256<Index> index) {
2481  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2482 
2483  constexpr size_t N = 32 / sizeof(T);
2484  alignas(32) T lanes[N];
2485  Store(v, d, lanes);
2486 
2487  alignas(32) Index index_lanes[N];
2488  Store(index, Full256<Index>(), index_lanes);
2489 
2490  for (size_t i = 0; i < N; ++i) {
2491  base[index_lanes[i]] = lanes[i];
2492  }
2493 }
2494 
2495 #endif
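
// Usage sketch (illustrative only, not part of the original header; the helper
// name and indices are hypothetical). Indices must have the same lane size as
// the data (int32 for float), as enforced by the static_assert above.
inline void ExampleScatterStrided(Vec256<float> v, float* HWY_RESTRICT base) {
  alignas(32) static constexpr int32_t kSlots[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  ScatterIndex(v, Full256<float>(), base, Load(Full256<int32_t>(), kSlots));
}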
2496 
2497 // ------------------------------ Gather
2498 
2499 namespace detail {
2500 
2501 template <typename T>
2502 HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<4> /* tag */,
2503  Full256<T> /* tag */,
2504  const T* HWY_RESTRICT base,
2505  const Vec256<int32_t> offset) {
2506  return Vec256<T>{_mm256_i32gather_epi32(
2507  reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
2508 }
2509 template <typename T>
2510 HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<4> /* tag */,
2511  Full256<T> /* tag */,
2512  const T* HWY_RESTRICT base,
2513  const Vec256<int32_t> index) {
2514  return Vec256<T>{_mm256_i32gather_epi32(
2515  reinterpret_cast<const int32_t*>(base), index.raw, 4)};
2516 }
2517 
2518 template <typename T>
2519 HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<8> /* tag */,
2520  Full256<T> /* tag */,
2521  const T* HWY_RESTRICT base,
2522  const Vec256<int64_t> offset) {
2523  return Vec256<T>{_mm256_i64gather_epi64(
2524  reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
2525 }
2526 template <typename T>
2527 HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<8> /* tag */,
2528  Full256<T> /* tag */,
2529  const T* HWY_RESTRICT base,
2530  const Vec256<int64_t> index) {
2531  return Vec256<T>{_mm256_i64gather_epi64(
2532  reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
2533 }
2534 
2535 } // namespace detail
2536 
2537 template <typename T, typename Offset>
2538 HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
2539  const Vec256<Offset> offset) {
2540  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2541  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2542 }
2543 template <typename T, typename Index>
2544 HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
2545  const Vec256<Index> index) {
2546  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2547  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2548 }
2549 
2550 HWY_API Vec256<float> GatherOffset(Full256<float> /* tag */,
2551  const float* HWY_RESTRICT base,
2552  const Vec256<int32_t> offset) {
2553  return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
2554 }
2555 HWY_API Vec256<float> GatherIndex(Full256<float> /* tag */,
2556  const float* HWY_RESTRICT base,
2557  const Vec256<int32_t> index) {
2558  return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
2559 }
2560 
2561 HWY_API Vec256<double> GatherOffset(Full256<double> /* tag */,
2562  const double* HWY_RESTRICT base,
2563  const Vec256<int64_t> offset) {
2564  return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
2565 }
2566 HWY_API Vec256<double> GatherIndex(Full256<double> /* tag */,
2567  const double* HWY_RESTRICT base,
2568  const Vec256<int64_t> index) {
2569  return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
2570 }
2571 
2572 HWY_DIAGNOSTICS(pop)
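
// Usage sketch (illustrative only, not part of the original header; the helper
// name and indices are hypothetical): the gather counterpart of the scatter
// example above, reading eight floats from every other slot.
inline Vec256<float> ExampleGatherStrided(const float* HWY_RESTRICT base) {
  alignas(32) static constexpr int32_t kSlots[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return GatherIndex(Full256<float>(), base, Load(Full256<int32_t>(), kSlots));
}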
2573 
2574 // ================================================== SWIZZLE
2575 
2576 // ------------------------------ LowerHalf
2577 
2578 template <typename T>
2579 HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
2580  return Vec128<T>{_mm256_castsi256_si128(v.raw)};
2581 }
2582 HWY_API Vec128<float> LowerHalf(Full128<float> /* tag */, Vec256<float> v) {
2583  return Vec128<float>{_mm256_castps256_ps128(v.raw)};
2584 }
2585 HWY_API Vec128<double> LowerHalf(Full128<double> /* tag */, Vec256<double> v) {
2586  return Vec128<double>{_mm256_castpd256_pd128(v.raw)};
2587 }
2588 
2589 template <typename T>
2590 HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
2591  return LowerHalf(Full128<T>(), v);
2592 }
2593 
2594 // ------------------------------ UpperHalf
2595 
2596 template <typename T>
2597 HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, Vec256<T> v) {
2598  return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};
2599 }
2600 HWY_API Vec128<float> UpperHalf(Full128<float> /* tag */, Vec256<float> v) {
2601  return Vec128<float>{_mm256_extractf128_ps(v.raw, 1)};
2602 }
2603 HWY_API Vec128<double> UpperHalf(Full128<double> /* tag */, Vec256<double> v) {
2604  return Vec128<double>{_mm256_extractf128_pd(v.raw, 1)};
2605 }
2606 
2607 // ------------------------------ ExtractLane (Store)
2608 template <typename T>
2609 HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
2610  const Full256<T> d;
2611  HWY_DASSERT(i < Lanes(d));
2612  alignas(32) T lanes[32 / sizeof(T)];
2613  Store(v, d, lanes);
2614  return lanes[i];
2615 }
2616 
2617 // ------------------------------ InsertLane (Store)
2618 template <typename T>
2619 HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
2620  const Full256<T> d;
2621  HWY_DASSERT(i < Lanes(d));
2622  alignas(64) T lanes[64 / sizeof(T)];
2623  Store(v, d, lanes);
2624  lanes[i] = t;
2625  return Load(d, lanes);
2626 }
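
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). Both ops round-trip through a stack buffer, so they
// suit occasional fix-ups rather than inner loops.
inline Vec256<int32_t> ExampleCopyLane0ToLane7(Vec256<int32_t> v) {
  const int32_t first = ExtractLane(v, 0);
  return InsertLane(v, 7, first);
}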
2627 
2628 // ------------------------------ GetLane (LowerHalf)
2629 template <typename T>
2630 HWY_API T GetLane(const Vec256<T> v) {
2631  return GetLane(LowerHalf(v));
2632 }
2633 
2634 // ------------------------------ ZeroExtendVector
2635 
2636 // Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper
2637 // bits undefined. Although it makes sense for them to be zero (VEX encoded
2638 // 128-bit instructions zero the upper lanes to avoid large penalties), a
2639 // compiler could decide to optimize out code that relies on this.
2640 //
2641 // The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
2642 // zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For
2643 // older GCC, we can still obtain the desired code thanks to pattern
2644 // recognition; note that the expensive insert instruction is not actually
2645 // generated, see https://gcc.godbolt.org/z/1MKGaP.
2646 
2647 #if !defined(HWY_HAVE_ZEXT)
2648 #if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \
2649  (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
2650  (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC >= 1000)
2651 #define HWY_HAVE_ZEXT 1
2652 #else
2653 #define HWY_HAVE_ZEXT 0
2654 #endif
2655 #endif // defined(HWY_HAVE_ZEXT)
2656 
2657 template <typename T>
2658 HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
2659 #if HWY_HAVE_ZEXT
2660  return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
2661 #else
2662  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
2663 #endif
2664 }
2665 HWY_API Vec256<float> ZeroExtendVector(Full256<float> /* tag */,
2666  Vec128<float> lo) {
2667 #if HWY_HAVE_ZEXT
2668  return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
2669 #else
2670  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
2671 #endif
2672 }
2673 HWY_API Vec256<double> ZeroExtendVector(Full256<double> /* tag */,
2674  Vec128<double> lo) {
2675 #if HWY_HAVE_ZEXT
2676  return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
2677 #else
2678  return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
2679 #endif
2680 }
2681 
2682 // ------------------------------ Combine
2683 
2684 template <typename T>
2685 HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi, Vec128<T> lo) {
2686  const auto lo256 = ZeroExtendVector(d, lo);
2687  return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
2688 }
2689 HWY_API Vec256<float> Combine(Full256<float> d, Vec128<float> hi,
2690  Vec128<float> lo) {
2691  const auto lo256 = ZeroExtendVector(d, lo);
2692  return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)};
2693 }
2694 HWY_API Vec256<double> Combine(Full256<double> d, Vec128<double> hi,
2695  Vec128<double> lo) {
2696  const auto lo256 = ZeroExtendVector(d, lo);
2697  return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)};
2698 }
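
// Usage sketch (illustrative only, not part of the original header; the helper
// names are hypothetical). ZeroExtendVector guarantees a zero upper half,
// unlike a plain cast, and Combine assembles two independently computed
// 128-bit halves.
inline Vec256<float> ExampleWiden(Vec128<float> half) {
  return ZeroExtendVector(Full256<float>(), half);
}
inline Vec256<float> ExampleJoinHalves(Vec128<float> hi, Vec128<float> lo) {
  return Combine(Full256<float>(), hi, lo);
}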
2699 
2700 // ------------------------------ ShiftLeftBytes
2701 
2702 template <int kBytes, typename T>
2703 HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, const Vec256<T> v) {
2704  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2705  // This is the same operation as _mm256_bslli_epi128.
2706  return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
2707 }
2708 
2709 template <int kBytes, typename T>
2710 HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) {
2711  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
2712 }
2713 
2714 // ------------------------------ ShiftLeftLanes
2715 
2716 template <int kLanes, typename T>
2717 HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
2718  const Repartition<uint8_t, decltype(d)> d8;
2719  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2720 }
2721 
2722 template <int kLanes, typename T>
2723 HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
2724  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
2725 }
2726 
2727 // ------------------------------ ShiftRightBytes
2728 
2729 template <int kBytes, typename T>
2730 HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
2731  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2732  // This is the same operation as _mm256_bsrli_epi128.
2733  return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};
2734 }
2735 
2736 // ------------------------------ ShiftRightLanes
2737 template <int kLanes, typename T>
2738 HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
2739  const Repartition<uint8_t, decltype(d)> d8;
2740  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2741 }
2742 
2743 // ------------------------------ CombineShiftRightBytes
2744 
2745 // Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
2746 template <int kBytes, typename T, class V = Vec256<T>>
2747 HWY_API V CombineShiftRightBytes(Full256<T> d, V hi, V lo) {
2748  const Repartition<uint8_t, decltype(d)> d8;
2749  return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(
2750  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2751 }
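
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). Within each 128-bit block, the result takes the upper
// 12 bytes of `lo` followed by the lower 4 bytes of `hi`, i.e. a byte-wise
// sliding window as with SSE4 palignr.
inline Vec256<uint8_t> ExampleSlideBy4(Vec256<uint8_t> hi, Vec256<uint8_t> lo) {
  return CombineShiftRightBytes<4>(Full256<uint8_t>(), hi, lo);
}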
2752 
2753 // ------------------------------ Broadcast/splat any lane
2754 
2755 // Unsigned
2756 template <int kLane>
2757 HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
2758  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2759  if (kLane < 4) {
2760  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2761  return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
2762  } else {
2763  const __m256i hi =
2764  _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2765  return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
2766  }
2767 }
2768 template <int kLane>
2769 HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
2770  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2771  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2772 }
2773 template <int kLane>
2774 HWY_API Vec256<uint64_t> Broadcast(const Vec256<uint64_t> v) {
2775  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2776  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2777 }
2778 
2779 // Signed
2780 template <int kLane>
2781 HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
2782  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2783  if (kLane < 4) {
2784  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2785  return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
2786  } else {
2787  const __m256i hi =
2788  _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2789  return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
2790  }
2791 }
2792 template <int kLane>
2793 HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
2794  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2795  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2796 }
2797 template <int kLane>
2798 HWY_API Vec256<int64_t> Broadcast(const Vec256<int64_t> v) {
2799  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2800  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2801 }
2802 
2803 // Float
2804 template <int kLane>
2805 HWY_API Vec256<float> Broadcast(Vec256<float> v) {
2806  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2807  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
2808 }
2809 template <int kLane>
2810 HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
2811  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2812  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
2813 }
2814 
2815 // ------------------------------ Hard-coded shuffles
2816 
2817 // Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2818 // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2819 // right (the previous least-significant lane is now most-significant =>
2820 // 47650321). These could also be implemented via CombineShiftRightBytes but
2821 // the shuffle_abcd notation is more convenient.
2822 
2823 // Swap 32-bit halves in 64-bit halves.
2824 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2825 HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) {
2826  return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)};
2827 }
2829  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
2830 }
2831 
2832 namespace detail {
2833 
2834 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2835 HWY_API Vec256<T> Shuffle2301(const Vec256<T> a, const Vec256<T> b) {
2836  const Full256<T> d;
2837  const RebindToFloat<decltype(d)> df;
2838  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
2839  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2840  BitCast(df, b).raw, m)});
2841 }
2842 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2843 HWY_API Vec256<T> Shuffle1230(const Vec256<T> a, const Vec256<T> b) {
2844  const Full256<T> d;
2845  const RebindToFloat<decltype(d)> df;
2846  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
2847  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2848  BitCast(df, b).raw, m)});
2849 }
2850 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2851 HWY_API Vec256<T> Shuffle3012(const Vec256<T> a, const Vec256<T> b) {
2852  const Full256<T> d;
2853  const RebindToFloat<decltype(d)> df;
2854  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
2855  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2856  BitCast(df, b).raw, m)});
2857 }
2858 
2859 } // namespace detail
2860 
2861 // Swap 64-bit halves
2862 HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
2863  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2864 }
2865 HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
2866  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2867 }
2868 HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
2869  // Shorter encoding than _mm256_permute_ps.
2870  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)};
2871 }
2872 HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
2873  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2874 }
2875 HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
2876  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2877 }
2878 HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
2879  // Shorter encoding than _mm256_permute_pd.
2880  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)};
2881 }
2882 
2883 // Rotate right 32 bits
2884 HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
2885  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2886 }
2887 HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
2888  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2889 }
2890 HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
2891  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)};
2892 }
2893 // Rotate left 32 bits
2894 HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
2895  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2896 }
2897 HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
2898  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2899 }
2900 HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
2901  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)};
2902 }
2903 
2904 // Reverse
2905 HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
2906  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2907 }
2908 HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
2909  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2910 }
2911 HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
2912  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)};
2913 }
2914 
2915 // ------------------------------ TableLookupLanes
2916 
2917 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2918 template <typename T>
2919 struct Indices256 {
2920  __m256i raw;
2921 };
2922 
2923 // Native 8x32 instruction: indices remain unchanged
2924 template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 4)>
2925 HWY_API Indices256<T> IndicesFromVec(Full256<T> /* tag */, Vec256<TI> vec) {
2926  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2927 #if HWY_IS_DEBUG_BUILD
2928  const Full256<TI> di;
2929  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2930  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(32 / sizeof(T))))));
2931 #endif
2932  return Indices256<T>{vec.raw};
2933 }
2934 
2935 // 64-bit lanes: convert indices to 8x32 unless AVX3 is available
2936 template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 8)>
2937 HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> idx64) {
2938  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2939  const Rebind<TI, decltype(d)> di;
2940  (void)di; // potentially unused
2941 #if HWY_IS_DEBUG_BUILD
2942  HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) &&
2943  AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(32 / sizeof(T))))));
2944 #endif
2945 
2946 #if HWY_TARGET <= HWY_AVX3
2947  (void)d;
2948  return Indices256<T>{idx64.raw};
2949 #else
2950  const Repartition<float, decltype(d)> df; // 32-bit!
2951  // Replicate 64-bit index into upper 32 bits
2952  const Vec256<TI> dup =
2953  BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
2954  // For each idx64 i, idx32 are 2*i and 2*i+1.
2955  const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
2956  return Indices256<T>{idx32.raw};
2957 #endif
2958 }
2959 
2960 template <typename T, typename TI>
2961 HWY_API Indices256<T> SetTableIndices(const Full256<T> d, const TI* idx) {
2962  const Rebind<TI, decltype(d)> di;
2963  return IndicesFromVec(d, LoadU(di, idx));
2964 }
2965 
2966 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2967 HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2968  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2969 }
2970 
2971 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2972 HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2973 #if HWY_TARGET <= HWY_AVX3
2974  return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
2975 #else
2976  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2977 #endif
2978 }
2979 
2980 HWY_API Vec256<float> TableLookupLanes(const Vec256<float> v,
2981  const Indices256<float> idx) {
2982  return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
2983 }
2984 
2985 HWY_API Vec256<double> TableLookupLanes(const Vec256<double> v,
2986  const Indices256<double> idx) {
2987 #if HWY_TARGET <= HWY_AVX3
2988  return Vec256<double>{_mm256_permutexvar_pd(idx.raw, v.raw)};
2989 #else
2990  const Full256<double> df;
2991  const Full256<uint64_t> du;
2992  return BitCast(df, Vec256<uint64_t>{_mm256_permutevar8x32_epi32(
2993  BitCast(du, v).raw, idx.raw)});
2994 #endif
2995 }
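
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). Unlike the block-local Shuffle* ops above, a table
// lookup can rotate all eight lanes across the 128-bit block boundary.
inline Vec256<float> ExampleRotateLanesDown(Vec256<float> v) {
  const Full256<float> d;
  alignas(32) static constexpr int32_t kRotate[8] = {1, 2, 3, 4, 5, 6, 7, 0};
  return TableLookupLanes(v, SetTableIndices(d, kRotate));
}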
2996 
2997 // ------------------------------ SwapAdjacentBlocks
2998 
2999 template <typename T>
3000 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
3001  return Vec256<T>{_mm256_permute2x128_si256(v.raw, v.raw, 0x01)};
3002 }
3003 
3004 HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) {
3005  return Vec256<float>{_mm256_permute2f128_ps(v.raw, v.raw, 0x01)};
3006 }
3007 
3008 HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
3009  return Vec256<double>{_mm256_permute2f128_pd(v.raw, v.raw, 0x01)};
3010 }
3011 
3012 // ------------------------------ Reverse (RotateRight)
3013 
3014 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3015 HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3016  alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
3017  return TableLookupLanes(v, SetTableIndices(d, kReverse));
3018 }
3019 
3020 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3021 HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3022  alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
3023  return TableLookupLanes(v, SetTableIndices(d, kReverse));
3024 }
3025 
3026 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3027 HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3028 #if HWY_TARGET <= HWY_AVX3
3029  const RebindToSigned<decltype(d)> di;
3030  alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
3031  7, 6, 5, 4, 3, 2, 1, 0};
3032  const Vec256<int16_t> idx = Load(di, kReverse);
3033  return BitCast(d, Vec256<int16_t>{
3034  _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3035 #else
3036  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3037  const Vec256<uint32_t> rev32 = Reverse(du32, BitCast(du32, v));
3038  return BitCast(d, RotateRight<16>(rev32));
3039 #endif
3040 }
3041 
3042 // ------------------------------ Reverse2
3043 
3044 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3045 HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
3046  const Full256<uint32_t> du32;
3047  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
3048 }
3049 
3050 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3051 HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
3052  return Shuffle2301(v);
3053 }
3054 
3055 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3056 HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
3057  return Shuffle01(v);
3058 }
3059 
3060 // ------------------------------ Reverse4 (SwapAdjacentBlocks)
3061 
3062 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3063 HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
3064 #if HWY_TARGET <= HWY_AVX3
3065  const RebindToSigned<decltype(d)> di;
3066  alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
3067  11, 10, 9, 8, 15, 14, 13, 12};
3068  const Vec256<int16_t> idx = Load(di, kReverse4);
3069  return BitCast(d, Vec256<int16_t>{
3070  _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3071 #else
3072  const RepartitionToWide<decltype(d)> dw;
3073  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
3074 #endif
3075 }
3076 
3077 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3078 HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
3079  return Shuffle0123(v);
3080 }
3081 
3082 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3083 HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
3084  // Could also use _mm256_permute4x64_epi64.
3085  return SwapAdjacentBlocks(Shuffle01(v));
3086 }
3087 
3088 // ------------------------------ Reverse8
3089 
3090 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3091 HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
3092 #if HWY_TARGET <= HWY_AVX3
3093  const RebindToSigned<decltype(d)> di;
3094  alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
3095  15, 14, 13, 12, 11, 10, 9, 8};
3096  const Vec256<int16_t> idx = Load(di, kReverse8);
3097  return BitCast(d, Vec256<int16_t>{
3098  _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3099 #else
3100  const RepartitionToWide<decltype(d)> dw;
3101  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
3102 #endif
3103 }
3104 
3105 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3106 HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
3107  return Reverse(d, v);
3108 }
3109 
3110 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3111 HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
3112  HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes
3113 }
3114 
3115 // ------------------------------ InterleaveLower
3116 
3117 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3118 // the least-significant lane) and "b". To concatenate two half-width integers
3119 // into one, use ZipLower/Upper instead (also works with scalar).
3120 
3121 HWY_API Vec256<uint8_t> InterleaveLower(const Vec256<uint8_t> a,
3122  const Vec256<uint8_t> b) {
3123  return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
3124 }
3125 HWY_API Vec256<uint16_t> InterleaveLower(const Vec256<uint16_t> a,
3126  const Vec256<uint16_t> b) {
3127  return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
3128 }
3129 HWY_API Vec256<uint32_t> InterleaveLower(const Vec256<uint32_t> a,
3130  const Vec256<uint32_t> b) {
3131  return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3132 }
3133 HWY_API Vec256<uint64_t> InterleaveLower(const Vec256<uint64_t> a,
3134  const Vec256<uint64_t> b) {
3135  return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3136 }
3137 
3138 HWY_API Vec256<int8_t> InterleaveLower(const Vec256<int8_t> a,
3139  const Vec256<int8_t> b) {
3140  return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
3141 }
3142 HWY_API Vec256<int16_t> InterleaveLower(const Vec256<int16_t> a,
3143  const Vec256<int16_t> b) {
3144  return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
3145 }
3146 HWY_API Vec256<int32_t> InterleaveLower(const Vec256<int32_t> a,
3147  const Vec256<int32_t> b) {
3148  return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3149 }
3150 HWY_API Vec256<int64_t> InterleaveLower(const Vec256<int64_t> a,
3151  const Vec256<int64_t> b) {
3152  return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3153 }
3154 
3155 HWY_API Vec256<float> InterleaveLower(const Vec256<float> a,
3156  const Vec256<float> b) {
3157  return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
3158 }
3159 HWY_API Vec256<double> InterleaveLower(const Vec256<double> a,
3160  const Vec256<double> b) {
3161  return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
3162 }
3163 
3164 // ------------------------------ InterleaveUpper
3165 
3166 // All functions inside detail lack the required D parameter.
3167 namespace detail {
3168 
3169 HWY_API Vec256<uint8_t> InterleaveUpper(const Vec256<uint8_t> a,
3170  const Vec256<uint8_t> b) {
3171  return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3172 }
3173 HWY_API Vec256<uint16_t> InterleaveUpper(const Vec256<uint16_t> a,
3174  const Vec256<uint16_t> b) {
3175  return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3176 }
3177 HWY_API Vec256<uint32_t> InterleaveUpper(const Vec256<uint32_t> a,
3178  const Vec256<uint32_t> b) {
3179  return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3180 }
3181 HWY_API Vec256<uint64_t> InterleaveUpper(const Vec256<uint64_t> a,
3182  const Vec256<uint64_t> b) {
3183  return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3184 }
3185 
3186 HWY_API Vec256<int8_t> InterleaveUpper(const Vec256<int8_t> a,
3187  const Vec256<int8_t> b) {
3188  return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3189 }
3190 HWY_API Vec256<int16_t> InterleaveUpper(const Vec256<int16_t> a,
3191  const Vec256<int16_t> b) {
3192  return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3193 }
3194 HWY_API Vec256<int32_t> InterleaveUpper(const Vec256<int32_t> a,
3195  const Vec256<int32_t> b) {
3196  return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3197 }
3198 HWY_API Vec256<int64_t> InterleaveUpper(const Vec256<int64_t> a,
3199  const Vec256<int64_t> b) {
3200  return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3201 }
3202 
3203 HWY_API Vec256<float> InterleaveUpper(const Vec256<float> a,
3204  const Vec256<float> b) {
3205  return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
3206 }
3207 HWY_API Vec256<double> InterleaveUpper(const Vec256<double> a,
3208  const Vec256<double> b) {
3209  return Vec256<double>{_mm256_unpackhi_pd(a.raw, b.raw)};
3210 }
3211 
3212 } // namespace detail
3213 
3214 template <typename T, class V = Vec256<T>>
3215 HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
3216  return detail::InterleaveUpper(a, b);
3217 }
3218 
3219 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3220 
3221 // Same as Interleave*, except that the return lanes are double-width integers;
3222 // this is necessary because the single-lane scalar cannot return two values.
3223 template <typename T, typename TW = MakeWide<T>>
3224 HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
3225  return BitCast(Full256<TW>(), InterleaveLower(a, b));
3226 }
3227 template <typename T, typename TW = MakeWide<T>>
3228 HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3229  return BitCast(dw, InterleaveLower(a, b));
3230 }
3231 
3232 template <typename T, typename TW = MakeWide<T>>
3233 HWY_API Vec256<TW> ZipUpper(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3234  return BitCast(dw, InterleaveUpper(Full256<T>(), a, b));
3235 }
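
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). ZipLower forms 16-bit lanes whose low byte comes from
// `a` and high byte from `b`, e.g. interleaving two planar channels within
// each 128-bit block.
inline Vec256<uint16_t> ExampleZipBytes(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return ZipLower(a, b);
}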
3236 
3237 // ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
3238 
3239 // _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
3240 // _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
3241 // extra cost) for LowerLower and UpperLower.
3242 
3243 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3244 template <typename T>
3245 HWY_API Vec256<T> ConcatLowerLower(Full256<T> d, const Vec256<T> hi,
3246  const Vec256<T> lo) {
3247  const Half<decltype(d)> d2;
3248  return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
3249 }
3250 HWY_API Vec256<float> ConcatLowerLower(Full256<float> d, const Vec256<float> hi,
3251  const Vec256<float> lo) {
3252  const Half<decltype(d)> d2;
3253  return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
3254 }
3255 HWY_API Vec256<double> ConcatLowerLower(Full256<double> d,
3256  const Vec256<double> hi,
3257  const Vec256<double> lo) {
3258  const Half<decltype(d)> d2;
3259  return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
3260 }
3261 
3262 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
3263 template <typename T>
3264 HWY_API Vec256<T> ConcatLowerUpper(Full256<T> /* tag */, const Vec256<T> hi,
3265  const Vec256<T> lo) {
3266  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
3267 }
3268 HWY_API Vec256<float> ConcatLowerUpper(Full256<float> /* tag */,
3269  const Vec256<float> hi,
3270  const Vec256<float> lo) {
3271  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
3272 }
3273 HWY_API Vec256<double> ConcatLowerUpper(Full256<double> /* tag */,
3274  const Vec256<double> hi,
3275  const Vec256<double> lo) {
3276  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
3277 }
3278 
3279 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3280 template <typename T>
3281 HWY_API Vec256<T> ConcatUpperLower(Full256<T> /* tag */, const Vec256<T> hi,
3282  const Vec256<T> lo) {
3283  return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
3284 }
3285 HWY_API Vec256<float> ConcatUpperLower(Full256<float> /* tag */,
3286  const Vec256<float> hi,
3287  const Vec256<float> lo) {
3288  return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
3289 }
3290 HWY_API Vec256<double> ConcatUpperLower(Full256<double> /* tag */,
3291  const Vec256<double> hi,
3292  const Vec256<double> lo) {
3293  return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
3294 }
3295 
3296 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3297 template <typename T>
3298 HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
3299  const Vec256<T> lo) {
3300  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
3301 }
3302 HWY_API Vec256<float> ConcatUpperUpper(Full256<float> /* tag */,
3303  const Vec256<float> hi,
3304  const Vec256<float> lo) {
3305  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
3306 }
3307 HWY_API Vec256<double> ConcatUpperUpper(Full256<double> /* tag */,
3308  const Vec256<double> hi,
3309  const Vec256<double> lo) {
3310  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
3311 }
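
// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical). Each Concat* selects one 128-bit half from each
// input; ConcatUpperLower is the cheapest since lanes stay in place (a blend).
inline Vec256<float> ExampleKeepOuterHalves(Vec256<float> hi,
                                            Vec256<float> lo) {
  return ConcatUpperLower(Full256<float>(), hi, lo);
}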
3312 
3313 // ------------------------------ ConcatOdd
3314 
3315 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3316 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3317  const RebindToUnsigned<decltype(d)> du;
3318 #if HWY_TARGET == HWY_AVX3_DL
3319  alignas(32) constexpr uint8_t kIdx[32] = {
3320  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
3321  33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
3322  return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi8(
3323  BitCast(du, lo).raw, Load(du, kIdx).raw,
3324  __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
3325 #else
3326  const RepartitionToWide<decltype(du)> dw;
3327  // Unsigned 8-bit shift so we can pack.
3328  const Vec256<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
3329  const Vec256<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
3330  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
3331  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3332 #endif
3333 }
3334 
3335 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3336 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3337  const RebindToUnsigned<decltype(d)> du;
3338 #if HWY_TARGET <= HWY_AVX3
3339  alignas(32) constexpr uint16_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
3340  17, 19, 21, 23, 25, 27, 29, 31};
3341  return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi16(
3342  BitCast(du, lo).raw, Load(du, kIdx).raw,
3343  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
3344 #else
3345  const RepartitionToWide<decltype(du)> dw;
3346  // Unsigned 16-bit shift so we can pack.
3347  const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
3348  const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
3349  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3350  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3351 #endif
3352 }
3353 
3354 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3355 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3356  const RebindToUnsigned<decltype(d)> du;
3357 #if HWY_TARGET <= HWY_AVX3
3358  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3359  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3360  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3361  BitCast(du, hi).raw)});
3362 #else
3363  const RebindToFloat<decltype(d)> df;
3364  const Vec256<float> v3131{_mm256_shuffle_ps(
3365  BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
3366  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
3367  _MM_SHUFFLE(3, 1, 2, 0))};
3368 #endif
3369 }
3370 
3371 HWY_API Vec256<float> ConcatOdd(Full256<float> d, Vec256<float> hi,
3372  Vec256<float> lo) {
3373  const RebindToUnsigned<decltype(d)> du;
3374 #if HWY_TARGET <= HWY_AVX3
3375  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3376  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3377  __mmask8{0xFF}, hi.raw)};
3378 #else
3379  const Vec256<float> v3131{
3380  _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
3381  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3382  BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3383 #endif
3384 }
3385 
3386 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3387 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3388  const RebindToUnsigned<decltype(d)> du;
3389 #if HWY_TARGET <= HWY_AVX3
3390  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3391  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3392  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3393  BitCast(du, hi).raw)});
3394 #else
3395  const RebindToFloat<decltype(d)> df;
3396  const Vec256<double> v31{
3397  _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
3398  return Vec256<T>{
3399  _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3400 #endif
3401 }
3402 
3403 HWY_API Vec256<double> ConcatOdd(Full256<double> d, Vec256<double> hi,
3404  Vec256<double> lo) {
3405 #if HWY_TARGET <= HWY_AVX3
3406  const RebindToUnsigned<decltype(d)> du;
3407  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3408  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3409  __mmask8{0xFF}, hi.raw)};
3410 #else
3411  (void)d;
3412  const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
3413  return Vec256<double>{
3414  _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3415 #endif
3416 }
3417 
3418 // ------------------------------ ConcatEven
3419 
3420 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3421 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3422  const RebindToUnsigned<decltype(d)> du;
3423 #if HWY_TARGET == HWY_AVX3_DL
3424  alignas(64) constexpr uint8_t kIdx[32] = {
3425  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3426  32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
3427  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi8(
3428  BitCast(du, lo).raw, Load(du, kIdx).raw,
3429  __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
3430 #else
3431  const RepartitionToWide<decltype(du)> dw;
3432  // Isolate lower 8 bits per u16 so we can pack.
3433  const Vec256<uint16_t> mask = Set(dw, 0x00FF);
3434  const Vec256<uint16_t> uH = And(BitCast(dw, hi), mask);
3435  const Vec256<uint16_t> uL = And(BitCast(dw, lo), mask);
3436  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
3437  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3438 #endif
3439 }
3440 
3441 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3442 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3443  const RebindToUnsigned<decltype(d)> du;
3444 #if HWY_TARGET <= HWY_AVX3
3445  alignas(64) constexpr uint16_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3446  16, 18, 20, 22, 24, 26, 28, 30};
3447  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi16(
3448  BitCast(du, lo).raw, Load(du, kIdx).raw,
3449  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
3450 #else
3451  const RepartitionToWide<decltype(du)> dw;
3452  // Isolate lower 16 bits per u32 so we can pack.
3453  const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
3454  const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
3455  const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
3456  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3457  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3458 #endif
3459 }
3460 
3461 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3462 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3463  const RebindToUnsigned<decltype(d)> du;
3464 #if HWY_TARGET <= HWY_AVX3
3465  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3466  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3467  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3468  BitCast(du, hi).raw)});
3469 #else
3470  const RebindToFloat<decltype(d)> df;
3471  const Vec256<float> v2020{_mm256_shuffle_ps(
3472  BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
3473  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
3474  _MM_SHUFFLE(3, 1, 2, 0))};
3475 
3476 #endif
3477 }
3478 
3479 HWY_API Vec256<float> ConcatEven(Full256<float> d, Vec256<float> hi,
3480  Vec256<float> lo) {
3481  const RebindToUnsigned<decltype(d)> du;
3482 #if HWY_TARGET <= HWY_AVX3
3483  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3484  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3485  __mmask8{0xFF}, hi.raw)};
3486 #else
3487  const Vec256<float> v2020{
3488  _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
3489  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3490  BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3491 
3492 #endif
3493 }
3494 
3495 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3496 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3497  const RebindToUnsigned<decltype(d)> du;
3498 #if HWY_TARGET <= HWY_AVX3
3499  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3500  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3501  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3502  BitCast(du, hi).raw)});
3503 #else
3504  const RebindToFloat<decltype(d)> df;
3505  const Vec256<double> v20{
3506  _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
3507  return Vec256<T>{
3508  _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3509 
3510 #endif
3511 }
3512 
3513 HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi,
3514  Vec256<double> lo) {
3515 #if HWY_TARGET <= HWY_AVX3
3516  const RebindToUnsigned<decltype(d)> du;
3517  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3518  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3519  __mmask8{0xFF}, hi.raw)};
3520 #else
3521  (void)d;
3522  const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
3523  return Vec256<double>{
3524  _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3525 #endif
3526 }
3527 
3528 // ------------------------------ DupEven (InterleaveLower)
3529 
3530 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3531 HWY_API Vec256<T> DupEven(const Vec256<T> v) {
3532  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3533 }
3534 HWY_API Vec256<float> DupEven(const Vec256<float> v) {
3535  return Vec256<float>{
3536  _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3537 }
3538 
3539 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3540 HWY_API Vec256<T> DupEven(const Vec256<T> v) {
3541  return InterleaveLower(Full256<T>(), v, v);
3542 }
3543 
3544 // ------------------------------ DupOdd (InterleaveUpper)
3545 
3546 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3547 HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
3548  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3549 }
3550 HWY_API Vec256<float> DupOdd(const Vec256<float> v) {
3551  return Vec256<float>{
3552  _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3553 }
3554 
3555 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3556 HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
3557  return InterleaveUpper(Full256<T>(), v, v);
3558 }
3559 
3560 // ------------------------------ OddEven
3561 
3562 namespace detail {
3563 
3564 template <typename T>
3565 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
3566  const Vec256<T> b) {
3567  const Full256<T> d;
3568  const Full256<uint8_t> d8;
3569  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3570  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3571  return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
3572 }
3573 template <typename T>
3574 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
3575  const Vec256<T> b) {
3576  return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
3577 }
3578 template <typename T>
3579 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
3580  const Vec256<T> b) {
3581  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
3582 }
3583 template <typename T>
3584 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
3585  const Vec256<T> b) {
3586  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
3587 }
3588 
3589 } // namespace detail
3590 
3591 template <typename T>
3592 HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
3593  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3594 }
3595 HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
3596  return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
3597 }
3598 
3599 HWY_API Vec256<double> OddEven(const Vec256<double> a, const Vec256<double> b) {
3600  return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
3601 }
3602 
3603 // ------------------------------ OddEvenBlocks
3604 
3605 template <typename T>
3606 HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
3607  return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)};
3608 }
3609 
3610 HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
3611  return Vec256<float>{_mm256_blend_ps(odd.raw, even.raw, 0xFu)};
3612 }
3613 
3614 HWY_API Vec256<double> OddEvenBlocks(Vec256<double> odd, Vec256<double> even) {
3615  return Vec256<double>{_mm256_blend_pd(odd.raw, even.raw, 0x3u)};
3616 }
3617 
3618 // ------------------------------ ReverseBlocks (ConcatLowerUpper)
3619 
3620 template <typename T>
3621 HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
3622  return ConcatLowerUpper(d, v, v);
3623 }
3624 
3625 // ------------------------------ TableLookupBytes (ZeroExtendVector)
3626 
3627 // Both full
3628 template <typename T, typename TI>
3629 HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
3630  const Vec256<TI> from) {
3631  return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
3632 }
3633 
3634 // Partial index vector
3635 template <typename T, typename TI, size_t NI>
3636 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec256<T> bytes,
3637  const Vec128<TI, NI> from) {
3638  // First expand to full 128, then 256.
3639  const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
3640  const auto tbl_full = TableLookupBytes(bytes, from_256);
3641  // Shrink to 128, then partial.
3642  return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
3643 }
3644 
3645 // Partial table vector
3646 template <typename T, size_t N, typename TI>
3647 HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
3648  const Vec256<TI> from) {
3649  // First expand to full 128, then 256.
3650  const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
3651  return TableLookupBytes(bytes_256, from);
3652 }
3653 
3654 // Partial both are handled by x86_128.
3655 
3656 // ------------------------------ Shl (Mul, ZipLower)
3657 
3658 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
3659 namespace detail {
3660 
3661 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
3662 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3663 HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
3664  const Full256<T> d;
3665  const RepartitionToWide<decltype(d)> dw;
3666  const Rebind<float, decltype(dw)> df;
3667  const auto zero = Zero(d);
3668  // Move into exponent (this u16 will become the upper half of an f32)
3669  const auto exp = ShiftLeft<23 - 16>(v);
3670  const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
3671  // Insert 0 into lower halves for reinterpreting as binary32.
3672  const auto f0 = ZipLower(dw, zero, upper);
3673  const auto f1 = ZipUpper(dw, zero, upper);
3674  // Do not use ConvertTo because it checks for overflow, which is redundant
3675  // because we only care about v in [0, 16).
3676  const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
3677  const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
3678  return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
3679 }
3680 
3681 } // namespace detail
3682 #endif // HWY_TARGET > HWY_AVX3
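// Illustrative aside (not part of this header): a scalar sketch of the Pow2
// trick used by the AVX2 fallback above, assuming a shift count in [0, 16).
// The count is written into a binary32 exponent to produce 2^bits, which then
// acts as a per-lane multiplier.
inline uint16_t ShlViaPow2Sketch(uint16_t v, uint16_t bits) {
  const uint32_t f32_bits = (uint32_t{bits} << 23) + 0x3F800000u;  // 2^bits
  float pow2;
  CopyBytes<4>(&f32_bits, &pow2);  // reinterpret the bit pattern as float
  return static_cast<uint16_t>(v * static_cast<uint32_t>(pow2));  // v << bits
}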
3683 
3684 HWY_API Vec256<uint16_t> operator<<(const Vec256<uint16_t> v,
3685  const Vec256<uint16_t> bits) {
3686 #if HWY_TARGET <= HWY_AVX3
3687  return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
3688 #else
3689  return v * detail::Pow2(bits);
3690 #endif
3691 }
3692 
3693 HWY_API Vec256<uint32_t> operator<<(const Vec256<uint32_t> v,
3694  const Vec256<uint32_t> bits) {
3695  return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
3696 }
3697 
3698 HWY_API Vec256<uint64_t> operator<<(const Vec256<uint64_t> v,
3699  const Vec256<uint64_t> bits) {
3700  return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
3701 }
3702 
3703 // Signed left shift is the same as unsigned.
3704 template <typename T, HWY_IF_SIGNED(T)>
3705 HWY_API Vec256<T> operator<<(const Vec256<T> v, const Vec256<T> bits) {
3706  const Full256<T> di;
3707  const Full256<MakeUnsigned<T>> du;
3708  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
3709 }
3710 
3711 // ------------------------------ Shr (MulHigh, IfThenElse, Not)
3712 
3713 HWY_API Vec256<uint16_t> operator>>(const Vec256<uint16_t> v,
3714  const Vec256<uint16_t> bits) {
3715 #if HWY_TARGET <= HWY_AVX3
3716  return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
3717 #else
3718  const Full256<uint16_t> d;
3719  // For bits=0, we cannot mul by 2^16, so fix the result later.
3720  const auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
3721  // Replace output with input where bits == 0.
3722  return IfThenElse(bits == Zero(d), v, out);
3723 #endif
3724 }
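// Illustrative aside (not part of this header): the identity behind the AVX2
// path above. For a shift count s in [1, 16), (v * 2^(16 - s)) >> 16 == v >> s,
// which is exactly MulHigh(v, Pow2(16 - s)); s == 0 would need a multiplier of
// 2^16, hence the IfThenElse fixup.
inline uint16_t ShrViaMulHighSketch(uint16_t v, int s) {
  if (s == 0) return v;                           // 2^16 does not fit in u16
  const uint32_t wide = uint32_t{v} << (16 - s);  // v * 2^(16 - s)
  return static_cast<uint16_t>(wide >> 16);       // == v >> s
}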
3725 
3726 HWY_API Vec256<uint32_t> operator>>(const Vec256<uint32_t> v,
3727  const Vec256<uint32_t> bits) {
3728  return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
3729 }
3730 
3731 HWY_API Vec256<uint64_t> operator>>(const Vec256<uint64_t> v,
3732  const Vec256<uint64_t> bits) {
3733  return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
3734 }
3735 
3736 HWY_API Vec256<int16_t> operator>>(const Vec256<int16_t> v,
3737  const Vec256<int16_t> bits) {
3738 #if HWY_TARGET <= HWY_AVX3
3739  return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
3740 #else
3741  return detail::SignedShr(Full256<int16_t>(), v, bits);
3742 #endif
3743 }
3744 
3745 HWY_API Vec256<int32_t> operator>>(const Vec256<int32_t> v,
3746  const Vec256<int32_t> bits) {
3747  return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
3748 }
3749 
3750 HWY_API Vec256<int64_t> operator>>(const Vec256<int64_t> v,
3751  const Vec256<int64_t> bits) {
3752 #if HWY_TARGET <= HWY_AVX3
3753  return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
3754 #else
3755  return detail::SignedShr(Full256<int64_t>(), v, bits);
3756 #endif
3757 }
3758 
3759 HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
3760  const Vec256<uint64_t> b) {
3761  const DFromV<decltype(a)> du64;
3762  const RepartitionToNarrow<decltype(du64)> du32;
3763  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3764  const auto a32 = BitCast(du32, a);
3765  const auto b32 = BitCast(du32, b);
3766  // Inputs for MulEven: we only need the lower 32 bits
3767  const auto aH = Shuffle2301(a32);
3768  const auto bH = Shuffle2301(b32);
3769 
3770  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
3771  // the even (lower 64 bits of every 128-bit block) results. See
3772  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
3773  const auto aLbL = MulEven(a32, b32);
3774  const auto w3 = aLbL & maskL;
3775 
3776  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3777  const auto w2 = t2 & maskL;
3778  const auto w1 = ShiftRight<32>(t2);
3779 
3780  const auto t = MulEven(a32, bH) + w2;
3781  const auto k = ShiftRight<32>(t);
3782 
3783  const auto mulH = MulEven(aH, bH) + w1 + k;
3784  const auto mulL = ShiftLeft<32>(t) + w3;
3785  return InterleaveLower(mulL, mulH);
3786 }
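// Illustrative aside (not part of this header): the same Knuth double-word
// multiplication on scalars, producing the full 128-bit product of two
// uint64_t as {lo, hi}. The vector code above forms these partial products for
// the even u64 lane of each 128-bit block (MulOdd below does the odd lane).
inline void MulWide64Sketch(uint64_t a, uint64_t b, uint64_t* lo,
                            uint64_t* hi) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;
  const uint64_t t2 = aH * bL + (aLbL >> 32);
  const uint64_t t = aL * bH + (t2 & 0xFFFFFFFFu);
  *hi = aH * bH + (t2 >> 32) + (t >> 32);
  *lo = (t << 32) + (aLbL & 0xFFFFFFFFu);
}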
3787 
3788 HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
3789  const Vec256<uint64_t> b) {
3790  const DFromV<decltype(a)> du64;
3791  const RepartitionToNarrow<decltype(du64)> du32;
3792  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3793  const auto a32 = BitCast(du32, a);
3794  const auto b32 = BitCast(du32, b);
3795  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
3796  const auto aH = Shuffle2301(a32);
3797  const auto bH = Shuffle2301(b32);
3798 
3799  // Same as above, but we're using the odd results (upper 64 bits per block).
3800  const auto aLbL = MulEven(a32, b32);
3801  const auto w3 = aLbL & maskL;
3802 
3803  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3804  const auto w2 = t2 & maskL;
3805  const auto w1 = ShiftRight<32>(t2);
3806 
3807  const auto t = MulEven(a32, bH) + w2;
3808  const auto k = ShiftRight<32>(t);
3809 
3810  const auto mulH = MulEven(aH, bH) + w1 + k;
3811  const auto mulL = ShiftLeft<32>(t) + w3;
3812  return InterleaveUpper(du64, mulL, mulH);
3813 }
3814 
3815 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3816 
3817 HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
3818  Vec256<bfloat16_t> a,
3819  Vec256<bfloat16_t> b,
3820  const Vec256<float> sum0,
3821  Vec256<float>& sum1) {
3822  // TODO(janwas): _mm256_dpbf16_ps when available
3823  const Repartition<uint16_t, decltype(df32)> du16;
3824  const RebindToUnsigned<decltype(df32)> du32;
3825  const Vec256<uint16_t> zero = Zero(du16);
3826  // Lane order within sum0/1 is undefined, hence we can avoid the
3827  // longer-latency lane-crossing PromoteTo.
3828  const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
3829  const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3830  const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
3831  const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3832  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3833  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3834 }
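// Usage note (illustrative): both accumulators must be carried across calls,
// e.g. for a bf16 dot product:
//   Vec256<float> sum0 = Zero(df32), sum1 = Zero(df32);
//   for (...) sum0 = ReorderWidenMulAccumulate(df32, a, b, sum0, sum1);
// Because the lane order is unspecified, only a reduction over sum0 + sum1
// (e.g. via SumOfLanes) is meaningful, not individual lanes.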
3835 
3836 // ================================================== CONVERT
3837 
3838 // ------------------------------ Promotions (part w/ narrow lanes -> full)
3839 
3840 HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3841  const Vec128<float, 4> v) {
3842  return Vec256<double>{_mm256_cvtps_pd(v.raw)};
3843 }
3844 
3845 HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3846  const Vec128<int32_t, 4> v) {
3847  return Vec256<double>{_mm256_cvtepi32_pd(v.raw)};
3848 }
3849 
3850 // Unsigned: zero-extend.
3851 // Note: these have 3 cycle latency; if inputs are already split across the
3852 // 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
3853 HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
3854  Vec128<uint8_t> v) {
3855  return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)};
3856 }
3857 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3858  Vec128<uint8_t, 8> v) {
3859  return Vec256<uint32_t>{_mm256_cvtepu8_epi32(v.raw)};
3860 }
3861 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3862  Vec128<uint8_t> v) {
3863  return Vec256<int16_t>{_mm256_cvtepu8_epi16(v.raw)};
3864 }
3865 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3866  Vec128<uint8_t, 8> v) {
3867  return Vec256<int32_t>{_mm256_cvtepu8_epi32(v.raw)};
3868 }
3869 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3870  Vec128<uint16_t> v) {
3871  return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)};
3872 }
3873 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3874  Vec128<uint16_t> v) {
3875  return Vec256<int32_t>{_mm256_cvtepu16_epi32(v.raw)};
3876 }
3877 HWY_API Vec256<uint64_t> PromoteTo(Full256<uint64_t> /* tag */,
3878  Vec128<uint32_t> v) {
3879  return Vec256<uint64_t>{_mm256_cvtepu32_epi64(v.raw)};
3880 }
3881 
3882 // Signed: replicate sign bit.
3883 // Note: these have 3 cycle latency; if inputs are already split across the
3884 // 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
3885 // signed shift would be faster.
3886 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3887  Vec128<int8_t> v) {
3888  return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)};
3889 }
3890 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3891  Vec128<int8_t, 8> v) {
3892  return Vec256<int32_t>{_mm256_cvtepi8_epi32(v.raw)};
3893 }
3894 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3895  Vec128<int16_t> v) {
3896  return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)};
3897 }
3898 HWY_API Vec256<int64_t> PromoteTo(Full256<int64_t> /* tag */,
3899  Vec128<int32_t> v) {
3900  return Vec256<int64_t>{_mm256_cvtepi32_epi64(v.raw)};
3901 }
3902 
3903 // ------------------------------ Demotions (full -> part w/ narrow lanes)
3904 
3905 HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
3906  const Vec256<int32_t> v) {
3907  const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
3908  // Concatenating lower halves of both 128-bit blocks afterward is more
3909  // efficient than an extra input with low block = high block of v.
3910  return Vec128<uint16_t>{
3911  _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
3912 }
3913 
3914 HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
3915  const Vec256<int32_t> v) {
3916  const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
3917  return Vec128<int16_t>{
3918  _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
3919 }
3920 
3921 HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
3922  const Vec256<int32_t> v) {
3923  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
3924  // Concatenate lower 64 bits of each 128-bit block
3925  const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
3926  const __m128i u16 = _mm256_castsi256_si128(u16_concat);
3927  // packus treats the input as signed; we want unsigned. Clear the MSB to get
3928  // unsigned saturation to u8.
3929  const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
3930  return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};
3931 }
3932 
3933 HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
3934  const Vec256<int16_t> v) {
3935  const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
3936  return Vec128<uint8_t>{
3937  _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
3938 }
3939 
3940 HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
3941  const Vec256<int32_t> v) {
3942  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
3943  // Concatenate lower 64 bits of each 128-bit block
3944  const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
3945  const __m128i i16 = _mm256_castsi256_si128(i16_concat);
3946  return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};
3947 }
3948 
3949 HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
3950  const Vec256<int16_t> v) {
3951  const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
3952  return Vec128<int8_t>{
3953  _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
3954 }
3955 
3956  // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
3957  // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
3958 HWY_DIAGNOSTICS(push)
3959 HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
3960 
3961 HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> df16,
3962  const Vec256<float> v) {
3963 #ifdef HWY_DISABLE_F16C
3964  const RebindToUnsigned<decltype(df16)> du16;
3965  const Rebind<uint32_t, decltype(df16)> du;
3966  const RebindToSigned<decltype(du)> di;
3967  const auto bits32 = BitCast(du, v);
3968  const auto sign = ShiftRight<31>(bits32);
3969  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3970  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3971 
3972  const auto k15 = Set(di, 15);
3973  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3974  const auto is_tiny = exp < Set(di, -24);
3975 
3976  const auto is_subnormal = exp < Set(di, -14);
3977  const auto biased_exp16 =
3978  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3979  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3980  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3981  (mantissa32 >> (Set(du, 13) + sub_exp));
3982  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3983  ShiftRight<13>(mantissa32)); // <1024
3984 
3985  const auto sign16 = ShiftLeft<15>(sign);
3986  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3987  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3988  return BitCast(df16, DemoteTo(du16, bits16));
3989 #else
3990  (void)df16;
3991  return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3992 #endif
3993 }
3994 
3995 HWY_DIAGNOSTICS(pop)
3996 
3997 HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
3998  const Vec256<float> v) {
3999  // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
4000  const Rebind<int32_t, decltype(dbf16)> di32;
4001  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4002  const Rebind<uint16_t, decltype(dbf16)> du16;
4003  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4004  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4005 }
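// Illustrative aside (not part of this header): the scalar equivalent of the
// bfloat16 demotion above, which truncates to the upper 16 bits of the
// binary32 representation (no rounding).
inline uint16_t F32ToBF16TruncSketch(float f) {
  uint32_t bits;
  CopyBytes<4>(&f, &bits);
  return static_cast<uint16_t>(bits >> 16);
}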
4006 
4007 HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
4008  Vec256<float> a, Vec256<float> b) {
4009  // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
4010  const RebindToUnsigned<decltype(dbf16)> du16;
4011  const Repartition<uint32_t, decltype(dbf16)> du32;
4012  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
4013  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4014 }
4015 
4016 HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
4017  const Vec256<double> v) {
4018  return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
4019 }
4020 
4021 HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
4022  const Vec256<double> v) {
4023  const auto clamped = detail::ClampF64ToI32Max(Full256<double>(), v);
4024  return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
4025 }
4026 
4027 // For already range-limited input [0, 255].
4028 HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
4029  const Full256<uint32_t> d32;
4030  alignas(32) static constexpr uint32_t k8From32[8] = {
4031  0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
4032  // Place first four bytes in lo[0], remaining 4 in hi[1].
4033  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
4034  // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
4035  const auto lo = LowerHalf(quad);
4036  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
4037  const auto pair = LowerHalf(lo | hi);
4038  return BitCast(Full64<uint8_t>(), pair);
4039 }
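// Illustrative aside (not part of this header): the scalar equivalent of
// U8FromU32. The 0x0C080400 shuffle constant gathers bytes 0, 4, 8 and 12,
// i.e. the low byte of each u32 within a 128-bit block.
inline void U8FromU32Sketch(const uint32_t in[8], uint8_t out[8]) {
  for (size_t i = 0; i < 8; ++i) {
    out[i] = static_cast<uint8_t>(in[i] & 0xFF);
  }
}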
4040 
4041 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
4042 
4043 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
4044  const Vec256<int32_t> v) {
4045  return Vec256<float>{_mm256_cvtepi32_ps(v.raw)};
4046 }
4047 
4048 HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
4049 #if HWY_TARGET <= HWY_AVX3
4050  (void)dd;
4051  return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
4052 #else
4053  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4054  const Repartition<uint32_t, decltype(dd)> d32;
4055  const Repartition<uint64_t, decltype(dd)> d64;
4056 
4057  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
4058  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4059  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4060 
4061  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
4062  const auto k52 = Set(d32, 0x43300000);
4063  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
4064 
4065  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4066  return (v_upper - k84_63_52) + v_lower; // order matters!
4067 #endif
4068 }
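// Illustrative aside (not part of this header): scalar sketch of the
// magic-constant path above. The two 32-bit halves are encoded into doubles
// anchored at 2^84 and 2^52, then the combined bias 2^84 + 2^63 + 2^52 is
// subtracted; only the final addition rounds.
inline double I64ToF64Sketch(int64_t x) {
  const uint64_t u = static_cast<uint64_t>(x);
  const uint64_t hi_bits = 0x4530000080000000ull ^ (u >> 32);             // upper half
  const uint64_t lo_bits = 0x4330000000000000ull | (u & 0xFFFFFFFFull);   // 2^52 + lower
  const uint64_t bias_bits = 0x4530000080100000ull;  // 2^84 + 2^63 + 2^52
  double hi, lo, bias;
  CopyBytes<8>(&hi_bits, &hi);
  CopyBytes<8>(&lo_bits, &lo);
  CopyBytes<8>(&bias_bits, &bias);
  return (hi - bias) + lo;  // order matters, as in the vector code
}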
4069 
4070 // Truncates (rounds toward zero).
4071 HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
4072  return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
4073 }
4074 
4075 HWY_API Vec256<int64_t> ConvertTo(Full256<int64_t> di, const Vec256<double> v) {
4076 #if HWY_TARGET <= HWY_AVX3
4077  return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw));
4078 #else
4079  using VI = decltype(Zero(di));
4080  const VI k0 = Zero(di);
4081  const VI k1 = Set(di, 1);
4082  const VI k51 = Set(di, 51);
4083 
4084  // Exponent indicates whether the number can be represented as int64_t.
4085  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4086  const VI exp = biased_exp - Set(di, 0x3FF);
4087  const auto in_range = exp < Set(di, 63);
4088 
4089  // If we were to cap the exponent at 51 and add 2^52, the number would be in
4090  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4091  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
4092  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
4093  // manually shift the mantissa into place (we already have many of the
4094  // inputs anyway).
4095  const VI shift_mnt = Max(k51 - exp, k0);
4096  const VI shift_int = Max(exp - k51, k0);
4097  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4098  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
4099  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4100  // For inputs larger than 2^52, insert zeros at the bottom.
4101  const VI shifted = int52 << shift_int;
4102  // Restore the one bit lost when shifting in the implicit 1-bit.
4103  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4104 
4105  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
4106  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
4107  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4108  const VI magnitude = IfThenElse(in_range, restored, limit);
4109 
4110  // If the input was negative, negate the integer (two's complement).
4111  return (magnitude ^ sign_mask) - sign_mask;
4112 #endif
4113 }
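// Illustrative aside (not part of this header): scalar sketch of the same
// truncate-by-mantissa-shift idea, including saturation to LimitsMin/Max.
// NaN handling is simplified here (it saturates like an out-of-range value).
inline int64_t F64ToI64TruncSketch(double v) {
  uint64_t bits;
  CopyBytes<8>(&v, &bits);
  const int64_t exp = static_cast<int64_t>((bits >> 52) & 0x7FF) - 0x3FF;
  const int64_t sign = static_cast<int64_t>(bits) >> 63;  // 0 or -1 (arithmetic shift assumed)
  if (exp >= 63) return sign ? LimitsMin<int64_t>() : LimitsMax<int64_t>();
  if (exp < 0) return 0;  // |v| < 1 truncates to 0
  const int64_t mantissa =
      static_cast<int64_t>(bits & ((1ull << 52) - 1)) | (int64_t{1} << 52);
  const int64_t magnitude =
      (exp <= 52) ? (mantissa >> (52 - exp)) : (mantissa << (exp - 52));
  return (magnitude ^ sign) - sign;  // conditional two's complement negation
}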
4114 
4115 HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
4116  const Full256<int32_t> di;
4117  return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw));
4118 }
4119 
4120 
4121 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
4122  const Vec128<float16_t> v) {
4123 #ifdef HWY_DISABLE_F16C
4124  const RebindToSigned<decltype(df32)> di32;
4125  const RebindToUnsigned<decltype(df32)> du32;
4126  // Expand to u32 so we can shift.
4127  const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw});
4128  const auto sign = ShiftRight<15>(bits16);
4129  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
4130  const auto mantissa = bits16 & Set(du32, 0x3FF);
4131  const auto subnormal =
4132  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
4133  Set(df32, 1.0f / 16384 / 1024));
4134 
4135  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
4136  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
4137  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4138  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
4139  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
4140 #else
4141  (void)df32;
4142  return Vec256<float>{_mm256_cvtph_ps(v.raw)};
4143 #endif
4144 }
4145 
4146 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
4147  const Vec128<bfloat16_t> v) {
4148  const Rebind<uint16_t, decltype(df32)> du16;
4149  const RebindToSigned<decltype(df32)> di32;
4150  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4151 }
4152 
4153 // ================================================== CRYPTO
4154 
4155 #if !defined(HWY_DISABLE_PCLMUL_AES)
4156 
4157 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4158 #ifdef HWY_NATIVE_AES
4159 #undef HWY_NATIVE_AES
4160 #else
4161 #define HWY_NATIVE_AES
4162 #endif
4163 
4164 HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
4165  Vec256<uint8_t> round_key) {
4166 #if HWY_TARGET == HWY_AVX3_DL
4167  return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)};
4168 #else
4169  const Full256<uint8_t> d;
4170  const Half<decltype(d)> d2;
4171  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
4172  AESRound(LowerHalf(state), LowerHalf(round_key)));
4173 #endif
4174 }
4175 
4176 HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
4177  Vec256<uint8_t> round_key) {
4178 #if HWY_TARGET == HWY_AVX3_DL
4179  return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
4180 #else
4181  const Full256<uint8_t> d;
4182  const Half<decltype(d)> d2;
4183  return Combine(d,
4184  AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
4185  AESLastRound(LowerHalf(state), LowerHalf(round_key)));
4186 #endif
4187 }
4188 
4189 HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
4190 #if HWY_TARGET == HWY_AVX3_DL
4191  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
4192 #else
4193  const Full256<uint64_t> d;
4194  const Half<decltype(d)> d2;
4195  return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)),
4196  CLMulLower(LowerHalf(a), LowerHalf(b)));
4197 #endif
4198 }
4199 
4200 HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) {
4201 #if HWY_TARGET == HWY_AVX3_DL
4202  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)};
4203 #else
4204  const Full256<uint64_t> d;
4205  const Half<decltype(d)> d2;
4206  return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)),
4207  CLMulUpper(LowerHalf(a), LowerHalf(b)));
4208 #endif
4209 }
4210 
4211 #endif // HWY_DISABLE_PCLMUL_AES
4212 
4213 // ================================================== MISC
4214 
4215 // Returns a vector with lane i=[0, N) set to "first" + i.
4216 template <typename T, typename T2>
4217 HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
4218  HWY_ALIGN T lanes[32 / sizeof(T)];
4219  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
4220  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
4221  }
4222  return Load(d, lanes);
4223 }
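// Example (illustrative): Iota(Full256<int32_t>(), 10) yields
// {10, 11, 12, 13, 14, 15, 16, 17} in lanes 0..7.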
4224 
4225 #if HWY_TARGET <= HWY_AVX3
4226 
4227 // ------------------------------ LoadMaskBits
4228 
4229 // `p` points to at least 8 readable bytes, not all of which need be valid.
4230 template <typename T>
4231 HWY_API Mask256<T> LoadMaskBits(const Full256<T> /* tag */,
4232  const uint8_t* HWY_RESTRICT bits) {
4233  constexpr size_t N = 32 / sizeof(T);
4234  constexpr size_t kNumBytes = (N + 7) / 8;
4235 
4236  uint64_t mask_bits = 0;
4237  CopyBytes<kNumBytes>(bits, &mask_bits);
4238 
4239  if (N < 8) {
4240  mask_bits &= (1ull << N) - 1;
4241  }
4242 
4243  return Mask256<T>::FromBits(mask_bits);
4244 }
4245 
4246 // ------------------------------ StoreMaskBits
4247 
4248 // `p` points to at least 8 writable bytes.
4249 template <typename T>
4250 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4251  uint8_t* bits) {
4252  constexpr size_t N = 32 / sizeof(T);
4253  constexpr size_t kNumBytes = (N + 7) / 8;
4254 
4255  CopyBytes<kNumBytes>(&mask.raw, bits);
4256 
4257  // Non-full byte, need to clear the undefined upper bits.
4258  if (N < 8) {
4259  const int mask = static_cast<int>((1ull << N) - 1);
4260  bits[0] = static_cast<uint8_t>(bits[0] & mask);
4261  }
4262  return kNumBytes;
4263 }
4264 
4265 // ------------------------------ Mask testing
4266 
4267 template <typename T>
4268 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4269  return PopCount(static_cast<uint64_t>(mask.raw));
4270 }
4271 
4272 template <typename T>
4273 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
4274  const Mask256<T> mask) {
4275  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
4276 }
4277 
4278 // Beware: the suffix indicates the number of mask bits, not lane size!
4279 
4280 namespace detail {
4281 
4282 template <typename T>
4283 HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
4284 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4285  return _kortestz_mask32_u8(mask.raw, mask.raw);
4286 #else
4287  return mask.raw == 0;
4288 #endif
4289 }
4290 template <typename T>
4291 HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
4292 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4293  return _kortestz_mask16_u8(mask.raw, mask.raw);
4294 #else
4295  return mask.raw == 0;
4296 #endif
4297 }
4298 template <typename T>
4299 HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
4300 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4301  return _kortestz_mask8_u8(mask.raw, mask.raw);
4302 #else
4303  return mask.raw == 0;
4304 #endif
4305 }
4306 template <typename T>
4307 HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
4308  return (uint64_t{mask.raw} & 0xF) == 0;
4309 }
4310 
4311 } // namespace detail
4312 
4313 template <typename T>
4314 HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4315  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
4316 }
4317 
4318 namespace detail {
4319 
4320 template <typename T>
4321 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
4322 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4323  return _kortestc_mask32_u8(mask.raw, mask.raw);
4324 #else
4325  return mask.raw == 0xFFFFFFFFu;
4326 #endif
4327 }
4328 template <typename T>
4329 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
4330 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4331  return _kortestc_mask16_u8(mask.raw, mask.raw);
4332 #else
4333  return mask.raw == 0xFFFFu;
4334 #endif
4335 }
4336 template <typename T>
4337 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
4338 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4339  return _kortestc_mask8_u8(mask.raw, mask.raw);
4340 #else
4341  return mask.raw == 0xFFu;
4342 #endif
4343 }
4344 template <typename T>
4345 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
4346  // Cannot use _kortestc because we have less than 8 mask bits.
4347  return mask.raw == 0xFu;
4348 }
4349 
4350 } // namespace detail
4351 
4352 template <typename T>
4353 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4354  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
4355 }
4356 
4357 // ------------------------------ Compress
4358 
4359 // 16-bit is defined in x86_512 so we can use 512-bit vectors.
4360 
4361 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4362 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4363  return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
4364 }
4365 
4366 HWY_API Vec256<float> Compress(Vec256<float> v, Mask256<float> mask) {
4367  return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)};
4368 }
4369 
4370 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4371 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4372  // See CompressIsPartition.
4373  alignas(16) constexpr uint64_t packed_array[16] = {
4374  // PrintCompress64x4NibbleTables
4375  0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
4376  0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
4377  0x00001032, 0x00001320, 0x00000321, 0x00003210};
4378 
4379  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
4380  // _mm256_permutexvar_epi64 will ignore the upper bits.
4381  const Full256<T> d;
4382  const RebindToUnsigned<decltype(d)> du64;
4383  const auto packed = Set(du64, packed_array[mask.raw]);
4384  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4385  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
4386  return TableLookupLanes(v, indices);
4387 }
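// Worked example (illustrative): mask.raw = 0b0110 selects lanes 1 and 2.
// packed_array[6] = 0x00003021; reading nibbles from the bottom gives the
// permutation {1, 2, 0, 3}, i.e. the selected lanes move to the front and the
// remaining lanes follow in order (see CompressIsPartition).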
4388 
4389 // ------------------------------ CompressNot (Compress)
4390 
4391 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
4392 HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
4393  return Compress(v, Not(mask));
4394 }
4395 
4396 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4397 HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
4398  // See CompressIsPartition.
4399  alignas(16) constexpr uint64_t packed_array[16] = {
4400  // PrintCompressNot64x4NibbleTables
4401  0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
4402  0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
4403  0x00003210, 0x00003201, 0x00003210, 0x00003210};
4404 
4405  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
4406  // _mm256_permutexvar_epi64 will ignore the upper bits.
4407  const Full256<T> d;
4408  const RebindToUnsigned<decltype(d)> du64;
4409  const auto packed = Set(du64, packed_array[mask.raw]);
4410  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4411  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
4412  return TableLookupLanes(v, indices);
4413 }
4414 
4415 // ------------------------------ CompressBlocksNot
4416 HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
4417  Mask256<uint64_t> mask) {
4418  return CompressNot(v, mask);
4419 }
4420 
4421 // ------------------------------ CompressBits (LoadMaskBits)
4422 template <typename T>
4423 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4424  return Compress(v, LoadMaskBits(Full256<T>(), bits));
4425 }
4426 
4427 // ------------------------------ CompressStore
4428 
4429 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4430 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> d,
4431  T* HWY_RESTRICT unaligned) {
4432  const Rebind<uint16_t, decltype(d)> du;
4433  const auto vu = BitCast(du, v); // (required for float16_t inputs)
4434 
4435  const uint64_t mask_bits{mask.raw};
4436 
4437 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
4438  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
4439 #else
4440  // Split into halves to keep the table size manageable.
4441  const Half<decltype(du)> duh;
4442  const auto vL = LowerHalf(duh, vu);
4443  const auto vH = UpperHalf(duh, vu);
4444 
4445  const uint64_t mask_bitsL = mask_bits & 0xFF;
4446  const uint64_t mask_bitsH = mask_bits >> 8;
4447 
4448  const auto idxL = detail::IndicesForCompress16(mask_bitsL);
4449  const auto idxH = detail::IndicesForCompress16(mask_bitsH);
4450 
4451  // Compress the two 128-bit halves separately.
4452  const Vec128<uint16_t> cL{_mm_permutexvar_epi16(idxL.raw, vL.raw)};
4453  const Vec128<uint16_t> cH{_mm_permutexvar_epi16(idxH.raw, vH.raw)};
4454  const Half<decltype(d)> dh;
4455  StoreU(BitCast(dh, cL), dh, unaligned);
4456  StoreU(BitCast(dh, cH), dh, unaligned + PopCount(mask_bitsL));
4457 #endif // HWY_TARGET == HWY_AVX3_DL
4458 
4459  return PopCount(mask_bits);
4460 }
4461 
4462 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4463 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4464  T* HWY_RESTRICT unaligned) {
4465  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
4466  const size_t count = PopCount(uint64_t{mask.raw});
4467  // Workaround for MSAN not marking output as initialized (b/233326619)
4468 #if HWY_IS_MSAN
4469  __msan_unpoison(unaligned, count * sizeof(T));
4470 #endif
4471  return count;
4472 }
4473 
4474 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4475 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4476  T* HWY_RESTRICT unaligned) {
4477  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
4478  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4479  // Workaround for MSAN not marking output as initialized (b/233326619)
4480 #if HWY_IS_MSAN
4481  __msan_unpoison(unaligned, count * sizeof(T));
4482 #endif
4483  return count;
4484 }
4485 
4486 HWY_API size_t CompressStore(Vec256<float> v, Mask256<float> mask,
4487  Full256<float> /* tag */,
4488  float* HWY_RESTRICT unaligned) {
4489  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4490  const size_t count = PopCount(uint64_t{mask.raw});
4491  // Workaround for MSAN not marking output as initialized (b/233326619)
4492 #if HWY_IS_MSAN
4493  __msan_unpoison(unaligned, count * sizeof(float));
4494 #endif
4495  return count;
4496 }
4497 
4498 HWY_API size_t CompressStore(Vec256<double> v, Mask256<double> mask,
4499  Full256<double> /* tag */,
4500  double* HWY_RESTRICT unaligned) {
4501  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4502  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4503  // Workaround for MSAN not marking output as initialized (b/233326619)
4504 #if HWY_IS_MSAN
4505  __msan_unpoison(unaligned, count * sizeof(double));
4506 #endif
4507  return count;
4508 }
4509 
4510 // ------------------------------ CompressBlendedStore (CompressStore)
4511 
4512 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4513 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4514  T* HWY_RESTRICT unaligned) {
4515  // Native (32 or 64-bit) AVX-512 instruction already does the blending at no
4516  // extra cost (latency 11, rthroughput 2 - same as compress plus store).
4517  return CompressStore(v, m, d, unaligned);
4518 }
4519 
4520 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4521 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4522  T* HWY_RESTRICT unaligned) {
4523 #if HWY_TARGET <= HWY_AVX3_DL
4524  return CompressStore(v, m, d, unaligned); // also native
4525 #else
4526  const size_t count = CountTrue(d, m);
4527  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
4528  // Workaround for MSAN not marking output as initialized (b/233326619)
4529 #if HWY_IS_MSAN
4530  __msan_unpoison(unaligned, count * sizeof(T));
4531 #endif
4532  return count;
4533 #endif
4534 }
4535 
4536 // ------------------------------ CompressBitsStore (LoadMaskBits)
4537 
4538 template <typename T>
4539 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
4540  Full256<T> d, T* HWY_RESTRICT unaligned) {
4541  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4542 }
4543 
4544 #else // AVX2
4545 
4546 // ------------------------------ LoadMaskBits (TestBit)
4547 
4548 namespace detail {
4549 
4550 // 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_LE128 there.
4551 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4552 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4553  const RebindToUnsigned<decltype(d)> du;
4554  const Repartition<uint32_t, decltype(d)> du32;
4555  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
4556 
4557  // Replicate bytes 8x such that each byte contains the bit that governs it.
4558  const Repartition<uint64_t, decltype(d)> du64;
4559  alignas(32) constexpr uint64_t kRep8[4] = {
4560  0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
4561  0x0303030303030303ull};
4562  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
4563 
4564  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4565  1, 2, 4, 8, 16, 32, 64, 128};
4566  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4567 }
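// Worked example (illustrative): TableLookupBytes with kRep8 copies mask byte
// j into lanes 8j..8j+7; TestBit against {1, 2, 4, ..., 128} then sets lane
// 8j+i exactly when bit i of mask byte j is set.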
4568 
4569 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4570 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4571  const RebindToUnsigned<decltype(d)> du;
4572  alignas(32) constexpr uint16_t kBit[16] = {
4573  1, 2, 4, 8, 16, 32, 64, 128,
4574  0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
4575  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4576  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4577 }
4578 
4579 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4580 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4581  const RebindToUnsigned<decltype(d)> du;
4582  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4583  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4584  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4585 }
4586 
4587 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4588 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4589  const RebindToUnsigned<decltype(d)> du;
4590  alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
4591  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4592 }
4593 
4594 } // namespace detail
4595 
4596 // `p` points to at least 8 readable bytes, not all of which need be valid.
4597 template <typename T>
4598 HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
4599  const uint8_t* HWY_RESTRICT bits) {
4600  constexpr size_t N = 32 / sizeof(T);
4601  constexpr size_t kNumBytes = (N + 7) / 8;
4602 
4603  uint64_t mask_bits = 0;
4604  CopyBytes<kNumBytes>(bits, &mask_bits);
4605 
4606  if (N < 8) {
4607  mask_bits &= (1ull << N) - 1;
4608  }
4609 
4610  return detail::LoadMaskBits256(d, mask_bits);
4611 }
4612 
4613 // ------------------------------ StoreMaskBits
4614 
4615 namespace detail {
4616 
4617 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4618 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4619  const Full256<T> d;
4620  const Full256<uint8_t> d8;
4621  const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
4622  // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
4623  return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
4624 }
4625 
4626 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4627 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4628 #if HWY_ARCH_X86_64
4629  const Full256<T> d;
4630  const Full256<uint8_t> d8;
4631  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4632  const uint64_t sign_bits8 = BitsFromMask(mask8);
4633  // Skip the bits from the lower byte of each u16 (better not to use the
4634  // same packs_epi16 as SSE4, because that requires an extra swizzle here).
4635  return _pext_u64(sign_bits8, 0xAAAAAAAAull);
4636 #else
4637  // Slow workaround for 32-bit builds, which lack _pext_u64.
4638  // Remove useless lower half of each u16 while preserving the sign bit.
4639  // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
4640  const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
4641  // Move odd qwords (value zero) to top so they don't affect the mask value.
4642  const auto compressed =
4643  _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
4644  return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
4645 #endif // HWY_ARCH_X86_64
4646 }
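// Example (illustrative): each u16 lane contributes two identical sign bits to
// movemask, so _pext_u64 with 0xAA... keeps only the odd-position copies. For
// instance _pext_u64(0xC3, 0xAA) == 0x9 (bits 1, 3, 5, 7 of 0xC3 are 1,0,0,1).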
4647 
4648 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4649 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4650  const Full256<T> d;
4651  const Full256<float> df;
4652  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4653  return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
4654 }
4655 
4656 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4657 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4658  const Full256<T> d;
4659  const Full256<double> df;
4660  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4661  return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
4662 }
4663 
4664 } // namespace detail
4665 
4666 // `p` points to at least 8 writable bytes.
4667 template <typename T>
4668 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4669  uint8_t* bits) {
4670  constexpr size_t N = 32 / sizeof(T);
4671  constexpr size_t kNumBytes = (N + 7) / 8;
4672 
4673  const uint64_t mask_bits = detail::BitsFromMask(mask);
4674  CopyBytes<kNumBytes>(&mask_bits, bits);
4675  return kNumBytes;
4676 }
4677 
4678 // ------------------------------ Mask testing
4679 
4680 // Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask
4681 // lane is 0 or ~0.
4682 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4683 HWY_API bool AllFalse(const Full256<T> d, const Mask256<T> mask) {
4684  const Repartition<uint8_t, decltype(d)> d8;
4685  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4686  return detail::BitsFromMask(mask8) == 0;
4687 }
4688 
4689 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4690 HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4691  // Cheaper than PTEST, which is 2 uop / 3L.
4692  return detail::BitsFromMask(mask) == 0;
4693 }
4694 
4695 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4696 HWY_API bool AllTrue(const Full256<T> d, const Mask256<T> mask) {
4697  const Repartition<uint8_t, decltype(d)> d8;
4698  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4699  return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
4700 }
4701 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4702 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4703  constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
4704  return detail::BitsFromMask(mask) == kAllBits;
4705 }
4706 
4707 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4708 HWY_API size_t CountTrue(const Full256<T> d, const Mask256<T> mask) {
4709  const Repartition<uint8_t, decltype(d)> d8;
4710  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4711  return PopCount(detail::BitsFromMask(mask8)) >> 1;
4712 }
4713 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4714 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4715  return PopCount(detail::BitsFromMask(mask));
4716 }
4717 
4718 template <typename T>
4719 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
4720  const Mask256<T> mask) {
4721  const uint64_t mask_bits = detail::BitsFromMask(mask);
4722  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
4723 }
4724 
4725 // ------------------------------ Compress, CompressBits
4726 
4727 namespace detail {
4728 
4729 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4730 HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
4731  uint64_t mask_bits) {
4732  const RebindToUnsigned<decltype(d)> d32;
4733  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
4734  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
4735  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
4736  // and unavailable in 32-bit builds. We instead compress each index into 4
4737  // bits, for a total of 1 KiB.
4738  alignas(16) constexpr uint32_t packed_array[256] = {
4739  // PrintCompress32x8Tables
4740  0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
4741  0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
4742  0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
4743  0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
4744  0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
4745  0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
4746  0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
4747  0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
4748  0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
4749  0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
4750  0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
4751  0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
4752  0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
4753  0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
4754  0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
4755  0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
4756  0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
4757  0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
4758  0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
4759  0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
4760  0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
4761  0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
4762  0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
4763  0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
4764  0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
4765  0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
4766  0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
4767  0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
4768  0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
4769  0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
4770  0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
4771  0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
4772  0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
4773  0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
4774  0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
4775  0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
4776  0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
4777  0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
4778  0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
4779  0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
4780  0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
4781  0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
4782  0x10765432, 0x17654320, 0x07654321, 0x76543210};
4783 
4784  // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
4785  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4786  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
4787  // latency, it may be faster to use LoadDup128 and PSHUFB.
4788  const auto packed = Set(d32, packed_array[mask_bits]);
4789  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4790  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
4791 }
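// Worked example (illustrative): mask_bits = 0b00000101 selects lanes 0 and 2.
// packed_array[5] = 0x76543120; shifting out its nibbles yields the indices
// {0, 2, 1, 3, 4, 5, 6, 7}, gathering the selected lanes to the front.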
4792 
4793 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4794 HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
4795  uint64_t mask_bits) {
4796  const Repartition<uint32_t, decltype(d)> d32;
4797 
4798  // For 64-bit, we still need 32-bit indices because there is no 64-bit
4799  // permutevar, but there are only 4 lanes, so we can afford to skip the
4800  // unpacking and load the entire index vector directly.
4801  alignas(32) constexpr uint32_t u32_indices[128] = {
4802  // PrintCompress64x4PairTables
4803  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
4804  6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
4805  2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
4806  0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
4807  0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
4808  2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
4809  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
4810 }
4811 
4812 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4813 HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
4814  uint64_t mask_bits) {
4815  const RebindToUnsigned<decltype(d)> d32;
4816  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
4817  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
4818  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
4819  // and unavailable in 32-bit builds. We instead compress each index into 4
4820  // bits, for a total of 1 KiB.
4821  alignas(16) constexpr uint32_t packed_array[256] = {
4822  // PrintCompressNot32x8Tables
4823  0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
4824  0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
4825  0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
4826  0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
4827  0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
4828  0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
4829  0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
4830  0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
4831  0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
4832  0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
4833  0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
4834  0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
4835  0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
4836  0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
4837  0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
4838  0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
4839  0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
4840  0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
4841  0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
4842  0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
4843  0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
4844  0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
4845  0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
4846  0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
4847  0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
4848  0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
4849  0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
4850  0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
4851  0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
4852  0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
4853  0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
4854  0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
4855  0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
4856  0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
4857  0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
4858  0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
4859  0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
4860  0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
4861  0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
4862  0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
4863  0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
4864  0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
4865  0x76543210, 0x76543201, 0x76543210, 0x76543210};
4866 
4867  // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
4868  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4869  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
4870  // latency, it may be faster to use LoadDup128 and PSHUFB.
4871  const auto packed = Set(d32, packed_array[mask_bits]);
4872  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4873  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
4874 }
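
Illustrative sketch (not part of the header): a scalar model of the nibble unpacking that Set plus the per-lane right shifts perform. Each packed_array entry stores eight 4-bit lane indices, index i in bits [4*i, 4*i+4); the hardware permute only reads the low 3 bits of each lane, so no masking is needed after the shift. The helper name is hypothetical.

#include <cstdint>

void UnpackNibbleIndices(uint32_t packed, uint32_t idx[8]) {
  for (uint32_t i = 0; i < 8; ++i) {
    idx[i] = (packed >> (4 * i)) & 0xFu;  // mask only for readability here
  }
}
// packed = 0x76543210 (the mask_bits == 0 entry) unpacks to the identity
// permutation {0, 1, 2, 3, 4, 5, 6, 7}.
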
4875 
4876 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4877 HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
4878  uint64_t mask_bits) {
4879  const Repartition<uint32_t, decltype(d)> d32;
4880 
4881  // For 64-bit, we still need 32-bit indices because there is no 64-bit
4882  // permutevar, but there are only 4 lanes, so we can afford to skip the
4883  // unpacking and load the entire index vector directly.
4884  alignas(32) constexpr uint32_t u32_indices[128] = {
4885  // PrintCompressNot64x4PairTables
4886  0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
4887  2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
4888  0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
4889  2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
4890  4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
4891  6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
4892  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
4893 }
4894 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4895 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4896  const Full256<T> d;
4897  const Repartition<uint32_t, decltype(d)> du32;
4898 
4899  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
4900  const auto indices = IndicesFromBits(d, mask_bits);
4901  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
4902 }
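
Illustrative sketch (not part of the header): a scalar reference model of what this Compress computes for 32/64-bit lanes. Lanes whose mask bit is set move to the front in their original order; per the pair tables above, the remaining lanes follow, also in order.

#include <cstddef>
#include <cstdint>

template <typename T, size_t kLanes>
void CompressScalar(const T (&in)[kLanes], uint64_t mask_bits,
                    T (&out)[kLanes]) {
  size_t pos = 0;
  for (size_t i = 0; i < kLanes; ++i) {  // mask=true lanes, in order
    if (mask_bits & (1ull << i)) out[pos++] = in[i];
  }
  for (size_t i = 0; i < kLanes; ++i) {  // then mask=false lanes, in order
    if (!(mask_bits & (1ull << i))) out[pos++] = in[i];
  }
}
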
4903 
4904 // LUTs are infeasible for 2^16 possible masks, so splice together two
4905 // half-vector Compress.
4906 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4907 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4908  const Full256<T> d;
4909  const RebindToUnsigned<decltype(d)> du;
4910  const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
4911  const Half<decltype(du)> duh;
4912  const auto half0 = LowerHalf(duh, vu16);
4913  const auto half1 = UpperHalf(duh, vu16);
4914 
4915  const uint64_t mask_bits0 = mask_bits & 0xFF;
4916  const uint64_t mask_bits1 = mask_bits >> 8;
4917  const auto compressed0 = detail::CompressBits(half0, mask_bits0);
4918  const auto compressed1 = detail::CompressBits(half1, mask_bits1);
4919 
4920  alignas(32) uint16_t all_true[16] = {};
4921  // Store mask=true lanes, left to right.
4922  const size_t num_true0 = PopCount(mask_bits0);
4923  Store(compressed0, duh, all_true);
4924  StoreU(compressed1, duh, all_true + num_true0);
4925 
4926  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
4927  // Store mask=false lanes, right to left. The second vector fills the upper
4928  // half with right-aligned false lanes. The first vector is shifted
4929  // rightwards to overwrite the true lanes of the second.
4930  alignas(32) uint16_t all_false[16] = {};
4931  const size_t num_true1 = PopCount(mask_bits1);
4932  Store(compressed1, duh, all_false + 8);
4933  StoreU(compressed0, duh, all_false + num_true1);
4934 
4935  const auto mask = FirstN(du, num_true0 + num_true1);
4936  return BitCast(d,
4937  IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
4938  } else {
4939  // Only care about the mask=true lanes.
4940  return BitCast(d, Load(du, all_true));
4941  }
4942 }
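
Illustrative sketch (not part of the header): a scalar model of the buffer splicing above, assuming each half-vector Compress already yields its true lanes followed by its false lanes. Writing compressed1 at offset num_true0 makes all true lanes contiguous; writing compressed1 at offset 8 and then compressed0 at offset num_true1 right-aligns all false lanes, and the FirstN(num_true0 + num_true1) blend picks between the two buffers. The helper name is hypothetical.

#include <cstddef>
#include <cstdint>
#include <cstring>

void SpliceCompressedHalves(const uint16_t compressed0[8], size_t num_true0,
                            const uint16_t compressed1[8], size_t num_true1,
                            uint16_t out[16]) {
  uint16_t all_true[16] = {};
  std::memcpy(all_true, compressed0, 8 * sizeof(uint16_t));
  std::memcpy(all_true + num_true0, compressed1, 8 * sizeof(uint16_t));

  uint16_t all_false[16] = {};
  std::memcpy(all_false + 8, compressed1, 8 * sizeof(uint16_t));
  std::memcpy(all_false + num_true1, compressed0, 8 * sizeof(uint16_t));

  const size_t num_true = num_true0 + num_true1;
  for (size_t i = 0; i < 16; ++i) {
    out[i] = (i < num_true) ? all_true[i] : all_false[i];  // FirstN blend
  }
}
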
4943 
4944 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4945 HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
4946  const Full256<T> d;
4947  const Repartition<uint32_t, decltype(d)> du32;
4948 
4949  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
4950  const auto indices = IndicesFromNotBits(d, mask_bits);
4951  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
4952 }
4953 
4954 // LUTs are infeasible for 2^16 possible masks, so splice together two
4955 // half-vector Compress.
4956 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4957 HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
4958  // Compress ensures only the lower 16 bits are set, so flip those.
4959  return Compress(v, mask_bits ^ 0xFFFF);
4960 }
4961 
4962 } // namespace detail
4963 
4964 template <typename T>
4965 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
4966  return detail::Compress(v, detail::BitsFromMask(m));
4967 }
4968 
4969 template <typename T>
4970 HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
4971  return detail::CompressNot(v, detail::BitsFromMask(m));
4972 }
4973 
4974 HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
4975  Mask256<uint64_t> mask) {
4976  return CompressNot(v, mask);
4977 }
4978 
4979 template <typename T>
4980 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4981  constexpr size_t N = 32 / sizeof(T);
4982  constexpr size_t kNumBytes = (N + 7) / 8;
4983 
4984  uint64_t mask_bits = 0;
4985  CopyBytes<kNumBytes>(bits, &mask_bits);
4986 
4987  if (N < 8) {
4988  mask_bits &= (1ull << N) - 1;
4989  }
4990 
4991  return detail::Compress(v, mask_bits);
4992 }
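
Usage sketch (not part of the header), assuming a build whose static target provides 256-bit vectors so that Vec256/Full256 exist: keep only the lanes whose bit is set in a caller-provided bit array (bit 0 = lane 0). The function name and the 0x55 pattern are illustrative.

#include <cstdint>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec256<float> KeepEvenLanes(hn::Vec256<float> v) {
  const uint8_t bits[1] = {0x55};  // 8 float lanes -> 1 byte; selects 0,2,4,6
  return hn::CompressBits(v, bits);
}
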
4993 
4994 // ------------------------------ CompressStore, CompressBitsStore
4995 
4996 template <typename T>
4997 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4998  T* HWY_RESTRICT unaligned) {
4999  const uint64_t mask_bits = detail::BitsFromMask(m);
5000  const size_t count = PopCount(mask_bits);
5001  StoreU(detail::Compress(v, mask_bits), d, unaligned);
5002  // Workaround for MSAN not marking output as initialized (b/233326619)
5003 #if HWY_IS_MSAN
5004  __msan_unpoison(unaligned, count * sizeof(T));
5005 #endif
5006  return count;
5007 }
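
Usage sketch (not part of the header), same assumptions as above: copy the positive elements of one 8-lane vector to the front of `out` and return how many were written. Per the code above, CompressStore writes a full vector via StoreU, so `out` must have room for all 8 lanes even when the count is smaller.

#include <cstddef>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

size_t StorePositive(const float* HWY_RESTRICT in, float* HWY_RESTRICT out) {
  const hn::Full256<float> d;
  const auto v = hn::LoadU(d, in);         // 8 floats
  const auto m = v > hn::Zero(d);          // mask of positive lanes
  return hn::CompressStore(v, m, d, out);  // returns PopCount of the mask
}
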
5008 
5009 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
5010 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
5011  T* HWY_RESTRICT unaligned) {
5012  const uint64_t mask_bits = detail::BitsFromMask(m);
5013  const size_t count = PopCount(mask_bits);
5014  BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
5015  // Workaround for MSAN not marking output as initialized (b/233326619)
5016 #if HWY_IS_MSAN
5017  __msan_unpoison(unaligned, count * sizeof(T));
5018 #endif
5019  return count;
5020 }
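
Usage sketch (not part of the header): per the code above, the blended variant only modifies the first `count` output lanes, which makes it the safer choice when appending near the end of an output buffer; otherwise it behaves like CompressStore.

#include <cstddef>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

size_t StoreNegative(const float* HWY_RESTRICT in, float* HWY_RESTRICT out) {
  const hn::Full256<float> d;
  const auto v = hn::LoadU(d, in);
  const auto m = v < hn::Zero(d);  // mask of negative lanes
  return hn::CompressBlendedStore(v, m, d, out);  // out[count..) untouched
}
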
5021 
5022 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5023 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
5024  T* HWY_RESTRICT unaligned) {
5025  const uint64_t mask_bits = detail::BitsFromMask(m);
5026  const size_t count = PopCount(mask_bits);
5027  const Vec256<T> compressed = detail::Compress(v, mask_bits);
5028 
5029 #if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN
5030  // BlendedStore tests mask for each lane, but we know that the mask is
5031  // FirstN, so we can just copy.
5032  alignas(32) T buf[16];
5033  Store(compressed, d, buf);
5034  memcpy(unaligned, buf, count * sizeof(T));
5035 #else
5036  BlendedStore(compressed, FirstN(d, count), d, unaligned);
5037 #endif
5038  return count;
5039 }
5040 
5041 template <typename T>
5042 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
5043  Full256<T> d, T* HWY_RESTRICT unaligned) {
5044  constexpr size_t N = 32 / sizeof(T);
5045  constexpr size_t kNumBytes = (N + 7) / 8;
5046 
5047  uint64_t mask_bits = 0;
5048  CopyBytes<kNumBytes>(bits, &mask_bits);
5049 
5050  if (N < 8) {
5051  mask_bits &= (1ull << N) - 1;
5052  }
5053  const size_t count = PopCount(mask_bits);
5054 
5055  StoreU(detail::Compress(v, mask_bits), d, unaligned);
5056  // Workaround for MSAN not marking output as initialized (b/233326619)
5057 #if HWY_IS_MSAN
5058  __msan_unpoison(unaligned, count * sizeof(T));
5059 #endif
5060  return count;
5061 }
5062 
5063 #endif // HWY_TARGET <= HWY_AVX3
5064 
5065 // ------------------------------ LoadInterleaved3/4
5066 
5067 // Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
5068 
5069 namespace detail {
5070 
5071 // Input:
5072 // 1 0 (<- first block of unaligned)
5073 // 3 2
5074 // 5 4
5075 // Output:
5076 // 3 0
5077 // 4 1
5078 // 5 2
5079 template <typename T>
5080 HWY_API void LoadTransposedBlocks3(Full256<T> d,
5081  const T* HWY_RESTRICT unaligned,
5082  Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
5083  constexpr size_t N = 32 / sizeof(T);
5084  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N); // 1 0
5085  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
5086  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
5087 
5088  A = ConcatUpperLower(d, v32, v10);
5089  B = ConcatLowerUpper(d, v54, v10);
5090  C = ConcatUpperLower(d, v54, v32);
5091 }
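
Usage sketch (not part of the header), assuming 256-bit vectors are available: the public LoadInterleaved3 (from generic_ops) is built on this overload and deinterleaves triples, e.g. packed r,g,b components, into three vectors.

#include <cstdint>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void DeinterleaveRGB(const uint32_t* HWY_RESTRICT rgb,  // r0 g0 b0 r1 g1 b1 ..
                     hn::Vec256<uint32_t>& r, hn::Vec256<uint32_t>& g,
                     hn::Vec256<uint32_t>& b) {
  const hn::Full256<uint32_t> d;
  hn::LoadInterleaved3(d, rgb, r, g, b);  // reads 3 * 8 = 24 values
}
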
5092 
5093 // Input (128-bit blocks):
5094 // 1 0 (first block of unaligned)
5095 // 3 2
5096 // 5 4
5097 // 7 6
5098 // Output:
5099 // 4 0 (LSB of A)
5100 // 5 1
5101 // 6 2
5102 // 7 3
5103 template <typename T>
5104 HWY_API void LoadTransposedBlocks4(Full256<T> d,
5105  const T* HWY_RESTRICT unaligned,
5106  Vec256<T>& A, Vec256<T>& B, Vec256<T>& C,
5107  Vec256<T>& D) {
5108  constexpr size_t N = 32 / sizeof(T);
5109  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);
5110  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
5111  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
5112  const Vec256<T> v76 = LoadU(d, unaligned + 3 * N);
5113 
5114  A = ConcatLowerLower(d, v54, v10);
5115  B = ConcatUpperUpper(d, v54, v10);
5116  C = ConcatLowerLower(d, v76, v32);
5117  D = ConcatUpperUpper(d, v76, v32);
5118 }
5119 
5120 } // namespace detail
5121 
5122 // ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
5123 
5124 // Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
5125 
5126 namespace detail {
5127 
5128 // Input (128-bit blocks):
5129 // 2 0 (LSB of i)
5130 // 3 1
5131 // Output:
5132 // 1 0
5133 // 3 2
5134 template <typename T>
5135 HWY_API void StoreTransposedBlocks2(const Vec256<T> i, const Vec256<T> j,
5136  const Full256<T> d,
5137  T* HWY_RESTRICT unaligned) {
5138  constexpr size_t N = 32 / sizeof(T);
5139  const auto out0 = ConcatLowerLower(d, j, i);
5140  const auto out1 = ConcatUpperUpper(d, j, i);
5141  StoreU(out0, d, unaligned + 0 * N);
5142  StoreU(out1, d, unaligned + 1 * N);
5143 }
5144 
5145 // Input (128-bit blocks):
5146 // 3 0 (LSB of i)
5147 // 4 1
5148 // 5 2
5149 // Output:
5150 // 1 0
5151 // 3 2
5152 // 5 4
5153 template <typename T>
5154 HWY_API void StoreTransposedBlocks3(const Vec256<T> i, const Vec256<T> j,
5155  const Vec256<T> k, Full256<T> d,
5156  T* HWY_RESTRICT unaligned) {
5157  constexpr size_t N = 32 / sizeof(T);
5158  const auto out0 = ConcatLowerLower(d, j, i);
5159  const auto out1 = ConcatUpperLower(d, i, k);
5160  const auto out2 = ConcatUpperUpper(d, k, j);
5161  StoreU(out0, d, unaligned + 0 * N);
5162  StoreU(out1, d, unaligned + 1 * N);
5163  StoreU(out2, d, unaligned + 2 * N);
5164 }
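
Usage sketch (not part of the header): the inverse direction, using the public StoreInterleaved3 built on this overload to write r,g,b vectors back out in interleaved order.

#include <cstdint>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void InterleaveRGB(hn::Vec256<uint32_t> r, hn::Vec256<uint32_t> g,
                   hn::Vec256<uint32_t> b, uint32_t* HWY_RESTRICT rgb) {
  const hn::Full256<uint32_t> d;
  hn::StoreInterleaved3(r, g, b, d, rgb);  // writes 3 * 8 = 24 values
}
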
5165 
5166 // Input (128-bit blocks):
5167 // 4 0 (LSB of i)
5168 // 5 1
5169 // 6 2
5170 // 7 3
5171 // Output:
5172 // 1 0
5173 // 3 2
5174 // 5 4
5175 // 7 6
5175 template <typename T>
5176 HWY_API void StoreTransposedBlocks4(const Vec256<T> i, const Vec256<T> j,
5177  const Vec256<T> k, const Vec256<T> l,
5179  Full256<T> d, T* HWY_RESTRICT unaligned) {
5180  constexpr size_t N = 32 / sizeof(T);
5181  // Write lower halves, then upper.
5182  const auto out0 = ConcatLowerLower(d, j, i);
5183  const auto out1 = ConcatLowerLower(d, l, k);
5184  StoreU(out0, d, unaligned + 0 * N);
5185  StoreU(out1, d, unaligned + 1 * N);
5186  const auto out2 = ConcatUpperUpper(d, j, i);
5187  const auto out3 = ConcatUpperUpper(d, l, k);
5188  StoreU(out2, d, unaligned + 2 * N);
5189  StoreU(out3, d, unaligned + 3 * N);
5190 }
5191 
5192 } // namespace detail
5193 
5194 // ------------------------------ Reductions
5195 
5196 namespace detail {
5197 
5198 // Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block.
5199 // Same logic as x86/128.h, but with Vec256 arguments.
5200 template <typename T>
5201 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
5202  const Vec256<T> v3210) {
5203  const auto v1032 = Shuffle1032(v3210);
5204  const auto v31_20_31_20 = v3210 + v1032;
5205  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5206  return v20_31_20_31 + v31_20_31_20;
5207 }
5208 template <typename T>
5209 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
5210  const Vec256<T> v3210) {
5211  const auto v1032 = Shuffle1032(v3210);
5212  const auto v31_20_31_20 = Min(v3210, v1032);
5213  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5214  return Min(v20_31_20_31, v31_20_31_20);
5215 }
5216 template <typename T>
5217 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
5218  const Vec256<T> v3210) {
5219  const auto v1032 = Shuffle1032(v3210);
5220  const auto v31_20_31_20 = Max(v3210, v1032);
5221  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5222  return Max(v20_31_20_31, v31_20_31_20);
5223 }
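
Illustrative sketch (not part of the header): a scalar model of the SizeTag<4> reduction above. Two shuffle+add (or Min/Max) steps reduce the four 32-bit lanes of a replicated 128-bit block to the same result in every lane; Shuffle1032 swaps the 64-bit halves and Shuffle0321 rotates the lanes. The helper name is hypothetical.

#include <cstdint>

void SumOfLanes4x32(const uint32_t v3210[4], uint32_t out[4]) {
  const uint32_t v1032[4] = {v3210[2], v3210[3], v3210[0], v3210[1]};
  uint32_t s[4];  // {0+2, 1+3, 2+0, 3+1}
  for (int i = 0; i < 4; ++i) s[i] = v3210[i] + v1032[i];
  const uint32_t rot[4] = {s[1], s[2], s[3], s[0]};    // Shuffle0321
  for (int i = 0; i < 4; ++i) out[i] = s[i] + rot[i];  // total in every lane
}
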
5224 
5225 template <typename T>
5226 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
5227  const Vec256<T> v10) {
5228  const auto v01 = Shuffle01(v10);
5229  return v10 + v01;
5230 }
5231 template <typename T>
5232 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
5233  const Vec256<T> v10) {
5234  const auto v01 = Shuffle01(v10);
5235  return Min(v10, v01);
5236 }
5237 template <typename T>
5238 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
5239  const Vec256<T> v10) {
5240  const auto v01 = Shuffle01(v10);
5241  return Max(v10, v01);
5242 }
5243 
5244 // u16/i16
5245 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5246 HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
5247  const Repartition<int32_t, Full256<T>> d32;
5248  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5249  const auto odd = ShiftRight<16>(BitCast(d32, v));
5250  const auto min = MinOfLanes(d32, Min(even, odd));
5251  // Also broadcast into odd lanes.
5252  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
5253 }
5254 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5255 HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
5256  const Repartition<int32_t, Full256<T>> d32;
5257  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5258  const auto odd = ShiftRight<16>(BitCast(d32, v));
5259  const auto max = MaxOfLanes(d32, Max(even, odd));
5260  // Also broadcast into odd lanes.
5261  return BitCast(Full256<T>(), Or(max, ShiftLeft<16>(max)));
5262 }
5263 
5264 } // namespace detail
5265 
5266 // Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
5267 template <typename T>
5268 HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
5269  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5270  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
5271 }
5272 template <typename T>
5273 HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
5274  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5275  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
5276 }
5277 template <typename T>
5278 HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
5279  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5280  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
5281 }
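
Usage sketch (not part of the header), assuming 256-bit vectors are available: horizontal sum of 8 floats. SumOfLanes broadcasts the total to every lane and GetLane extracts it.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

float HorizontalSum(const float* HWY_RESTRICT p) {  // p: at least 8 floats
  const hn::Full256<float> d;
  return hn::GetLane(hn::SumOfLanes(d, hn::LoadU(d, p)));
}
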
5282 
5283 // NOLINTNEXTLINE(google-readability-namespace-comments)
5284 } // namespace HWY_NAMESPACE
5285 } // namespace hwy