x86_512-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 512-bit AVX512 vectors and operations.
17 // External include guard in highway.h - see comment there.
18 
19 // WARNING: most operations do not cross 128-bit block boundaries. In
20 // particular, "Broadcast", pack and zip behavior may be surprising.
21 
22 // Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
23 #include "hwy/base.h"
24 
25 // Avoid uninitialized warnings in GCC's avx512fintrin.h - see
26 // https://github.com/google/highway/issues/710)
27 HWY_DIAGNOSTICS(push)
28 #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
29 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
30 HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
31 #endif
32 
33 #include <immintrin.h> // AVX2+
34 
35 #if HWY_COMPILER_CLANGCL
36 // Including <immintrin.h> should be enough, but Clang's headers helpfully skip
37 // including these headers when _MSC_VER is defined, like when using clang-cl.
38 // Include these directly here.
39 // clang-format off
40 #include <smmintrin.h>
41 
42 #include <avxintrin.h>
43 #include <avx2intrin.h>
44 #include <f16cintrin.h>
45 #include <fmaintrin.h>
46 
47 #include <avx512fintrin.h>
48 #include <avx512vlintrin.h>
49 #include <avx512bwintrin.h>
50 #include <avx512dqintrin.h>
51 #include <avx512vlbwintrin.h>
52 #include <avx512vldqintrin.h>
53 #include <avx512bitalgintrin.h>
54 #include <avx512vlbitalgintrin.h>
55 #include <avx512vpopcntdqintrin.h>
56 #include <avx512vpopcntdqvlintrin.h>
57 // clang-format on
58 #endif // HWY_COMPILER_CLANGCL
59 
60 #include <stddef.h>
61 #include <stdint.h>
62 
63 #if HWY_IS_MSAN
64 #include <sanitizer/msan_interface.h>
65 #endif
66 
67 // For half-width vectors. Already includes base.h and shared-inl.h.
68 #include "hwy/ops/x86_256-inl.h"
69 
71 namespace hwy {
72 namespace HWY_NAMESPACE {
73 
74 namespace detail {
75 
76 template <typename T>
77 struct Raw512 {
78  using type = __m512i;
79 };
80 template <>
81 struct Raw512<float> {
82  using type = __m512;
83 };
84 template <>
85 struct Raw512<double> {
86  using type = __m512d;
87 };
88 
89 // Template arg: sizeof(lane type)
90 template <size_t size>
91 struct RawMask512 {};
92 template <>
93 struct RawMask512<1> {
94  using type = __mmask64;
95 };
96 template <>
97 struct RawMask512<2> {
98  using type = __mmask32;
99 };
100 template <>
101 struct RawMask512<4> {
102  using type = __mmask16;
103 };
104 template <>
105 struct RawMask512<8> {
106  using type = __mmask8;
107 };
108 
109 } // namespace detail
110 
111 template <typename T>
112 class Vec512 {
113  using Raw = typename detail::Raw512<T>::type;
114 
115  public:
116  // Compound assignment. Only usable if there is a corresponding non-member
117  // binary operator overload. For example, only f32 and f64 support division.
118  HWY_INLINE Vec512& operator*=(const Vec512 other) {
119  return *this = (*this * other);
120  }
121  HWY_INLINE Vec512& operator/=(const Vec512 other) {
122  return *this = (*this / other);
123  }
124  HWY_INLINE Vec512& operator+=(const Vec512 other) {
125  return *this = (*this + other);
126  }
127  HWY_INLINE Vec512& operator-=(const Vec512 other) {
128  return *this = (*this - other);
129  }
130  HWY_INLINE Vec512& operator&=(const Vec512 other) {
131  return *this = (*this & other);
132  }
133  HWY_INLINE Vec512& operator|=(const Vec512 other) {
134  return *this = (*this | other);
135  }
136  HWY_INLINE Vec512& operator^=(const Vec512 other) {
137  return *this = (*this ^ other);
138  }
139 
140  Raw raw;
141 };
142 
143 // Mask register: one bit per lane.
144 template <typename T>
145 struct Mask512 {
146  typename detail::RawMask512<sizeof(T)>::type raw;
147 };
148 
149 // ------------------------------ BitCast
150 
151 namespace detail {
152 
153 HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
154 HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); }
155 HWY_INLINE __m512i BitCastToInteger(__m512d v) {
156  return _mm512_castpd_si512(v);
157 }
158 
159 template <typename T>
160 HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
161  return Vec512<uint8_t>{BitCastToInteger(v.raw)};
162 }
163 
164 // Cannot rely on function overloading because return types differ.
165 template <typename T>
166 struct BitCastFromInteger512 {
167  HWY_INLINE __m512i operator()(__m512i v) { return v; }
168 };
169 template <>
170 struct BitCastFromInteger512<float> {
171  HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
172 };
173 template <>
174 struct BitCastFromInteger512<double> {
175  HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
176 };
177 
178 template <typename T>
179 HWY_INLINE Vec512<T> BitCastFromByte(Full512<T> /* tag */, Vec512<uint8_t> v) {
180  return Vec512<T>{BitCastFromInteger512<T>()(v.raw)};
181 }
182 
183 } // namespace detail
184 
185 template <typename T, typename FromT>
186 HWY_API Vec512<T> BitCast(Full512<T> d, Vec512<FromT> v) {
187  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
188 }
189 
190 // ------------------------------ Set
191 
192 // Returns an all-zero vector.
193 template <typename T>
194 HWY_API Vec512<T> Zero(Full512<T> /* tag */) {
195  return Vec512<T>{_mm512_setzero_si512()};
196 }
197 HWY_API Vec512<float> Zero(Full512<float> /* tag */) {
198  return Vec512<float>{_mm512_setzero_ps()};
199 }
200 HWY_API Vec512<double> Zero(Full512<double> /* tag */) {
201  return Vec512<double>{_mm512_setzero_pd()};
202 }
203 
204 // Returns a vector with all lanes set to "t".
205 HWY_API Vec512<uint8_t> Set(Full512<uint8_t> /* tag */, const uint8_t t) {
206  return Vec512<uint8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
207 }
208 HWY_API Vec512<uint16_t> Set(Full512<uint16_t> /* tag */, const uint16_t t) {
209  return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
210 }
211 HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
212  return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
213 }
214 HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
215  return Vec512<uint64_t>{
216  _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
217 }
218 HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
219  return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
220 }
221 HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
222  return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
223 }
224 HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
225  return Vec512<int32_t>{_mm512_set1_epi32(t)};
226 }
227 HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
228  return Vec512<int64_t>{
229  _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
230 }
231 HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
232  return Vec512<float>{_mm512_set1_ps(t)};
233 }
234 HWY_API Vec512<double> Set(Full512<double> /* tag */, const double t) {
235  return Vec512<double>{_mm512_set1_pd(t)};
236 }
237 
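// Illustrative sketch (not from the original header): constructing constants,
// assuming the enclosing hwy::HWY_NAMESPACE scope.
//   const Full512<float> df;
//   const auto zero = Zero(df);                             // 16 x 0.0f
//   const auto ones = Set(df, 1.0f);                        // 16 x 1.0f
//   const auto bits = BitCast(Full512<uint32_t>(), ones);   // 16 x 0x3F800000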
238 HWY_DIAGNOSTICS(push)
239 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
240 
241 // Returns a vector with uninitialized elements.
242 template <typename T>
243 HWY_API Vec512<T> Undefined(Full512<T> /* tag */) {
244  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
245  // generate an XOR instruction.
246  return Vec512<T>{_mm512_undefined_epi32()};
247 }
248 HWY_API Vec512<float> Undefined(Full512<float> /* tag */) {
249  return Vec512<float>{_mm512_undefined_ps()};
250 }
251 HWY_API Vec512<double> Undefined(Full512<double> /* tag */) {
252  return Vec512<double>{_mm512_undefined_pd()};
253 }
254 
255 HWY_DIAGNOSTICS(pop)
256 
257 // ================================================== LOGICAL
258 
259 // ------------------------------ Not
260 
261 template <typename T>
262 HWY_API Vec512<T> Not(const Vec512<T> v) {
263  using TU = MakeUnsigned<T>;
264  const __m512i vu = BitCast(Full512<TU>(), v).raw;
265  return BitCast(Full512<T>(),
266  Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
267 }
268 
269 // ------------------------------ And
270 
271 template <typename T>
272 HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
273  return Vec512<T>{_mm512_and_si512(a.raw, b.raw)};
274 }
275 
276 HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
277  return Vec512<float>{_mm512_and_ps(a.raw, b.raw)};
278 }
279 HWY_API Vec512<double> And(const Vec512<double> a, const Vec512<double> b) {
280  return Vec512<double>{_mm512_and_pd(a.raw, b.raw)};
281 }
282 
283 // ------------------------------ AndNot
284 
285 // Returns ~not_mask & mask.
286 template <typename T>
287 HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
288  return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)};
289 }
290 HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
291  const Vec512<float> mask) {
292  return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)};
293 }
294 HWY_API Vec512<double> AndNot(const Vec512<double> not_mask,
295  const Vec512<double> mask) {
296  return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)};
297 }
298 
299 // ------------------------------ Or
300 
301 template <typename T>
302 HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
303  return Vec512<T>{_mm512_or_si512(a.raw, b.raw)};
304 }
305 
306 HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
307  return Vec512<float>{_mm512_or_ps(a.raw, b.raw)};
308 }
309 HWY_API Vec512<double> Or(const Vec512<double> a, const Vec512<double> b) {
310  return Vec512<double>{_mm512_or_pd(a.raw, b.raw)};
311 }
312 
313 // ------------------------------ Xor
314 
315 template <typename T>
316 HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
317  return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)};
318 }
319 
320 HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
321  return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)};
322 }
323 HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
324  return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
325 }
326 
327 // ------------------------------ Or3
328 
329 template <typename T>
330 HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
331  const Full512<T> d;
332  const RebindToUnsigned<decltype(d)> du;
333  using VU = VFromD<decltype(du)>;
334  const __m512i ret = _mm512_ternarylogic_epi64(
335  BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
336  return BitCast(d, VU{ret});
337 }
338 
339 // ------------------------------ OrAnd
340 
341 template <typename T>
342 HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
343  const Full512<T> d;
344  const RebindToUnsigned<decltype(d)> du;
345  using VU = VFromD<decltype(du)>;
346  const __m512i ret = _mm512_ternarylogic_epi64(
347  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
348  return BitCast(d, VU{ret});
349 }
350 
351 // ------------------------------ IfVecThenElse
352 
353 template <typename T>
354 HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
355  const Full512<T> d;
356  const RebindToUnsigned<decltype(d)> du;
357  using VU = VFromD<decltype(du)>;
358  return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
359  BitCast(du, yes).raw,
360  BitCast(du, no).raw, 0xCA)});
361 }
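// Illustrative example (not from the original header): Or3, OrAnd and
// IfVecThenElse each compile to a single VPTERNLOG instruction. Per lane:
//   Or3(a, b, c);                  // a | b | c
//   OrAnd(o, a1, a2);              // o | (a1 & a2)
//   IfVecThenElse(mask, yes, no);  // (mask & yes) | (~mask & no)
// where a, b, c, o, a1, a2, mask, yes, no are hypothetical Vec512<uint32_t>.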
362 
363 // ------------------------------ Operator overloads (internal-only if float)
364 
365 template <typename T>
366 HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) {
367  return And(a, b);
368 }
369 
370 template <typename T>
371 HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) {
372  return Or(a, b);
373 }
374 
375 template <typename T>
376 HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) {
377  return Xor(a, b);
378 }
379 
380 // ------------------------------ PopulationCount
381 
382 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
383 #if HWY_TARGET == HWY_AVX3_DL
384 
385 #ifdef HWY_NATIVE_POPCNT
386 #undef HWY_NATIVE_POPCNT
387 #else
388 #define HWY_NATIVE_POPCNT
389 #endif
390 
391 namespace detail {
392 
393 template <typename T>
394 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) {
395  return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
396 }
397 template <typename T>
398 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) {
399  return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
400 }
401 template <typename T>
402 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) {
403  return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
404 }
405 template <typename T>
406 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) {
407  return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
408 }
409 
410 } // namespace detail
411 
412 template <typename T>
413 HWY_API Vec512<T> PopulationCount(Vec512<T> v) {
414  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
415 }
416 
417 #endif // HWY_TARGET == HWY_AVX3_DL
418 
419 // ================================================== SIGN
420 
421 // ------------------------------ CopySign
422 
423 template <typename T>
424 HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) {
425  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
426 
427  const Full512<T> d;
428  const auto msb = SignBit(d);
429 
430  const Rebind<MakeUnsigned<T>, decltype(d)> du;
431  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
432  // 0 0 0 | 0
433  // 0 0 1 | 0
434  // 0 1 0 | 1
435  // 0 1 1 | 1
436  // 1 0 0 | 0
437  // 1 0 1 | 1
438  // 1 1 0 | 0
439  // 1 1 1 | 1
440  // The lane size does not matter because we are not using predication.
441  const __m512i out = _mm512_ternarylogic_epi32(
442  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
443  return BitCast(d, decltype(Zero(du)){out});
444 }
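// Illustrative example (not from the original header): the 0xAC truth table
// above selects the sign bit from "sign" and all other bits from "magn":
//   const Full512<double> d;
//   const auto magn = Set(d, 2.5);
//   const auto sign = Set(d, -1.0);
//   const auto r = CopySign(magn, sign);  // every lane is -2.5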
445 
446 template <typename T>
447 HWY_API Vec512<T> CopySignToAbs(const Vec512<T> abs, const Vec512<T> sign) {
448  // AVX3 can also handle abs < 0, so no extra action needed.
449  return CopySign(abs, sign);
450 }
451 
452 // ================================================== MASK
453 
454 // ------------------------------ FirstN
455 
456 // Possibilities for constructing a bitmask of N ones:
457 // - kshift* only consider the lowest byte of the shift count, so they would
458 // not correctly handle large n.
459 // - Scalar shifts >= 64 are UB.
460 // - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
461 // we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
462 
463 #if HWY_ARCH_X86_32
464 namespace detail {
465 
466 // 32 bit mask is sufficient for lane size >= 2.
467 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
468 HWY_INLINE Mask512<T> FirstN(size_t n) {
469  Mask512<T> m;
470  const uint32_t all = ~uint32_t(0);
471  // BZHI only looks at the lower 8 bits of n!
472  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
473  return m;
474 }
475 
476 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
477 HWY_INLINE Mask512<T> FirstN(size_t n) {
478  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
479  return Mask512<T>{static_cast<__mmask64>(bits)};
480 }
481 
482 } // namespace detail
483 #endif // HWY_ARCH_X86_32
484 
485 template <typename T>
486 HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
487 #if HWY_ARCH_X86_64
488  Mask512<T> m;
489  const uint64_t all = ~uint64_t(0);
490  // BZHI only looks at the lower 8 bits of n!
491  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
492  return m;
493 #else
494  return detail::FirstN<T>(n);
495 #endif // HWY_ARCH_X86_64
496 }
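// Illustrative sketch (not from the original header): FirstN is the usual way
// to handle a loop remainder of "count" elements, together with MaskedLoad and
// BlendedStore defined further below. "in", "out" and "count" are hypothetical
// caller-provided values.
//   const Full512<float> d;
//   const size_t N = Lanes(d);
//   for (size_t i = 0; i < count; i += N) {
//     const auto m = FirstN(d, count - i);      // all-true except at the end
//     const auto v = MaskedLoad(m, d, in + i);  // inactive lanes are zero
//     BlendedStore(v * v, m, d, out + i);       // only active lanes written
//   }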
497 
498 // ------------------------------ IfThenElse
499 
500 // Returns mask ? yes : no.
501 
502 namespace detail {
503 
504 // Templates for signed/unsigned integer of a particular size.
505 template <typename T>
507  const Mask512<T> mask, const Vec512<T> yes,
508  const Vec512<T> no) {
509  return Vec512<T>{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
510 }
511 template <typename T>
513  const Mask512<T> mask, const Vec512<T> yes,
514  const Vec512<T> no) {
515  return Vec512<T>{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
516 }
517 template <typename T>
519  const Mask512<T> mask, const Vec512<T> yes,
520  const Vec512<T> no) {
521  return Vec512<T>{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
522 }
523 template <typename T>
525  const Mask512<T> mask, const Vec512<T> yes,
526  const Vec512<T> no) {
527  return Vec512<T>{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
528 }
529 
530 } // namespace detail
531 
532 template <typename T>
534  const Vec512<T> no) {
535  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
536 }
538  const Vec512<float> yes,
539  const Vec512<float> no) {
540  return Vec512<float>{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)};
541 }
543  const Vec512<double> yes,
544  const Vec512<double> no) {
545  return Vec512<double>{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)};
546 }
547 
548 namespace detail {
549 
550 template <typename T>
552  const Mask512<T> mask,
553  const Vec512<T> yes) {
554  return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)};
555 }
556 template <typename T>
558  const Mask512<T> mask,
559  const Vec512<T> yes) {
560  return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)};
561 }
562 template <typename T>
564  const Mask512<T> mask,
565  const Vec512<T> yes) {
566  return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)};
567 }
568 template <typename T>
570  const Mask512<T> mask,
571  const Vec512<T> yes) {
572  return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)};
573 }
574 
575 } // namespace detail
576 
577 template <typename T>
579  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
580 }
582  const Vec512<float> yes) {
583  return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)};
584 }
586  const Vec512<double> yes) {
587  return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)};
588 }
589 
590 namespace detail {
591 
592 template <typename T>
594  const Mask512<T> mask, const Vec512<T> no) {
595  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
596  return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
597 }
598 template <typename T>
600  const Mask512<T> mask, const Vec512<T> no) {
601  return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
602 }
603 template <typename T>
605  const Mask512<T> mask, const Vec512<T> no) {
606  return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
607 }
608 template <typename T>
610  const Mask512<T> mask, const Vec512<T> no) {
611  return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
612 }
613 
614 } // namespace detail
615 
616 template <typename T>
618  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
619 }
621  const Vec512<float> no) {
622  return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
623 }
625  const Vec512<double> no) {
626  return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
627 }
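// Illustrative example (not from the original header): for a mask m and
// vectors yes/no, the three variants relate as follows, per lane:
//   IfThenElse(m, yes, no)   // m ? yes : no
//   IfThenElseZero(m, yes)   // m ? yes : 0
//   IfThenZeroElse(m, no)    // m ? 0   : no
// e.g. clamping negatives to zero: IfThenZeroElse(v < Zero(d), v).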
628 
629 template <typename T>
631  static_assert(IsSigned<T>(), "Only works for signed/float");
632  // AVX3 MaskFromVec only looks at the MSB
633  return IfThenElse(MaskFromVec(v), yes, no);
634 }
635 
636 template <typename T, HWY_IF_FLOAT(T)>
638  // AVX3 MaskFromVec only looks at the MSB
639  return IfThenZeroElse(MaskFromVec(v), v);
640 }
641 
642 // ================================================== ARITHMETIC
643 
644 // ------------------------------ Addition
645 
646 // Unsigned
648  const Vec512<uint8_t> b) {
649  return Vec512<uint8_t>{_mm512_add_epi8(a.raw, b.raw)};
650 }
652  const Vec512<uint16_t> b) {
653  return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)};
654 }
656  const Vec512<uint32_t> b) {
657  return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)};
658 }
660  const Vec512<uint64_t> b) {
661  return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)};
662 }
663 
664 // Signed
666  const Vec512<int8_t> b) {
667  return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)};
668 }
670  const Vec512<int16_t> b) {
671  return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)};
672 }
674  const Vec512<int32_t> b) {
675  return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)};
676 }
678  const Vec512<int64_t> b) {
679  return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)};
680 }
681 
682 // Float
684  return Vec512<float>{_mm512_add_ps(a.raw, b.raw)};
685 }
687  const Vec512<double> b) {
688  return Vec512<double>{_mm512_add_pd(a.raw, b.raw)};
689 }
690 
691 // ------------------------------ Subtraction
692 
693 // Unsigned
695  const Vec512<uint8_t> b) {
696  return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)};
697 }
699  const Vec512<uint16_t> b) {
700  return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)};
701 }
703  const Vec512<uint32_t> b) {
704  return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)};
705 }
707  const Vec512<uint64_t> b) {
708  return Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)};
709 }
710 
711 // Signed
713  const Vec512<int8_t> b) {
714  return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)};
715 }
717  const Vec512<int16_t> b) {
718  return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)};
719 }
721  const Vec512<int32_t> b) {
722  return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)};
723 }
725  const Vec512<int64_t> b) {
726  return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)};
727 }
728 
729 // Float
731  return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)};
732 }
734  const Vec512<double> b) {
735  return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
736 }
737 
738 // ------------------------------ SumsOf8
739 HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) {
740  return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
741 }
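// Illustrative example (not from the original header): SumsOf8 reduces each
// group of 8 consecutive u8 lanes into one u64 lane (VPSADBW against zero):
//   const Full512<uint8_t> d8;
//   const auto v = Set(d8, uint8_t{1});  // 64 x 1
//   const auto sums = SumsOf8(v);        // 8 x u64, each lane equal to 8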
742 
743 // ------------------------------ SaturatedAdd
744 
745 // Returns a + b clamped to the destination range.
746 
747 // Unsigned
749  const Vec512<uint8_t> b) {
750  return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)};
751 }
753  const Vec512<uint16_t> b) {
754  return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)};
755 }
756 
757 // Signed
759  const Vec512<int8_t> b) {
760  return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)};
761 }
763  const Vec512<int16_t> b) {
764  return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
765 }
766 
767 // ------------------------------ SaturatedSub
768 
769 // Returns a - b clamped to the destination range.
770 
771 // Unsigned
773  const Vec512<uint8_t> b) {
774  return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)};
775 }
777  const Vec512<uint16_t> b) {
778  return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)};
779 }
780 
781 // Signed
783  const Vec512<int8_t> b) {
784  return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)};
785 }
787  const Vec512<int16_t> b) {
788  return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)};
789 }
790 
791 // ------------------------------ Average
792 
793 // Returns (a + b + 1) / 2
794 
795 // Unsigned
797  const Vec512<uint8_t> b) {
798  return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)};
799 }
801  const Vec512<uint16_t> b) {
802  return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)};
803 }
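// Illustrative example (not from the original header): the +1 rounds up, e.g.
//   const Full512<uint8_t> d;
//   Average(Set(d, uint8_t{2}), Set(d, uint8_t{3}));  // all lanes = (2+3+1)/2 = 3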
804 
805 // ------------------------------ Abs (Sub)
806 
807 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
809 #if HWY_COMPILER_MSVC
810  // Workaround for incorrect codegen? (untested due to internal compiler error)
811  const auto zero = Zero(Full512<int8_t>());
812  return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
813 #else
814  return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
815 #endif
816 }
818  return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
819 }
821  return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
822 }
824  return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
825 }
826 
827 // These are not native instructions; they also involve an AND with a constant.
829  return Vec512<float>{_mm512_abs_ps(v.raw)};
830 }
832  return Vec512<double>{_mm512_abs_pd(v.raw)};
833 }
834 // ------------------------------ ShiftLeft
835 
836 template <int kBits>
838  return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)};
839 }
840 
841 template <int kBits>
843  return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)};
844 }
845 
846 template <int kBits>
848  return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)};
849 }
850 
851 template <int kBits>
853  return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)};
854 }
855 
856 template <int kBits>
858  return Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)};
859 }
860 
861 template <int kBits>
863  return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
864 }
865 
866 template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
868  const Full512<T> d8;
869  const RepartitionToWide<decltype(d8)> d16;
870  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
871  return kBits == 1
872  ? (v + v)
873  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
874 }
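// Illustrative example (not from the original header): AVX-512 has no 8-bit
// shift instruction, so the overload above shifts 16-bit lanes and then masks
// off the bits that crossed into the neighboring byte:
//   const Full512<uint8_t> d;
//   ShiftLeft<3>(Set(d, uint8_t{0xFF}));  // all lanes = 0xF8, not 0xFF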
875 
876 // ------------------------------ ShiftRight
877 
878 template <int kBits>
880  return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)};
881 }
882 
883 template <int kBits>
885  return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)};
886 }
887 
888 template <int kBits>
890  return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)};
891 }
892 
893 template <int kBits>
895  const Full512<uint8_t> d8;
896  // Use raw instead of BitCast to support N=1.
897  const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
898  return shifted & Set(d8, 0xFF >> kBits);
899 }
900 
901 template <int kBits>
903  return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
904 }
905 
906 template <int kBits>
908  return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)};
909 }
910 
911 template <int kBits>
913  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
914 }
915 
916 template <int kBits>
918  const Full512<int8_t> di;
919  const Full512<uint8_t> du;
920  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
921  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
922  return (shifted ^ shifted_sign) - shifted_sign;
923 }
924 
925 // ------------------------------ RotateRight
926 
927 template <int kBits>
929  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
930  return Vec512<uint32_t>{_mm512_ror_epi32(v.raw, kBits)};
931 }
932 
933 template <int kBits>
935  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
936  return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
937 }
938 
939 // ------------------------------ ShiftLeftSame
940 
942  const int bits) {
943  return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
944 }
946  const int bits) {
947  return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
948 }
950  const int bits) {
951  return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
952 }
953 
955  return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
956 }
957 
959  return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
960 }
961 
963  return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
964 }
965 
966 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
967 HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
968  const Full512<T> d8;
969  const RepartitionToWide<decltype(d8)> d16;
970  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
971  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
972 }
973 
974 // ------------------------------ ShiftRightSame
975 
977  const int bits) {
978  return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
979 }
981  const int bits) {
982  return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
983 }
985  const int bits) {
986  return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
987 }
988 
990  const Full512<uint8_t> d8;
991  const RepartitionToWide<decltype(d8)> d16;
992  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
993  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
994 }
995 
997  const int bits) {
998  return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
999 }
1000 
1002  const int bits) {
1003  return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1004 }
1006  const int bits) {
1007  return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1008 }
1009 
1011  const Full512<int8_t> di;
1012  const Full512<uint8_t> du;
1013  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1014  const auto shifted_sign =
1015  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1016  return (shifted ^ shifted_sign) - shifted_sign;
1017 }
1018 
1019 // ------------------------------ Shl
1020 
1022  const Vec512<uint16_t> bits) {
1023  return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
1024 }
1025 
1027  const Vec512<uint32_t> bits) {
1028  return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)};
1029 }
1030 
1032  const Vec512<uint64_t> bits) {
1033  return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)};
1034 }
1035 
1036 // Signed left shift is the same as unsigned.
1037 template <typename T, HWY_IF_SIGNED(T)>
1039  const Full512<T> di;
1040  const Full512<MakeUnsigned<T>> du;
1041  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
1042 }
1043 
1044 // ------------------------------ Shr
1045 
1047  const Vec512<uint16_t> bits) {
1048  return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)};
1049 }
1050 
1052  const Vec512<uint32_t> bits) {
1053  return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)};
1054 }
1055 
1057  const Vec512<uint64_t> bits) {
1058  return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)};
1059 }
1060 
1062  const Vec512<int16_t> bits) {
1063  return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)};
1064 }
1065 
1067  const Vec512<int32_t> bits) {
1068  return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)};
1069 }
1070 
1072  const Vec512<int64_t> bits) {
1073  return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
1074 }
1075 
1076 // ------------------------------ Minimum
1077 
1078 // Unsigned
1080  return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)};
1081 }
1083  const Vec512<uint16_t> b) {
1084  return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)};
1085 }
1087  const Vec512<uint32_t> b) {
1088  return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)};
1089 }
1091  const Vec512<uint64_t> b) {
1092  return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)};
1093 }
1094 
1095 // Signed
1097  return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)};
1098 }
1100  return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)};
1101 }
1103  return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)};
1104 }
1106  return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)};
1107 }
1108 
1109 // Float
1111  return Vec512<float>{_mm512_min_ps(a.raw, b.raw)};
1112 }
1114  return Vec512<double>{_mm512_min_pd(a.raw, b.raw)};
1115 }
1116 
1117 // ------------------------------ Maximum
1118 
1119 // Unsigned
1121  return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)};
1122 }
1124  const Vec512<uint16_t> b) {
1125  return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)};
1126 }
1128  const Vec512<uint32_t> b) {
1129  return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)};
1130 }
1132  const Vec512<uint64_t> b) {
1133  return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)};
1134 }
1135 
1136 // Signed
1138  return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)};
1139 }
1141  return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)};
1142 }
1144  return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)};
1145 }
1147  return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)};
1148 }
1149 
1150 // Float
1152  return Vec512<float>{_mm512_max_ps(a.raw, b.raw)};
1153 }
1155  return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
1156 }
1157 
1158 // ------------------------------ Integer multiplication
1159 
1160 // Unsigned
1162  return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1163 }
1165  return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1166 }
1167 
1168 // Signed
1170  return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1171 }
1173  return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1174 }
1175 
1176 // Returns the upper 16 bits of a * b in each lane.
1178  return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
1179 }
1181  return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)};
1182 }
1183 
1185  return Vec512<int16_t>{_mm512_mulhrs_epi16(a.raw, b.raw)};
1186 }
1187 
1188 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
1189 // even and the upper half into its odd neighbor lane.
1191  return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)};
1192 }
1194  return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
1195 }
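// Illustrative example (not from the original header): MulEven multiplies
// lanes 0, 2, 4, ... and widens each product into the 64-bit lane formed by
// the even lane and its odd neighbor:
//   const Full512<uint32_t> d32;
//   const auto a = Set(d32, 0xFFFFFFFFu);
//   const auto b = Set(d32, 2u);
//   const auto p = MulEven(a, b);  // 8 x u64, each lane = 0x1FFFFFFFE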
1196 
1197 // ------------------------------ Neg (Sub)
1198 
1199 template <typename T, HWY_IF_FLOAT(T)>
1201  return Xor(v, SignBit(Full512<T>()));
1202 }
1203 
1204 template <typename T, HWY_IF_NOT_FLOAT(T)>
1205 HWY_API Vec512<T> Neg(const Vec512<T> v) {
1206  return Zero(Full512<T>()) - v;
1207 }
1208 
1209 // ------------------------------ Floating-point mul / div
1210 
1212  return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)};
1213 }
1215  const Vec512<double> b) {
1216  return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
1217 }
1218 
1220  return Vec512<float>{_mm512_div_ps(a.raw, b.raw)};
1221 }
1223  const Vec512<double> b) {
1224  return Vec512<double>{_mm512_div_pd(a.raw, b.raw)};
1225 }
1226 
1227 // Approximate reciprocal
1229  return Vec512<float>{_mm512_rcp14_ps(v.raw)};
1230 }
1231 
1232 // Absolute value of difference.
1234  return Abs(a - b);
1235 }
1236 
1237 // ------------------------------ Floating-point multiply-add variants
1238 
1239 // Returns mul * x + add
1241  const Vec512<float> add) {
1242  return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)};
1243 }
1245  const Vec512<double> add) {
1246  return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)};
1247 }
1248 
1249 // Returns add - mul * x
1251  const Vec512<float> add) {
1252  return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)};
1253 }
1255  const Vec512<double> x,
1256  const Vec512<double> add) {
1257  return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)};
1258 }
1259 
1260 // Returns mul * x - sub
1262  const Vec512<float> sub) {
1263  return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)};
1264 }
1266  const Vec512<double> sub) {
1267  return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)};
1268 }
1269 
1270 // Returns -mul * x - sub
1272  const Vec512<float> sub) {
1273  return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1274 }
1276  const Vec512<double> x,
1277  const Vec512<double> sub) {
1278  return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1279 }
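// Illustrative summary (not from the original header): per lane,
//   MulAdd(mul, x, add)     //  mul * x + add
//   NegMulAdd(mul, x, add)  // -mul * x + add  (i.e. add - mul * x)
//   MulSub(mul, x, sub)     //  mul * x - sub
//   NegMulSub(mul, x, sub)  // -mul * x - sub
// e.g. a dot-product accumulation step: acc = MulAdd(a, b, acc);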
1280 
1281 // ------------------------------ Floating-point square root
1282 
1283 // Full precision square root
1285  return Vec512<float>{_mm512_sqrt_ps(v.raw)};
1286 }
1288  return Vec512<double>{_mm512_sqrt_pd(v.raw)};
1289 }
1290 
1291 // Approximate reciprocal square root
1293  return Vec512<float>{_mm512_rsqrt14_ps(v.raw)};
1294 }
1295 
1296 // ------------------------------ Floating-point rounding
1297 
1298 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1299 HWY_DIAGNOSTICS(push)
1300 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1301 
1302 // Toward nearest integer, tie to even
1303 HWY_API Vec512<float> Round(const Vec512<float> v) {
1304  return Vec512<float>{_mm512_roundscale_ps(
1305  v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1306 }
1308  return Vec512<double>{_mm512_roundscale_pd(
1309  v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1310 }
1311 
1312 // Toward zero, aka truncate
1314  return Vec512<float>{
1315  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1316 }
1318  return Vec512<double>{
1319  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1320 }
1321 
1322 // Toward +infinity, aka ceiling
1324  return Vec512<float>{
1325  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1326 }
1328  return Vec512<double>{
1329  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1330 }
1331 
1332 // Toward -infinity, aka floor
1334  return Vec512<float>{
1335  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1336 }
1338  return Vec512<double>{
1339  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1340 }
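// Illustrative example (not from the original header): for a lane holding 2.5,
//   Round -> 2.0 (ties to even), Trunc -> 2.0, Ceil -> 3.0, Floor -> 2.0
// and for -2.5:
//   Round -> -2.0, Trunc -> -2.0, Ceil -> -2.0, Floor -> -3.0.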
1341 
1342 HWY_DIAGNOSTICS(pop)
1343 
1344 // ================================================== COMPARE
1345 
1346 // Comparisons set a mask bit to 1 if the condition is true, else 0.
1347 
1348 template <typename TFrom, typename TTo>
1350  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1351  return Mask512<TTo>{m.raw};
1352 }
1353 
1354 namespace detail {
1355 
1356 template <typename T>
1358  const Vec512<T> bit) {
1359  return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
1360 }
1361 template <typename T>
1363  const Vec512<T> bit) {
1364  return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
1365 }
1366 template <typename T>
1368  const Vec512<T> bit) {
1369  return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
1370 }
1371 template <typename T>
1373  const Vec512<T> bit) {
1374  return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
1375 }
1376 
1377 } // namespace detail
1378 
1379 template <typename T>
1381  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1382  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1383 }
1384 
1385 // ------------------------------ Equality
1386 
1387 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1389  return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
1390 }
1391 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1392 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1393  return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
1394 }
1395 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1396 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1397  return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
1398 }
1399 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1400 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1401  return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
1402 }
1403 
1405  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1406 }
1407 
1409  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1410 }
1411 
1412 // ------------------------------ Inequality
1413 
1414 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1416  return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
1417 }
1418 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1419 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1420  return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
1421 }
1422 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1423 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1424  return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
1425 }
1426 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1427 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1428  return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
1429 }
1430 
1432  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1433 }
1434 
1436  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1437 }
1438 
1439 // ------------------------------ Strict inequality
1440 
1442  return Mask512<uint8_t>{_mm512_cmpgt_epu8_mask(a.raw, b.raw)};
1443 }
1445  return Mask512<uint16_t>{_mm512_cmpgt_epu16_mask(a.raw, b.raw)};
1446 }
1448  return Mask512<uint32_t>{_mm512_cmpgt_epu32_mask(a.raw, b.raw)};
1449 }
1451  return Mask512<uint64_t>{_mm512_cmpgt_epu64_mask(a.raw, b.raw)};
1452 }
1453 
1455  return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)};
1456 }
1458  return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)};
1459 }
1461  return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)};
1462 }
1464  return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)};
1465 }
1466 
1468  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1469 }
1471  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1472 }
1473 
1474 // ------------------------------ Weak inequality
1475 
1477  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1478 }
1480  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1481 }
1482 
1483 // ------------------------------ Reversed comparisons
1484 
1485 template <typename T>
1487  return b > a;
1488 }
1489 
1490 template <typename T>
1492  return b >= a;
1493 }
1494 
1495 // ------------------------------ Mask
1496 
1497 namespace detail {
1498 
1499 template <typename T>
1501  return Mask512<T>{_mm512_movepi8_mask(v.raw)};
1502 }
1503 template <typename T>
1505  return Mask512<T>{_mm512_movepi16_mask(v.raw)};
1506 }
1507 template <typename T>
1509  return Mask512<T>{_mm512_movepi32_mask(v.raw)};
1510 }
1511 template <typename T>
1513  return Mask512<T>{_mm512_movepi64_mask(v.raw)};
1514 }
1515 
1516 } // namespace detail
1517 
1518 template <typename T>
1520  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1521 }
1522 // There do not seem to be native floating-point versions of these instructions.
1523 HWY_API Mask512<float> MaskFromVec(const Vec512<float> v) {
1524  return Mask512<float>{MaskFromVec(BitCast(Full512<int32_t>(), v)).raw};
1525 }
1526 HWY_API Mask512<double> MaskFromVec(const Vec512<double> v) {
1527  return Mask512<double>{MaskFromVec(BitCast(Full512<int64_t>(), v)).raw};
1528 }
1529 
1530 HWY_API Vec512<uint8_t> VecFromMask(const Mask512<uint8_t> v) {
1531  return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
1532 }
1533 HWY_API Vec512<int8_t> VecFromMask(const Mask512<int8_t> v) {
1534  return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
1535 }
1536 
1537 HWY_API Vec512<uint16_t> VecFromMask(const Mask512<uint16_t> v) {
1538  return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
1539 }
1540 HWY_API Vec512<int16_t> VecFromMask(const Mask512<int16_t> v) {
1541  return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
1542 }
1543 
1544 HWY_API Vec512<uint32_t> VecFromMask(const Mask512<uint32_t> v) {
1545  return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
1546 }
1547 HWY_API Vec512<int32_t> VecFromMask(const Mask512<int32_t> v) {
1548  return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
1549 }
1550 HWY_API Vec512<float> VecFromMask(const Mask512<float> v) {
1551  return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
1552 }
1553 
1554 HWY_API Vec512<uint64_t> VecFromMask(const Mask512<uint64_t> v) {
1555  return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
1556 }
1557 HWY_API Vec512<int64_t> VecFromMask(const Mask512<int64_t> v) {
1558  return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
1559 }
1560 HWY_API Vec512<double> VecFromMask(const Mask512<double> v) {
1561  return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
1562 }
1563 
1564 template <typename T>
1565 HWY_API Vec512<T> VecFromMask(Full512<T> /* tag */, const Mask512<T> v) {
1566  return VecFromMask(v);
1567 }
1568 
1569 // ------------------------------ Mask logical
1570 
1571 namespace detail {
1572 
1573 template <typename T>
1575 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1576  return Mask512<T>{_knot_mask64(m.raw)};
1577 #else
1578  return Mask512<T>{~m.raw};
1579 #endif
1580 }
1581 template <typename T>
1583 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1584  return Mask512<T>{_knot_mask32(m.raw)};
1585 #else
1586  return Mask512<T>{~m.raw};
1587 #endif
1588 }
1589 template <typename T>
1591 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1592  return Mask512<T>{_knot_mask16(m.raw)};
1593 #else
1594  return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
1595 #endif
1596 }
1597 template <typename T>
1599 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1600  return Mask512<T>{_knot_mask8(m.raw)};
1601 #else
1602  return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
1603 #endif
1604 }
1605 
1606 template <typename T>
1608  const Mask512<T> b) {
1609 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1610  return Mask512<T>{_kand_mask64(a.raw, b.raw)};
1611 #else
1612  return Mask512<T>{a.raw & b.raw};
1613 #endif
1614 }
1615 template <typename T>
1617  const Mask512<T> b) {
1618 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1619  return Mask512<T>{_kand_mask32(a.raw, b.raw)};
1620 #else
1621  return Mask512<T>{a.raw & b.raw};
1622 #endif
1623 }
1624 template <typename T>
1626  const Mask512<T> b) {
1627 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1628  return Mask512<T>{_kand_mask16(a.raw, b.raw)};
1629 #else
1630  return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)};
1631 #endif
1632 }
1633 template <typename T>
1635  const Mask512<T> b) {
1636 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1637  return Mask512<T>{_kand_mask8(a.raw, b.raw)};
1638 #else
1639  return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)};
1640 #endif
1641 }
1642 
1643 template <typename T>
1645  const Mask512<T> b) {
1646 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1647  return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
1648 #else
1649  return Mask512<T>{~a.raw & b.raw};
1650 #endif
1651 }
1652 template <typename T>
1654  const Mask512<T> b) {
1655 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1656  return Mask512<T>{_kandn_mask32(a.raw, b.raw)};
1657 #else
1658  return Mask512<T>{~a.raw & b.raw};
1659 #endif
1660 }
1661 template <typename T>
1663  const Mask512<T> b) {
1664 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1665  return Mask512<T>{_kandn_mask16(a.raw, b.raw)};
1666 #else
1667  return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)};
1668 #endif
1669 }
1670 template <typename T>
1672  const Mask512<T> b) {
1673 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1674  return Mask512<T>{_kandn_mask8(a.raw, b.raw)};
1675 #else
1676  return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)};
1677 #endif
1678 }
1679 
1680 template <typename T>
1682  const Mask512<T> b) {
1683 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1684  return Mask512<T>{_kor_mask64(a.raw, b.raw)};
1685 #else
1686  return Mask512<T>{a.raw | b.raw};
1687 #endif
1688 }
1689 template <typename T>
1691  const Mask512<T> b) {
1692 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1693  return Mask512<T>{_kor_mask32(a.raw, b.raw)};
1694 #else
1695  return Mask512<T>{a.raw | b.raw};
1696 #endif
1697 }
1698 template <typename T>
1700  const Mask512<T> b) {
1701 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1702  return Mask512<T>{_kor_mask16(a.raw, b.raw)};
1703 #else
1704  return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)};
1705 #endif
1706 }
1707 template <typename T>
1709  const Mask512<T> b) {
1710 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1711  return Mask512<T>{_kor_mask8(a.raw, b.raw)};
1712 #else
1713  return Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)};
1714 #endif
1715 }
1716 
1717 template <typename T>
1719  const Mask512<T> b) {
1720 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1721  return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
1722 #else
1723  return Mask512<T>{a.raw ^ b.raw};
1724 #endif
1725 }
1726 template <typename T>
1728  const Mask512<T> b) {
1729 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1730  return Mask512<T>{_kxor_mask32(a.raw, b.raw)};
1731 #else
1732  return Mask512<T>{a.raw ^ b.raw};
1733 #endif
1734 }
1735 template <typename T>
1737  const Mask512<T> b) {
1738 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1739  return Mask512<T>{_kxor_mask16(a.raw, b.raw)};
1740 #else
1741  return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
1742 #endif
1743 }
1744 template <typename T>
1746  const Mask512<T> b) {
1747 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1748  return Mask512<T>{_kxor_mask8(a.raw, b.raw)};
1749 #else
1750  return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
1751 #endif
1752 }
1753 
1754 } // namespace detail
1755 
1756 template <typename T>
1758  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
1759 }
1760 
1761 template <typename T>
1763  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1764 }
1765 
1766 template <typename T>
1768  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1769 }
1770 
1771 template <typename T>
1773  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1774 }
1775 
1776 template <typename T>
1778  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1779 }
1780 
1781 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1782 
1783 HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
1784  return VecFromMask(v < Zero(Full512<int8_t>()));
1785 }
1786 
1787 HWY_API Vec512<int16_t> BroadcastSignBit(const Vec512<int16_t> v) {
1788  return ShiftRight<15>(v);
1789 }
1790 
1791 HWY_API Vec512<int32_t> BroadcastSignBit(const Vec512<int32_t> v) {
1792  return ShiftRight<31>(v);
1793 }
1794 
1795 HWY_API Vec512<int64_t> BroadcastSignBit(const Vec512<int64_t> v) {
1796  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
1797 }
1798 
1799 // ------------------------------ Floating-point classification (Not)
1800 
1801 HWY_API Mask512<float> IsNaN(const Vec512<float> v) {
1802  return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x81)};
1803 }
1804 HWY_API Mask512<double> IsNaN(const Vec512<double> v) {
1805  return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x81)};
1806 }
1807 
1808 HWY_API Mask512<float> IsInf(const Vec512<float> v) {
1809  return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x18)};
1810 }
1811 HWY_API Mask512<double> IsInf(const Vec512<double> v) {
1812  return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x18)};
1813 }
1814 
1815 // Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
1816 // positive, so we have to check for inf/NaN and negate.
1817 HWY_API Mask512<float> IsFinite(const Vec512<float> v) {
1818  return Not(Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x99)});
1819 }
1820 HWY_API Mask512<double> IsFinite(const Vec512<double> v) {
1821  return Not(Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x99)});
1822 }
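// Illustrative note (not from the original header): the fpclass immediates
// above are category bitmasks: 0x01 QNaN, 0x08 +inf, 0x10 -inf, 0x80 SNaN.
// Hence 0x81 = any NaN, 0x18 = any infinity, 0x99 = NaN or infinity, whose
// negation yields IsFinite. For example:
//   const Full512<float> d;
//   IsFinite(Set(d, 1.0f) / Zero(d));  // all lanes false (1/0 is +inf)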
1823 
1824 // ================================================== MEMORY
1825 
1826 // ------------------------------ Load
1827 
1828 template <typename T>
1829 HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
1830  return Vec512<T>{_mm512_load_si512(aligned)};
1831 }
1833  const float* HWY_RESTRICT aligned) {
1834  return Vec512<float>{_mm512_load_ps(aligned)};
1835 }
1837  const double* HWY_RESTRICT aligned) {
1838  return Vec512<double>{_mm512_load_pd(aligned)};
1839 }
1840 
1841 template <typename T>
1843  return Vec512<T>{_mm512_loadu_si512(p)};
1844 }
1846  const float* HWY_RESTRICT p) {
1847  return Vec512<float>{_mm512_loadu_ps(p)};
1848 }
1850  const double* HWY_RESTRICT p) {
1851  return Vec512<double>{_mm512_loadu_pd(p)};
1852 }
1853 
1854 // ------------------------------ MaskedLoad
1855 
1856 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1858  const T* HWY_RESTRICT p) {
1859  return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, p)};
1860 }
1861 
1862 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1863 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1864  const T* HWY_RESTRICT p) {
1865  return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
1866 }
1867 
1868 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1869 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1870  const T* HWY_RESTRICT p) {
1871  return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
1872 }
1873 
1874 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1875 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1876  const T* HWY_RESTRICT p) {
1877  return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
1878 }
1879 
1881  const float* HWY_RESTRICT p) {
1882  return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)};
1883 }
1884 
1886  const double* HWY_RESTRICT p) {
1887  return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)};
1888 }
1889 
1890 // ------------------------------ LoadDup128
1891 
1892 // Loads 128 bits and duplicates into all four 128-bit blocks. This avoids the
1893 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
1894 template <typename T>
1896  const T* const HWY_RESTRICT p) {
1897  const auto x4 = LoadU(Full128<T>(), p);
1898  return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
1899 }
1901  const float* const HWY_RESTRICT p) {
1902  const __m128 x4 = _mm_loadu_ps(p);
1903  return Vec512<float>{_mm512_broadcast_f32x4(x4)};
1904 }
1905 
1907  const double* const HWY_RESTRICT p) {
1908  const __m128d x2 = _mm_loadu_pd(p);
1909  return Vec512<double>{_mm512_broadcast_f64x2(x2)};
1910 }
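// Illustrative sketch (not from the original header): LoadDup128 is useful for
// per-block lookup tables, e.g. with Highway's TableLookupBytes. "v" is a
// hypothetical Vec512<uint8_t>.
//   const Full512<uint8_t> d;
//   alignas(16) static constexpr uint8_t kRev[16] = {15, 14, 13, 12, 11, 10, 9,
//                                                    8, 7, 6, 5, 4, 3, 2, 1, 0};
//   const auto idx = LoadDup128(d, kRev);       // same 16 indices in all blocks
//   const auto rev = TableLookupBytes(v, idx);  // reverses bytes within blocks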
1911 
1912 // ------------------------------ Store
1913 
1914 template <typename T>
1915 HWY_API void Store(const Vec512<T> v, Full512<T> /* tag */,
1916  T* HWY_RESTRICT aligned) {
1917  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1918 }
1920  float* HWY_RESTRICT aligned) {
1921  _mm512_store_ps(aligned, v.raw);
1922 }
1924  double* HWY_RESTRICT aligned) {
1925  _mm512_store_pd(aligned, v.raw);
1926 }
1927 
1928 template <typename T>
1929 HWY_API void StoreU(const Vec512<T> v, Full512<T> /* tag */,
1930  T* HWY_RESTRICT p) {
1931  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
1932 }
1934  float* HWY_RESTRICT p) {
1935  _mm512_storeu_ps(p, v.raw);
1936 }
1938  double* HWY_RESTRICT p) {
1939  _mm512_storeu_pd(p, v.raw);
1940 }
1941 
1942 // ------------------------------ BlendedStore
1943 
1944 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1946  T* HWY_RESTRICT p) {
1947  _mm512_mask_storeu_epi8(p, m.raw, v.raw);
1948 }
1949 
1950 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1951 HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1952  T* HWY_RESTRICT p) {
1953  _mm512_mask_storeu_epi16(p, m.raw, v.raw);
1954 }
1955 
1956 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1957 HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1958  T* HWY_RESTRICT p) {
1959  _mm512_mask_storeu_epi32(p, m.raw, v.raw);
1960 }
1961 
1962 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1963 HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1964  T* HWY_RESTRICT p) {
1965  _mm512_mask_storeu_epi64(p, m.raw, v.raw);
1966 }
1967 
1969  Full512<float> /* tag */, float* HWY_RESTRICT p) {
1970  _mm512_mask_storeu_ps(p, m.raw, v.raw);
1971 }
1972 
1974  Full512<double> /* tag */, double* HWY_RESTRICT p) {
1975  _mm512_mask_storeu_pd(p, m.raw, v.raw);
1976 }
1977 
1978 // ------------------------------ Non-temporal stores
1979 
1980 template <typename T>
1981 HWY_API void Stream(const Vec512<T> v, Full512<T> /* tag */,
1982  T* HWY_RESTRICT aligned) {
1983  _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1984 }
1986  float* HWY_RESTRICT aligned) {
1987  _mm512_stream_ps(aligned, v.raw);
1988 }
1990  double* HWY_RESTRICT aligned) {
1991  _mm512_stream_pd(aligned, v.raw);
1992 }
1993 
1994 // ------------------------------ Scatter
1995 
1996 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1997 HWY_DIAGNOSTICS(push)
1998 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1999 
2000 namespace detail {
2001 
2002 template <typename T>
2004  Full512<T> /* tag */, T* HWY_RESTRICT base,
2005  const Vec512<int32_t> offset) {
2006  _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
2007 }
2008 template <typename T>
2010  Full512<T> /* tag */, T* HWY_RESTRICT base,
2011  const Vec512<int32_t> index) {
2012  _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
2013 }
2014 
2015 template <typename T>
2017  Full512<T> /* tag */, T* HWY_RESTRICT base,
2018  const Vec512<int64_t> offset) {
2019  _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
2020 }
2021 template <typename T>
2023  Full512<T> /* tag */, T* HWY_RESTRICT base,
2024  const Vec512<int64_t> index) {
2025  _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
2026 }
2027 
2028 } // namespace detail
2029 
2030 template <typename T, typename Offset>
2032  const Vec512<Offset> offset) {
2033  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2034  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2035 }
2036 template <typename T, typename Index>
2038  const Vec512<Index> index) {
2039  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2040  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2041 }
2042 
2044  float* HWY_RESTRICT base,
2045  const Vec512<int32_t> offset) {
2046  _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
2047 }
2049  float* HWY_RESTRICT base,
2050  const Vec512<int32_t> index) {
2051  _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
2052 }
2053 
2055  double* HWY_RESTRICT base,
2056  const Vec512<int64_t> offset) {
2057  _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
2058 }
2060  double* HWY_RESTRICT base,
2061  const Vec512<int64_t> index) {
2062  _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
2063 }
2064 
2065 // ------------------------------ Gather
2066 
2067 namespace detail {
2068 
2069 template <typename T>
2071  Full512<T> /* tag */,
2072  const T* HWY_RESTRICT base,
2073  const Vec512<int32_t> offset) {
2074  return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
2075 }
2076 template <typename T>
2078  Full512<T> /* tag */,
2079  const T* HWY_RESTRICT base,
2080  const Vec512<int32_t> index) {
2081  return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
2082 }
2083 
2084 template <typename T>
2086  Full512<T> /* tag */,
2087  const T* HWY_RESTRICT base,
2088  const Vec512<int64_t> offset) {
2089  return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
2090 }
2091 template <typename T>
2093  Full512<T> /* tag */,
2094  const T* HWY_RESTRICT base,
2095  const Vec512<int64_t> index) {
2096  return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
2097 }
2098 
2099 } // namespace detail
2100 
2101 template <typename T, typename Offset>
2103  const Vec512<Offset> offset) {
2104  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2105  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2106 }
2107 template <typename T, typename Index>
2109  const Vec512<Index> index) {
2110  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2111  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2112 }
2113 
2115  const float* HWY_RESTRICT base,
2116  const Vec512<int32_t> offset) {
2117  return Vec512<float>{_mm512_i32gather_ps(offset.raw, base, 1)};
2118 }
2120  const float* HWY_RESTRICT base,
2121  const Vec512<int32_t> index) {
2122  return Vec512<float>{_mm512_i32gather_ps(index.raw, base, 4)};
2123 }
2124 
2126  const double* HWY_RESTRICT base,
2127  const Vec512<int64_t> offset) {
2128  return Vec512<double>{_mm512_i64gather_pd(offset.raw, base, 1)};
2129 }
2131  const double* HWY_RESTRICT base,
2132  const Vec512<int64_t> index) {
2133  return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
2134 }
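// Illustrative sketch (not from the original header): gathering and scattering
// 32-bit values via lane indices. "base" is a hypothetical float* with at
// least 31 valid elements.
//   const Full512<float> d;
//   const Full512<int32_t> di;
//   const auto idx = Iota(di, 0) * Set(di, 2);  // 0, 2, 4, ..., 30
//   const auto v = GatherIndex(d, base, idx);   // base[0], base[2], ...
//   ScatterIndex(v + v, d, base, idx);          // doubles base[0], base[2], ...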
2135 
2136 HWY_DIAGNOSTICS(pop)
2137 
2138 // ================================================== SWIZZLE
2139 
2140 // ------------------------------ LowerHalf
2141 
2142 template <typename T>
2144  return Vec256<T>{_mm512_castsi512_si256(v.raw)};
2145 }
2147  return Vec256<float>{_mm512_castps512_ps256(v.raw)};
2148 }
2150  return Vec256<double>{_mm512_castpd512_pd256(v.raw)};
2151 }
2152 
2153 template <typename T>
2155  return LowerHalf(Full256<T>(), v);
2156 }
2157 
2158 // ------------------------------ UpperHalf
2159 
2160 template <typename T>
2162  return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
2163 }
2165  return Vec256<float>{_mm512_extractf32x8_ps(v.raw, 1)};
2166 }
2168  return Vec256<double>{_mm512_extractf64x4_pd(v.raw, 1)};
2169 }
2170 
2171 // ------------------------------ ExtractLane (Store)
2172 template <typename T>
2173 HWY_API T ExtractLane(const Vec512<T> v, size_t i) {
2174  const Full512<T> d;
2175  HWY_DASSERT(i < Lanes(d));
2176  alignas(64) T lanes[64 / sizeof(T)];
2177  Store(v, d, lanes);
2178  return lanes[i];
2179 }
2180 
2181 // ------------------------------ InsertLane (Store)
2182 template <typename T>
2183 HWY_API Vec512<T> InsertLane(const Vec512<T> v, size_t i, T t) {
2184  const Full512<T> d;
2185  HWY_DASSERT(i < Lanes(d));
2186  alignas(64) T lanes[64 / sizeof(T)];
2187  Store(v, d, lanes);
2188  lanes[i] = t;
2189  return Load(d, lanes);
2190 }
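// --- Editor's illustrative sketch (hypothetical; not part of x86_512-inl.h) ---
// Both ExtractLane and InsertLane above round-trip through an aligned stack
// buffer (Store, scalar access, then Load for InsertLane); they are convenient
// but not cheap, so avoid them in inner loops. Same Highway-setup assumptions
// as the earlier sketch.
inline Vec512<int32_t> BumpOneLaneExample(Vec512<int32_t> v, size_t i) {
  const int32_t old = ExtractLane(v, i);  // read lane i via Store
  return InsertLane(v, i, old + 1);       // write lane i via Store + Load
}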
2191 
2192 // ------------------------------ GetLane (LowerHalf)
2193 template <typename T>
2195  return GetLane(LowerHalf(v));
2196 }
2197 
2198 // ------------------------------ ZeroExtendVector
2199 
2200 template <typename T>
2202 #if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h.
2203  return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
2204 #else
2205  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
2206 #endif
2207 }
2209  Vec256<float> lo) {
2210 #if HWY_HAVE_ZEXT
2211  return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
2212 #else
2213  return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
2214 #endif
2215 }
2217  Vec256<double> lo) {
2218 #if HWY_HAVE_ZEXT
2219  return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
2220 #else
2221  return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
2222 #endif
2223 }
2224 
2225 // ------------------------------ Combine
2226 
2227 template <typename T>
2229  const auto lo512 = ZeroExtendVector(d, lo);
2230  return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
2231 }
2233  Vec256<float> lo) {
2234  const auto lo512 = ZeroExtendVector(d, lo);
2235  return Vec512<float>{_mm512_insertf32x8(lo512.raw, hi.raw, 1)};
2236 }
2238  Vec256<double> lo) {
2239  const auto lo512 = ZeroExtendVector(d, lo);
2240  return Vec512<double>{_mm512_insertf64x4(lo512.raw, hi.raw, 1)};
2241 }
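// --- Editor's illustrative sketch (hypothetical; not part of x86_512-inl.h) ---
// Combine builds a 512-bit vector from two 256-bit halves by zero-extending
// `lo` and then inserting `hi` into the upper 256 bits; ZeroExtendVector alone
// is the right choice when the upper half should stay zero.
inline Vec512<float> CombineExample(Vec256<float> hi, Vec256<float> lo) {
  const Full512<float> d;
  const Vec512<float> both = Combine(d, hi, lo);           // hi in upper half
  const Vec512<float> lo_zext = ZeroExtendVector(d, lo);   // upper half zero
  return both + lo_zext;  // arbitrary combination, just to use both results
}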
2242 
2243 // ------------------------------ ShiftLeftBytes
2244 
2245 template <int kBytes, typename T>
2247  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2248  return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
2249 }
2250 
2251 template <int kBytes, typename T>
2253  return ShiftLeftBytes<kBytes>(Full512<T>(), v);
2254 }
2255 
2256 // ------------------------------ ShiftLeftLanes
2257 
2258 template <int kLanes, typename T>
2260  const Repartition<uint8_t, decltype(d)> d8;
2261  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2262 }
2263 
2264 template <int kLanes, typename T>
2266  return ShiftLeftLanes<kLanes>(Full512<T>(), v);
2267 }
2268 
2269 // ------------------------------ ShiftRightBytes
2270 template <int kBytes, typename T>
2272  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2273  return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
2274 }
2275 
2276 // ------------------------------ ShiftRightLanes
2277 template <int kLanes, typename T>
2279  const Repartition<uint8_t, decltype(d)> d8;
2280  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2281 }
2282 
2283 // ------------------------------ CombineShiftRightBytes
2284 
2285 template <int kBytes, typename T, class V = Vec512<T>>
2287  const Repartition<uint8_t, decltype(d)> d8;
2288  return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
2289  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2290 }
2291 
2292 // ------------------------------ Broadcast/splat any lane
2293 
2294 // Unsigned
2295 template <int kLane>
2297  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2298  if (kLane < 4) {
2299  const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2300  return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)};
2301  } else {
2302  const __m512i hi =
2303  _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2304  return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)};
2305  }
2306 }
2307 template <int kLane>
2309  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2310  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2311  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2312 }
2313 template <int kLane>
2315  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2316  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2317  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2318 }
2319 
2320 // Signed
2321 template <int kLane>
2323  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2324  if (kLane < 4) {
2325  const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2326  return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)};
2327  } else {
2328  const __m512i hi =
2329  _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2330  return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)};
2331  }
2332 }
2333 template <int kLane>
2335  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2336  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2337  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2338 }
2339 template <int kLane>
2341  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2342  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2343  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2344 }
2345 
2346 // Float
2347 template <int kLane>
2349  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2350  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2351  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
2352 }
2353 template <int kLane>
2355  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2356  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
2357  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
2358 }
2359 
2360 // ------------------------------ Hard-coded shuffles
2361 
2362 // Notation: let Vec512<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2363 // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2364 // right (the previous least-significant lane is now most-significant =>
2365 // 47650321). These could also be implemented via CombineShiftRightBytes but
2366 // the shuffle_abcd notation is more convenient.
2367 
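// --- Editor's illustrative sketch (hypothetical; not part of x86_512-inl.h) ---
// Worked example of the notation above: within each 128-bit block holding
// lanes 3,2,1,0, Shuffle0321 rotates right by one lane, so the block becomes
// 0,3,2,1 and, across all four blocks, lanes read 4,7,6,5,0,3,2,1.
inline bool Shuffle0321Example() {
  const Full512<int32_t> d;
  const Vec512<int32_t> v = Iota(d, 0);  // lanes 15, 14, ..., 1, 0
  const Vec512<int32_t> r = Shuffle0321(v);
  // The first block was 3,2,1,0 and becomes 0,3,2,1.
  return GetLane(r) == 1 && ExtractLane(r, 3) == 0;
}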
2368 // Swap 32-bit halves in 64-bit halves.
2369 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2371  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
2372 }
2374  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)};
2375 }
2376 
2377 namespace detail {
2378 
2379 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2381  const Full512<T> d;
2382  const RebindToFloat<decltype(d)> df;
2383  return BitCast(
2384  d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2385  _MM_PERM_CDAB)});
2386 }
2387 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2389  const Full512<T> d;
2390  const RebindToFloat<decltype(d)> df;
2391  return BitCast(
2392  d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2393  _MM_PERM_BCDA)});
2394 }
2395 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2397  const Full512<T> d;
2398  const RebindToFloat<decltype(d)> df;
2399  return BitCast(
2400  d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2401  _MM_PERM_DABC)});
2402 }
2403 
2404 } // namespace detail
2405 
2406 // Swap 64-bit halves
2408  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2409 }
2411  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2412 }
2414  // Shorter encoding than _mm512_permute_ps.
2415  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)};
2416 }
2418  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2419 }
2421  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2422 }
2424  // Shorter encoding than _mm512_permute_pd.
2425  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)};
2426 }
2427 
2428 // Rotate right 32 bits
2430  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2431 }
2433  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2434 }
2436  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)};
2437 }
2438 // Rotate left 32 bits
2440  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2441 }
2443  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2444 }
2446  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)};
2447 }
2448 
2449 // Reverse
2451  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2452 }
2454  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2455 }
2457  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)};
2458 }
2459 
2460 // ------------------------------ TableLookupLanes
2461 
2462 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2463 template <typename T>
2464 struct Indices512 {
2465  __m512i raw;
2466 };
2467 
2468 template <typename T, typename TI>
2470  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2471 #if HWY_IS_DEBUG_BUILD
2472  const Full512<TI> di;
2473  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2474  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));
2475 #endif
2476  return Indices512<T>{vec.raw};
2477 }
2478 
2479 template <typename T, typename TI>
2481  const Rebind<TI, decltype(d)> di;
2482  return IndicesFromVec(d, LoadU(di, idx));
2483 }
2484 
2485 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2487  return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
2488 }
2489 
2490 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2491 HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
2492  return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
2493 }
2494 
2496  return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
2497 }
2498 
2500  Indices512<double> idx) {
2501  return Vec512<double>{_mm512_permutexvar_pd(idx.raw, v.raw)};
2502 }
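// --- Editor's illustrative sketch (hypothetical; not part of x86_512-inl.h) ---
// TableLookupLanes permutes across the entire 512-bit vector (unlike most ops
// here, which stay within 128-bit blocks). Indices must lie in [0, Lanes(d)).
inline Vec512<float> RotateLanesExample(Vec512<float> v) {
  const Full512<float> d;
  // Rotate all 16 lanes down by one: lane i receives old lane (i + 1) mod 16.
  alignas(64) static constexpr int32_t kIdx[16] = {1, 2,  3,  4,  5,  6,  7, 8,
                                                   9, 10, 11, 12, 13, 14, 15, 0};
  return TableLookupLanes(v, SetTableIndices(d, kIdx));
}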
2503 
2504 // ------------------------------ Reverse
2505 
2506 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2508  const RebindToSigned<decltype(d)> di;
2509  alignas(64) constexpr int16_t kReverse[32] = {
2510  31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
2511  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
2512  const Vec512<int16_t> idx = Load(di, kReverse);
2513  return BitCast(d, Vec512<int16_t>{
2514  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2515 }
2516 
2517 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2518 HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2519  alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2520  7, 6, 5, 4, 3, 2, 1, 0};
2521  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2522 }
2523 
2524 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2525 HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2526  alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2527  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2528 }
2529 
2530 // ------------------------------ Reverse2
2531 
2532 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2534  const Full512<uint32_t> du32;
2535  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2536 }
2537 
2538 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2539 HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2540  return Shuffle2301(v);
2541 }
2542 
2543 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2544 HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2545  return Shuffle01(v);
2546 }
2547 
2548 // ------------------------------ Reverse4
2549 
2550 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2552  const RebindToSigned<decltype(d)> di;
2553  alignas(64) constexpr int16_t kReverse4[32] = {
2554  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
2555  19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
2556  const Vec512<int16_t> idx = Load(di, kReverse4);
2557  return BitCast(d, Vec512<int16_t>{
2558  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2559 }
2560 
2561 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2562 HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2563  return Shuffle0123(v);
2564 }
2565 
2566 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2567 HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2568  return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2569 }
2571  return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2572 }
2573 
2574 // ------------------------------ Reverse8
2575 
2576 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2578  const RebindToSigned<decltype(d)> di;
2579  alignas(64) constexpr int16_t kReverse8[32] = {
2580  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
2581  23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
2582  const Vec512<int16_t> idx = Load(di, kReverse8);
2583  return BitCast(d, Vec512<int16_t>{
2584  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2585 }
2586 
2587 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2588 HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2589  const RebindToSigned<decltype(d)> di;
2590  alignas(64) constexpr int32_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
2591  15, 14, 13, 12, 11, 10, 9, 8};
2592  const Vec512<int32_t> idx = Load(di, kReverse8);
2593  return BitCast(d, Vec512<int32_t>{
2594  _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
2595 }
2596 
2597 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2598 HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2599  return Reverse(d, v);
2600 }
2601 
2602 // ------------------------------ InterleaveLower
2603 
2604 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2605 // the least-significant lane) and "b". To concatenate two half-width integers
2606 // into one, use ZipLower/Upper instead (also works with scalar).
2607 
2609  const Vec512<uint8_t> b) {
2610  return Vec512<uint8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2611 }
2613  const Vec512<uint16_t> b) {
2614  return Vec512<uint16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2615 }
2617  const Vec512<uint32_t> b) {
2618  return Vec512<uint32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2619 }
2621  const Vec512<uint64_t> b) {
2622  return Vec512<uint64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2623 }
2624 
2626  const Vec512<int8_t> b) {
2627  return Vec512<int8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2628 }
2630  const Vec512<int16_t> b) {
2631  return Vec512<int16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2632 }
2634  const Vec512<int32_t> b) {
2635  return Vec512<int32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2636 }
2638  const Vec512<int64_t> b) {
2639  return Vec512<int64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2640 }
2641 
2643  const Vec512<float> b) {
2644  return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)};
2645 }
2647  const Vec512<double> b) {
2648  return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
2649 }
2650 
2651 // ------------------------------ InterleaveUpper
2652 
2653 // All functions inside detail lack the required D parameter.
2654 namespace detail {
2655 
2657  const Vec512<uint8_t> b) {
2658  return Vec512<uint8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2659 }
2661  const Vec512<uint16_t> b) {
2662  return Vec512<uint16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2663 }
2665  const Vec512<uint32_t> b) {
2666  return Vec512<uint32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2667 }
2669  const Vec512<uint64_t> b) {
2670  return Vec512<uint64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2671 }
2672 
2674  const Vec512<int8_t> b) {
2675  return Vec512<int8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2676 }
2678  const Vec512<int16_t> b) {
2679  return Vec512<int16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2680 }
2682  const Vec512<int32_t> b) {
2683  return Vec512<int32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2684 }
2686  const Vec512<int64_t> b) {
2687  return Vec512<int64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2688 }
2689 
2691  const Vec512<float> b) {
2692  return Vec512<float>{_mm512_unpackhi_ps(a.raw, b.raw)};
2693 }
2695  const Vec512<double> b) {
2696  return Vec512<double>{_mm512_unpackhi_pd(a.raw, b.raw)};
2697 }
2698 
2699 } // namespace detail
2700 
2701 template <typename T, class V = Vec512<T>>
2702 HWY_API V InterleaveUpper(Full512<T> /* tag */, V a, V b) {
2703  return detail::InterleaveUpper(a, b);
2704 }
2705 
2706 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2707 
2708 // Same as Interleave*, except that the return lanes are double-width integers;
2709 // this is necessary because the single-lane scalar cannot return two values.
2710 template <typename T, typename TW = MakeWide<T>>
2712  return BitCast(Full512<TW>(), InterleaveLower(a, b));
2713 }
2714 template <typename T, typename TW = MakeWide<T>>
2716  return BitCast(Full512<TW>(), InterleaveLower(a, b));
2717 }
2718 
2719 template <typename T, typename TW = MakeWide<T>>
2721  return BitCast(Full512<TW>(), InterleaveUpper(d, a, b));
2722 }
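// --- Editor's illustrative sketch (hypothetical; not part of x86_512-inl.h) ---
// ZipLower/ZipUpper are InterleaveLower/Upper with a widened lane type:
// interleaving u8 with a zero vector yields u16 lanes that zero-extend the
// even/odd bytes of each 128-bit block (note the block-wise behavior inherited
// from the unpack instructions).
inline Vec512<uint16_t> ZipWithZeroExample(Vec512<uint8_t> bytes) {
  const Full512<uint16_t> dw;
  const Vec512<uint8_t> zero = Zero(Full512<uint8_t>());
  // Lower 8 bytes of each block -> 8 u16 lanes (byte in the low half of u16).
  const Vec512<uint16_t> lo = ZipLower(bytes, zero);
  // Upper 8 bytes of each block -> 8 u16 lanes.
  const Vec512<uint16_t> hi = ZipUpper(dw, bytes, zero);
  return lo + hi;  // arbitrary use of both halves
}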
2723 
2724 // ------------------------------ Concat* halves
2725 
2726 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2727 template <typename T>
2729  const Vec512<T> lo) {
2730  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2731 }
2733  const Vec512<float> hi,
2734  const Vec512<float> lo) {
2735  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2736 }
2738  const Vec512<double> hi,
2739  const Vec512<double> lo) {
2740  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)};
2741 }
2742 
2743 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2744 template <typename T>
2746  const Vec512<T> lo) {
2747  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2748 }
2750  const Vec512<float> hi,
2751  const Vec512<float> lo) {
2752  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2753 }
2755  const Vec512<double> hi,
2756  const Vec512<double> lo) {
2757  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)};
2758 }
2759 
2760 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
2761 template <typename T>
2763  const Vec512<T> lo) {
2764  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2765 }
2767  const Vec512<float> hi,
2768  const Vec512<float> lo) {
2769  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2770 }
2772  const Vec512<double> hi,
2773  const Vec512<double> lo) {
2774  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
2775 }
2776 
2777 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2778 template <typename T>
2780  const Vec512<T> lo) {
2781  // There is no imm8 blend in AVX512. Use blend16 because 32-bit masks
2782  // are efficiently loaded from 32-bit regs.
2783  const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
2784  return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
2785 }
2787  const Vec512<float> hi,
2788  const Vec512<float> lo) {
2789  const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF);
2790  return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
2791 }
2793  const Vec512<double> hi,
2794  const Vec512<double> lo) {
2795  const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F);
2796  return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
2797 }
2798 
2799 // ------------------------------ ConcatOdd
2800 
2801 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2803  const RebindToUnsigned<decltype(d)> du;
2804 #if HWY_TARGET == HWY_AVX3_DL
2805  alignas(64) constexpr uint8_t kIdx[64] = {
2806  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
2807  27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51,
2808  53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
2809  79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103,
2810  105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
2811  return BitCast(d,
2812  Vec512<uint8_t>{_mm512_mask2_permutex2var_epi8(
2813  BitCast(du, lo).raw, Load(du, kIdx).raw,
2814  __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
2815 #else
2816  const RepartitionToWide<decltype(du)> dw;
2817  // Right-shift 8 bits per u16 so we can pack.
2818  const Vec512<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
2819  const Vec512<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
2820  const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
2821  // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
2822  const Full512<uint64_t> du64;
2823  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2824  return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
2825 #endif
2826 }
2827 
2828 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2829 HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2830  const RebindToUnsigned<decltype(d)> du;
2831  alignas(64) constexpr uint16_t kIdx[32] = {
2832  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2833  33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
2834  return BitCast(d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
2835  BitCast(du, lo).raw, Load(du, kIdx).raw,
2836  __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
2837 }
2838 
2839 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2840 HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2841  const RebindToUnsigned<decltype(d)> du;
2842  alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2843  17, 19, 21, 23, 25, 27, 29, 31};
2844  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2845  BitCast(du, lo).raw, Load(du, kIdx).raw,
2846  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2847 }
2848 
2850  Vec512<float> lo) {
2851  const RebindToUnsigned<decltype(d)> du;
2852  alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2853  17, 19, 21, 23, 25, 27, 29, 31};
2854  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2855  __mmask16{0xFFFF}, hi.raw)};
2856 }
2857 
2858 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2859 HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2860  const RebindToUnsigned<decltype(d)> du;
2861  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2862  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2863  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2864  BitCast(du, hi).raw)});
2865 }
2866 
2868  Vec512<double> lo) {
2869  const RebindToUnsigned<decltype(d)> du;
2870  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2871  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2872  __mmask8{0xFF}, hi.raw)};
2873 }
2874 
2875 // ------------------------------ ConcatEven
2876 
2877 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2879  const RebindToUnsigned<decltype(d)> du;
2880 #if HWY_TARGET == HWY_AVX3_DL
2881  alignas(64) constexpr uint8_t kIdx[64] = {
2882  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
2883  26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
2884  52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
2885  78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
2886  104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
2887  return BitCast(d,
2888  Vec512<uint8_t>{_mm512_mask2_permutex2var_epi8(
2889  BitCast(du, lo).raw, Load(du, kIdx).raw,
2890  __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
2891 #else
2892  const RepartitionToWide<decltype(du)> dw;
2893  // Isolate lower 8 bits per u16 so we can pack.
2894  const Vec512<uint16_t> mask = Set(dw, 0x00FF);
2895  const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
2896  const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
2897  const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
2898  // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
2899  const Full512<uint64_t> du64;
2900  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2901  return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
2902 #endif
2903 }
2904 
2905 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2906 HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2907  const RebindToUnsigned<decltype(d)> du;
2908  alignas(64) constexpr uint16_t kIdx[32] = {
2909  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
2910  32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
2911  return BitCast(d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
2912  BitCast(du, lo).raw, Load(du, kIdx).raw,
2913  __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
2914 }
2915 
2916 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2917 HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2918  const RebindToUnsigned<decltype(d)> du;
2919  alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2920  16, 18, 20, 22, 24, 26, 28, 30};
2921  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2922  BitCast(du, lo).raw, Load(du, kIdx).raw,
2923  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2924 }
2925 
2927  Vec512<float> lo) {
2928  const RebindToUnsigned<decltype(d)> du;
2929  alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2930  16, 18, 20, 22, 24, 26, 28, 30};
2931  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2932  __mmask16{0xFFFF}, hi.raw)};
2933 }
2934 
2935 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2936 HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2937  const RebindToUnsigned<decltype(d)> du;
2938  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2939  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2940  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2941  BitCast(du, hi).raw)});
2942 }
2943 
2945  Vec512<double> lo) {
2946  const RebindToUnsigned<decltype(d)> du;
2947  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2948  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2949  __mmask8{0xFF}, hi.raw)};
2950 }
2951 
2952 // ------------------------------ DupEven (InterleaveLower)
2953 
2954 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2956  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
2957 }
2959  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
2960 }
2961 
2962 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2963 HWY_API Vec512<T> DupEven(const Vec512<T> v) {
2964  return InterleaveLower(Full512<T>(), v, v);
2965 }
2966 
2967 // ------------------------------ DupOdd (InterleaveUpper)
2968 
2969 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2971  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
2972 }
2974  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
2975 }
2976 
2977 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2978 HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
2979  return InterleaveUpper(Full512<T>(), v, v);
2980 }
2981 
2982 // ------------------------------ OddEven
2983 
2984 template <typename T>
2986  constexpr size_t s = sizeof(T);
2987  constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
2988  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
2989 }
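// --- Editor's note on the constant above (illustrative; not part of the file) ---
// 0x5555...55 has every even bit set; each lane of a Mask512 corresponds to one
// bit, so the shift simply trims the pattern to the actual lane count:
//   sizeof(T) == 1: 64 lanes -> no shift -> 0x5555555555555555
//   sizeof(T) == 2: 32 lanes -> >> 32    -> 0x0000000055555555
//   sizeof(T) == 4: 16 lanes -> >> 48    -> 0x0000000000005555
//   sizeof(T) == 8:  8 lanes -> >> 56    -> 0x0000000000000055
// Even-numbered lanes therefore take `b` and odd-numbered lanes take `a`.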
2990 
2991 // ------------------------------ OddEvenBlocks
2992 
2993 template <typename T>
2995  return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
2996 }
2997 
2999  return Vec512<float>{
3000  _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
3001 }
3002 
3004  return Vec512<double>{
3005  _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
3006 }
3007 
3008 // ------------------------------ SwapAdjacentBlocks
3009 
3010 template <typename T>
3012  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
3013 }
3014 
3016  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
3017 }
3018 
3020  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
3021 }
3022 
3023 // ------------------------------ ReverseBlocks
3024 
3025 template <typename T>
3027  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
3028 }
3030  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
3031 }
3033  Vec512<double> v) {
3034  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
3035 }
3036 
3037 // ------------------------------ TableLookupBytes (ZeroExtendVector)
3038 
3039 // Both full
3040 template <typename T, typename TI>
3042  return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
3043 }
3044 
3045 // Partial index vector
3046 template <typename T, typename TI, size_t NI>
3048  const Full512<TI> d512;
3049  const Half<decltype(d512)> d256;
3050  const Half<decltype(d256)> d128;
3051  // First expand to full 128, then 256, then 512.
3052  const Vec128<TI> from_full{from.raw};
3053  const auto from_512 =
3054  ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
3055  const auto tbl_full = TableLookupBytes(bytes, from_512);
3056  // Shrink to 256, then 128, then partial.
3057  return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
3058 }
3059 template <typename T, typename TI>
3061  const auto from_512 = ZeroExtendVector(Full512<TI>(), from);
3062  return LowerHalf(Full256<TI>(), TableLookupBytes(bytes, from_512));
3063 }
3064 
3065 // Partial table vector
3066 template <typename T, size_t N, typename TI>
3068  const Full512<TI> d512;
3069  const Half<decltype(d512)> d256;
3070  const Half<decltype(d256)> d128;
3071  // First expand to full 128, then 256, then 512.
3072  const Vec128<T> bytes_full{bytes.raw};
3073  const auto bytes_512 =
3074  ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
3075  return TableLookupBytes(bytes_512, from);
3076 }
3077 template <typename T, typename TI>
3079  const auto bytes_512 = ZeroExtendVector(Full512<T>(), bytes);
3080  return TableLookupBytes(bytes_512, from);
3081 }
3082 
3083 // The case where both vectors are partial is handled by x86_128/256.
3084 
3085 // ================================================== CONVERT
3086 
3087 // ------------------------------ Promotions (part w/ narrow lanes -> full)
3088 
3089 // Unsigned: zero-extend.
3090 // Note: these have 3 cycle latency; if inputs are already split across the
3091 // 128-bit blocks (in their upper/lower halves), then Zip* would be faster.
3093  Vec256<uint8_t> v) {
3094  return Vec512<uint16_t>{_mm512_cvtepu8_epi16(v.raw)};
3095 }
3097  Vec128<uint8_t> v) {
3098  return Vec512<uint32_t>{_mm512_cvtepu8_epi32(v.raw)};
3099 }
3101  Vec256<uint8_t> v) {
3102  return Vec512<int16_t>{_mm512_cvtepu8_epi16(v.raw)};
3103 }
3105  Vec128<uint8_t> v) {
3106  return Vec512<int32_t>{_mm512_cvtepu8_epi32(v.raw)};
3107 }
3109  Vec256<uint16_t> v) {
3110  return Vec512<uint32_t>{_mm512_cvtepu16_epi32(v.raw)};
3111 }
3113  Vec256<uint16_t> v) {
3114  return Vec512<int32_t>{_mm512_cvtepu16_epi32(v.raw)};
3115 }
3117  Vec256<uint32_t> v) {
3118  return Vec512<uint64_t>{_mm512_cvtepu32_epi64(v.raw)};
3119 }
3120 
3121 // Signed: replicate sign bit.
3122 // Note: these have 3 cycle latency; if inputs are already split across the
3123 // 128-bit blocks (in their upper/lower halves), then ZipUpper/Lower followed
3124 // by a signed shift would be faster.
3126  Vec256<int8_t> v) {
3127  return Vec512<int16_t>{_mm512_cvtepi8_epi16(v.raw)};
3128 }
3130  Vec128<int8_t> v) {
3131  return Vec512<int32_t>{_mm512_cvtepi8_epi32(v.raw)};
3132 }
3134  Vec256<int16_t> v) {
3135  return Vec512<int32_t>{_mm512_cvtepi16_epi32(v.raw)};
3136 }
3138  Vec256<int32_t> v) {
3139  return Vec512<int64_t>{_mm512_cvtepi32_epi64(v.raw)};
3140 }
3141 
3142 // Float
3144  const Vec256<float16_t> v) {
3145  return Vec512<float>{_mm512_cvtph_ps(v.raw)};
3146 }
3147 
3149  const Vec256<bfloat16_t> v) {
3150  const Rebind<uint16_t, decltype(df32)> du16;
3151  const RebindToSigned<decltype(df32)> di32;
3152  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3153 }
3154 
3156  return Vec512<double>{_mm512_cvtps_pd(v.raw)};
3157 }
3158 
3160  return Vec512<double>{_mm512_cvtepi32_pd(v.raw)};
3161 }
3162 
3163 // ------------------------------ Demotions (full -> part w/ narrow lanes)
3164 
3166  const Vec512<int32_t> v) {
3167  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3168 
3169  // Compress even u64 lanes into 256 bit.
3170  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3171  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3172  const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
3173  return LowerHalf(even);
3174 }
3175 
3177  const Vec512<int32_t> v) {
3178  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3179 
3180  // Compress even u64 lanes into 256 bit.
3181  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3182  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3183  const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
3184  return LowerHalf(even);
3185 }
3186 
3188  const Vec512<int32_t> v) {
3189  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3190  // packus treats the input as signed; we want unsigned. Clear the MSB to get
3191  // unsigned saturation to u8.
3192  const Vec512<int16_t> i16{
3193  _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
3194  const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
3195 
3196  alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
3197  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3198  const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
3199  return LowerHalf(LowerHalf(fixed));
3200 }
3201 
3203  const Vec512<int16_t> v) {
3204  const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};
3205 
3206  // Compress even u64 lanes into 256 bit.
3207  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3208  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3209  const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3210  return LowerHalf(even);
3211 }
3212 
3214  const Vec512<int32_t> v) {
3215  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3216  const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
3217 
3218  alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
3219  0, 4, 8, 12, 0, 4, 8, 12};
3220  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3221  const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
3222  return LowerHalf(LowerHalf(fixed));
3223 }
3224 
3226  const Vec512<int16_t> v) {
3227  const Vec512<int8_t> u8{_mm512_packs_epi16(v.raw, v.raw)};
3228 
3229  // Compress even u64 lanes into 256 bit.
3230  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3231  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3232  const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3233  return LowerHalf(even);
3234 }
3235 
3237  const Vec512<float> v) {
3238  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3239  HWY_DIAGNOSTICS(push)
3240  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3241  return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3242  HWY_DIAGNOSTICS(pop)
3243 }
3244 
3246  const Vec512<float> v) {
3247  // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
3248  const Rebind<int32_t, decltype(dbf16)> di32;
3249  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3250  const Rebind<uint16_t, decltype(dbf16)> du16;
3251  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3252  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3253 }
3254 
3257  // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
3258  const RebindToUnsigned<decltype(dbf16)> du16;
3259  const Repartition<uint32_t, decltype(dbf16)> du32;
3260  const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
3261  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3262 }
3263 
3265  const Vec512<double> v) {
3266  return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
3267 }
3268 
3270  const Vec512<double> v) {
3271  const auto clamped = detail::ClampF64ToI32Max(Full512<double>(), v);
3272  return Vec256<int32_t>{_mm512_cvttpd_epi32(clamped.raw)};
3273 }
3274 
3275 // For already range-limited input [0, 255].
3277  const Full512<uint32_t> d32;
3278  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
3279  // lowest 4 bytes.
3280  alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
3281  ~0u};
3282  const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
3283  // Gather the lowest 4 bytes of 4 128-bit blocks.
3284  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
3285  const Vec512<uint8_t> bytes{
3286  _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
3287  return LowerHalf(LowerHalf(bytes));
3288 }
3289 
3290 // ------------------------------ Convert integer <=> floating point
3291 
3293  const Vec512<int32_t> v) {
3294  return Vec512<float>{_mm512_cvtepi32_ps(v.raw)};
3295 }
3296 
3298  const Vec512<int64_t> v) {
3299  return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
3300 }
3301 
3302 // Truncates (rounds toward zero).
3304  return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
3305 }
3307  return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw));
3308 }
3309 
3311  const Full512<int32_t> di;
3312  return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw));
3313 }
3314 
3315 // ================================================== CRYPTO
3316 
3317 #if !defined(HWY_DISABLE_PCLMUL_AES)
3318 
3319 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3320 #ifdef HWY_NATIVE_AES
3321 #undef HWY_NATIVE_AES
3322 #else
3323 #define HWY_NATIVE_AES
3324 #endif
3325 
3327  Vec512<uint8_t> round_key) {
3328 #if HWY_TARGET == HWY_AVX3_DL
3329  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
3330 #else
3331  const Full512<uint8_t> d;
3332  const Half<decltype(d)> d2;
3333  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3334  AESRound(LowerHalf(state), LowerHalf(round_key)));
3335 #endif
3336 }
3337 
3339  Vec512<uint8_t> round_key) {
3340 #if HWY_TARGET == HWY_AVX3_DL
3341  return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
3342 #else
3343  const Full512<uint8_t> d;
3344  const Half<decltype(d)> d2;
3345  return Combine(d,
3346  AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3347  AESLastRound(LowerHalf(state), LowerHalf(round_key)));
3348 #endif
3349 }
3350 
3352 #if HWY_TARGET == HWY_AVX3_DL
3353  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
3354 #else
3355  alignas(64) uint64_t a[8];
3356  alignas(64) uint64_t b[8];
3357  const Full512<uint64_t> d;
3358  const Full128<uint64_t> d128;
3359  Store(va, d, a);
3360  Store(vb, d, b);
3361  for (size_t i = 0; i < 8; i += 2) {
3362  const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
3363  Store(mul, d128, a + i);
3364  }
3365  return Load(d, a);
3366 #endif
3367 }
3368 
3370 #if HWY_TARGET == HWY_AVX3_DL
3371  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
3372 #else
3373  alignas(64) uint64_t a[8];
3374  alignas(64) uint64_t b[8];
3375  const Full512<uint64_t> d;
3376  const Full128<uint64_t> d128;
3377  Store(va, d, a);
3378  Store(vb, d, b);
3379  for (size_t i = 0; i < 8; i += 2) {
3380  const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
3381  Store(mul, d128, a + i);
3382  }
3383  return Load(d, a);
3384 #endif
3385 }
3386 
3387 #endif // HWY_DISABLE_PCLMUL_AES
3388 
3389 // ================================================== MISC
3390 
3391 // Returns a vector with lane i=[0, N) set to "first" + i.
3392 template <typename T, typename T2>
3393 Vec512<T> Iota(const Full512<T> d, const T2 first) {
3394  HWY_ALIGN T lanes[64 / sizeof(T)];
3395  for (size_t i = 0; i < 64 / sizeof(T); ++i) {
3396  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
3397  }
3398  return Load(d, lanes);
3399 }
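// --- Editor's illustrative sketch (hypothetical; not part of x86_512-inl.h) ---
// Iota is handy for generating per-lane indices, e.g. to build a mask of the
// first `n` lanes (similar to what FirstN already provides on this target).
inline Mask512<int32_t> FirstNExample(size_t n) {
  const Full512<int32_t> d;
  return Lt(Iota(d, 0), Set(d, static_cast<int32_t>(n)));
}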
3400 
3401 // ------------------------------ Mask testing
3402 
3403 // Beware: the suffix indicates the number of mask bits, not lane size!
3404 
3405 namespace detail {
3406 
3407 template <typename T>
3408 HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
3409 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3410  return _kortestz_mask64_u8(mask.raw, mask.raw);
3411 #else
3412  return mask.raw == 0;
3413 #endif
3414 }
3415 template <typename T>
3416 HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
3417 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3418  return _kortestz_mask32_u8(mask.raw, mask.raw);
3419 #else
3420  return mask.raw == 0;
3421 #endif
3422 }
3423 template <typename T>
3424 HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
3425 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3426  return _kortestz_mask16_u8(mask.raw, mask.raw);
3427 #else
3428  return mask.raw == 0;
3429 #endif
3430 }
3431 template <typename T>
3432 HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
3433 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3434  return _kortestz_mask8_u8(mask.raw, mask.raw);
3435 #else
3436  return mask.raw == 0;
3437 #endif
3438 }
3439 
3440 } // namespace detail
3441 
3442 template <typename T>
3443 HWY_API bool AllFalse(const Full512<T> /* tag */, const Mask512<T> mask) {
3444  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
3445 }
3446 
3447 namespace detail {
3448 
3449 template <typename T>
3450 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
3451 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3452  return _kortestc_mask64_u8(mask.raw, mask.raw);
3453 #else
3454  return mask.raw == 0xFFFFFFFFFFFFFFFFull;
3455 #endif
3456 }
3457 template <typename T>
3458 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
3459 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3460  return _kortestc_mask32_u8(mask.raw, mask.raw);
3461 #else
3462  return mask.raw == 0xFFFFFFFFull;
3463 #endif
3464 }
3465 template <typename T>
3466 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
3467 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3468  return _kortestc_mask16_u8(mask.raw, mask.raw);
3469 #else
3470  return mask.raw == 0xFFFFull;
3471 #endif
3472 }
3473 template <typename T>
3474 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
3475 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3476  return _kortestc_mask8_u8(mask.raw, mask.raw);
3477 #else
3478  return mask.raw == 0xFFull;
3479 #endif
3480 }
3481 
3482 } // namespace detail
3483 
3484 template <typename T>
3485 HWY_API bool AllTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3486  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
3487 }
3488 
3489 // `bits` points to at least 8 readable bytes, not all of which need be valid.
3490 template <typename T>
3492  const uint8_t* HWY_RESTRICT bits) {
3493  Mask512<T> mask;
3494  CopyBytes<8 / sizeof(T)>(bits, &mask.raw);
3495  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3496  return mask;
3497 }
3498 
3499 // `bits` points to at least 8 writable bytes.
3500 template <typename T>
3501 HWY_API size_t StoreMaskBits(const Full512<T> /* tag */, const Mask512<T> mask,
3502  uint8_t* bits) {
3503  const size_t kNumBytes = 8 / sizeof(T);
3504  CopyBytes<kNumBytes>(&mask.raw, bits);
3505  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3506  return kNumBytes;
3507 }
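// --- Editor's illustrative sketch (hypothetical; not part of x86_512-inl.h) ---
// Round-tripping a mask through memory: StoreMaskBits writes one bit per lane
// (16 float lanes -> 2 bytes, although 8 bytes must be writable) and
// LoadMaskBits restores an equivalent mask.
inline bool MaskBitsRoundTripExample(Vec512<float> v) {
  const Full512<float> d;
  const Mask512<float> m = Lt(v, Zero(d));
  uint8_t bits[8] = {0};  // 8 writable bytes, as required above
  const size_t num_bytes = StoreMaskBits(d, m, bits);
  const Mask512<float> m2 = LoadMaskBits(d, bits);
  return num_bytes == 2 && CountTrue(d, m) == CountTrue(d, m2);
}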
3508 
3509 template <typename T>
3510 HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3511  return PopCount(static_cast<uint64_t>(mask.raw));
3512 }
3513 
3514 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
3515 HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3516  const Mask512<T> mask) {
3517  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
3518 }
3519 
3520 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3521 HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3522  const Mask512<T> mask) {
3523  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
3524 }
3525 
3526 // ------------------------------ Compress
3527 
3528 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3530  return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
3531 }
3532 
3534  return Vec512<float>{_mm512_maskz_compress_ps(mask.raw, v.raw)};
3535 }
3536 
3537 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3538 HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
3539  // See CompressIsPartition. u64 is faster than u32.
3540  alignas(16) constexpr uint64_t packed_array[256] = {
3541  // PrintCompress32x8Tables
3542  0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
3543  0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
3544  0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
3545  0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
3546  0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
3547  0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
3548  0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
3549  0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
3550  0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
3551  0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
3552  0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
3553  0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
3554  0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
3555  0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
3556  0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
3557  0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
3558  0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
3559  0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
3560  0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
3561  0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
3562  0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
3563  0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
3564  0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
3565  0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
3566  0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
3567  0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
3568  0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
3569  0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
3570  0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
3571  0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
3572  0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
3573  0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
3574  0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
3575  0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
3576  0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
3577  0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
3578  0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
3579  0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
3580  0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
3581  0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
3582  0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
3583  0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
3584  0x10765432, 0x17654320, 0x07654321, 0x76543210};
3585 
3586  // For lane i, shift the i-th 4-bit index down into the lowest bits;
3587  // _mm512_permutexvar_epi64 only uses bits [0, 3) and ignores the rest.
3588  const Full512<T> d;
3589  const RebindToUnsigned<decltype(d)> du64;
3590  const auto packed = Set(du64, packed_array[mask.raw]);
3591  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
3592  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
3593  return TableLookupLanes(v, indices);
3594 }
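// --- Editor's worked example for the lookup above (illustrative note) ---
// Suppose mask.raw == 0b0100'0110, i.e. lanes 1, 2 and 6 are active. The
// 64-bit table entry stores eight 4-bit lane indices, ordered so that the
// active lanes come first and the inactive ones follow (a partition; see
// CompressIsPartition). `Set` broadcasts that entry to all eight u64 lanes,
// the per-lane shifts {0, 4, ..., 28} move the i-th nibble into the low bits
// of lane i, and _mm512_permutexvar_epi64 reads only the low 3 bits of each
// lane. The result therefore starts with v[1], v[2], v[6] in lanes 0..2,
// followed by the remaining (not-compressed) lanes.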
3595 
3596 // 16-bit may use the 32-bit Compress and must be defined after it.
3597 //
3598 // Ignore IDE redefinition error - this is not actually defined in x86_256 if
3599 // we are including x86_512-inl.h.
3600 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3601 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
3602  const Full256<T> d;
3603  const Rebind<uint16_t, decltype(d)> du;
3604  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3605 
3606 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3607  const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
3608 #else
3609  // Promote to i32 (512-bit vector!) so we can use the native Compress.
3610  const auto vw = PromoteTo(Rebind<int32_t, decltype(d)>(), vu);
3611  const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};
3612  const auto cu = DemoteTo(du, Compress(vw, mask32));
3613 #endif // HWY_TARGET == HWY_AVX3_DL
3614 
3615  return BitCast(d, cu);
3616 }
3617 
3618 // Expands to 32-bit, compresses, then concatenates the demoted halves.
3619 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3620 HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
3621  const Full512<T> d;
3622  const Rebind<uint16_t, decltype(d)> du;
3623  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3624 
3625 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3626  const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)};
3627 #else
3628  const Repartition<int32_t, decltype(d)> dw;
3629  const Half<decltype(du)> duh;
3630  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3631  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3632 
3633  const uint32_t mask_bits{mask.raw};
3634  const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
3635  const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
3636  const auto compressed0 = Compress(promoted0, mask0);
3637  const auto compressed1 = Compress(promoted1, mask1);
3638 
3639  const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
3640  const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));
3641 
3642  // Concatenate into a single vector by shifting the upper run with a writemask.
3643  const size_t num0 = CountTrue(dw, mask0);
3644  const __mmask32 m_upper = ~((1u << num0) - 1);
3645  alignas(64) uint16_t iota[64] = {
3646  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3647  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3648  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3649  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
3650  const auto idx = LoadU(du, iota + 32 - num0);
3651  const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
3652  demoted0.raw, m_upper, idx.raw, demoted1.raw)};
3653 #endif // HWY_TARGET == HWY_AVX3_DL
3654 
3655  return BitCast(d, cu);
3656 }
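// --- Editor's note on the non-VBMI2 path above (illustrative) ---
// Without AVX512-VBMI2 there is no 16-bit compress instruction, so the code
// widens each 256-bit half to i32, compresses both with the native 32-bit
// compress, demotes back to u16, and finally splices the two compressed runs:
// the iota/writemask permute shifts the second run up by num0 lanes so it
// lands directly after the first run's CountTrue(mask0) survivors.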
3657 
3658 // ------------------------------ CompressNot
3659 
3660 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
3662  return Compress(v, Not(mask));
3663 }
3664 
3665 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3666 HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
3667  // See CompressIsPartition. u64 is faster than u32.
3668  alignas(16) constexpr uint64_t packed_array[256] = {
3669  // PrintCompressNot32x8Tables
3670  0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
3671  0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
3672  0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
3673  0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
3674  0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
3675  0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
3676  0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
3677  0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
3678  0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
3679  0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
3680  0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
3681  0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
3682  0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
3683  0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
3684  0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
3685  0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
3686  0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
3687  0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
3688  0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
3689  0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
3690  0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
3691  0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
3692  0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
3693  0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
3694  0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
3695  0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
3696  0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
3697  0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
3698  0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
3699  0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
3700  0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
3701  0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
3702  0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
3703  0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
3704  0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
3705  0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
3706  0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
3707  0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
3708  0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
3709  0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
3710  0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
3711  0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
3712  0x76543210, 0x76543201, 0x76543210, 0x76543210};
3713 
3714  // For lane i, shift the i-th 4-bit index down into the lowest bits;
3715  // _mm512_permutexvar_epi64 only uses bits [0, 3) and ignores the rest.
3716  const Full512<T> d;
3717  const RebindToUnsigned<decltype(d)> du64;
3718  const auto packed = Set(du64, packed_array[mask.raw]);
3719  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
3720  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
3721  return TableLookupLanes(v, indices);
3722 }
3723 
3725  Mask512<uint64_t> mask) {
3726  return CompressNot(v, mask);
3727 }
3728 
3729 // ------------------------------ CompressBits
3730 template <typename T>
3732  return Compress(v, LoadMaskBits(Full512<T>(), bits));
3733 }
3734 
3735 // ------------------------------ CompressStore
3736 
3737 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3739  T* HWY_RESTRICT unaligned) {
3740  const Rebind<uint16_t, decltype(d)> du;
3741  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3742 
3743  const uint64_t mask_bits{mask.raw};
3744 
3745 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3746  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
3747 #else
3748  const Repartition<int32_t, decltype(d)> dw;
3749  const Half<decltype(du)> duh;
3750  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3751  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3752 
3753  const uint64_t maskL = mask_bits & 0xFFFF;
3754  const uint64_t maskH = mask_bits >> 16;
3755  const Mask512<int32_t> mask0{static_cast<__mmask16>(maskL)};
3756  const Mask512<int32_t> mask1{static_cast<__mmask16>(maskH)};
3757  const auto compressed0 = Compress(promoted0, mask0);
3758  const auto compressed1 = Compress(promoted1, mask1);
3759 
3760  const Half<decltype(d)> dh;
3761  const auto demoted0 = BitCast(dh, DemoteTo(duh, compressed0));
3762  const auto demoted1 = BitCast(dh, DemoteTo(duh, compressed1));
3763 
3764  // Store 256-bit halves
3765  StoreU(demoted0, dh, unaligned);
3766  StoreU(demoted1, dh, unaligned + PopCount(maskL));
3767 #endif
3768 
3769  return PopCount(mask_bits);
3770 }
3771 
3772 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3773 HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3774  T* HWY_RESTRICT unaligned) {
3775  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3776  const size_t count = PopCount(uint64_t{mask.raw});
3777 // Workaround for MSAN not marking output as initialized (b/233326619)
3778 #if HWY_IS_MSAN
3779  __msan_unpoison(unaligned, count * sizeof(T));
3780 #endif
3781  return count;
3782 }
3783 
3784 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3785 HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3786  T* HWY_RESTRICT unaligned) {
3787  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3788  const size_t count = PopCount(uint64_t{mask.raw});
3789 // Workaround for MSAN not marking output as initialized (b/233326619)
3790 #if HWY_IS_MSAN
3791  __msan_unpoison(unaligned, count * sizeof(T));
3792 #endif
3793  return count;
3794 }
3795 
3796 HWY_API size_t CompressStore(Vec512<float> v, Mask512<float> mask,
3797  Full512<float> /* tag */,
3798  float* HWY_RESTRICT unaligned) {
3799  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
3800  const size_t count = PopCount(uint64_t{mask.raw});
3801 // Workaround for MSAN not marking output as initialized (b/233326619)
3802 #if HWY_IS_MSAN
3803  __msan_unpoison(unaligned, count * sizeof(float));
3804 #endif
3805  return count;
3806 }
3807 
3808 HWY_API size_t CompressStore(Vec512<double> v, Mask512<double> mask,
3809  Full512<double> /* tag */,
3810  double* HWY_RESTRICT unaligned) {
3811  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
3812  const size_t count = PopCount(uint64_t{mask.raw});
3813 // Workaround for MSAN not marking output as initialized (b/233326619)
3814 #if HWY_IS_MSAN
3815  __msan_unpoison(unaligned, count * sizeof(double));
3816 #endif
3817  return count;
3818 }
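// Usage sketch (illustrative; `in` and `out` are hypothetical caller-provided
// arrays with at least Lanes(d) elements): pack the negative lanes of a vector
// to the front of `out`. The return value equals CountTrue(d, m).
//   const Full512<float> d;
//   const Vec512<float> v = LoadU(d, in);
//   const Mask512<float> m = v < Zero(d);
//   const size_t written = CompressStore(v, m, d, out);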
3819 
3820 // ------------------------------ CompressBlendedStore
3821 template <typename T>
3822 HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> d,
3823  T* HWY_RESTRICT unaligned) {
3824  // AVX-512 already does the blending at no extra cost (latency 11,
3825  // rthroughput 2 - same as compress plus store).
3826  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
3827  return CompressStore(v, m, d, unaligned);
3828  } else {
3829  const size_t count = CountTrue(d, m);
3830  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
3831 // Workaround for MSAN not marking output as initialized (b/233326619)
3832 #if HWY_IS_MSAN
3833  __msan_unpoison(unaligned, count * sizeof(T));
3834 #endif
3835  return count;
3836  }
3837 }
3838 
3839 // ------------------------------ CompressBitsStore
3840 template <typename T>
3841 HWY_API size_t CompressBitsStore(Vec512<T> v, const uint8_t* HWY_RESTRICT bits,
3842  Full512<T> d, T* HWY_RESTRICT unaligned) {
3843  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
3844 }
3845 
3846 // ------------------------------ LoadInterleaved3/4
3847 
3848 // Actually implemented in generic_ops; we just overload LoadTransposedBlocks3/4.
3849 namespace detail {
3850 
3851 // Type-safe wrapper.
3852 template <_MM_PERM_ENUM kPerm, typename T>
3853 Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
3854  return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)};
3855 }
3856 template <_MM_PERM_ENUM kPerm>
3857 Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
3858  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)};
3859 }
3860 template <_MM_PERM_ENUM kPerm>
3861 Vec512<double> Shuffle128(const Vec512<double> lo, const Vec512<double> hi) {
3862  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)};
3863 }
3864 
3865 // Input (128-bit blocks):
3866 // 3 2 1 0 (<- first block in unaligned)
3867 // 7 6 5 4
3868 // b a 9 8
3869 // Output:
3870 // 9 6 3 0 (LSB of A)
3871 // a 7 4 1
3872 // b 8 5 2
3873 template <typename T>
3874 HWY_API void LoadTransposedBlocks3(Full512<T> d,
3875  const T* HWY_RESTRICT unaligned,
3876  Vec512<T>& A, Vec512<T>& B, Vec512<T>& C) {
3877  constexpr size_t N = 64 / sizeof(T);
3878  const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
3879  const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
3880  const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
3881 
3882  const Vec512<T> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
3883  const Vec512<T> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
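  // Intermediate block layouts (MSB block first), matching the variable names:
  // v5421 = 5 4 2 1 and va976 = a 9 7 6. The final shuffles below then pick
  // every third block so A/B/C receive {9 6 3 0}, {a 7 4 1} and {b 8 5 2}.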
3884 
3885  A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
3886  B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
3887  C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);
3888 }
3889 
3890 // Input (128-bit blocks):
3891 // 3 2 1 0 (<- first block in unaligned)
3892 // 7 6 5 4
3893 // b a 9 8
3894 // f e d c
3895 // Output:
3896 // c 8 4 0 (LSB of A)
3897 // d 9 5 1
3898 // e a 6 2
3899 // f b 7 3
3900 template <typename T>
3901 HWY_API void LoadTransposedBlocks4(Full512<T> d,
3902  const T* HWY_RESTRICT unaligned,
3903  Vec512<T>& A, Vec512<T>& B, Vec512<T>& C,
3904  Vec512<T>& D) {
3905  constexpr size_t N = 64 / sizeof(T);
3906  const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
3907  const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
3908  const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
3909  const Vec512<T> vfedc = LoadU(d, unaligned + 3 * N);
3910 
3911  const Vec512<T> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
3912  const Vec512<T> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
3913  const Vec512<T> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
3914  const Vec512<T> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
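  // After this first pass (MSB block first): v5410 = 5 4 1 0, vdc98 = d c 9 8,
  // v7632 = 7 6 3 2, vfeba = f e b a. The second pass below selects alternating
  // blocks from these pairs, completing the 4x4 transpose of 128-bit blocks.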
3915  A = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
3916  B = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
3917  C = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
3918  D = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);
3919 }
3920 
3921 } // namespace detail
3922 
3923 // ------------------------------ StoreInterleaved2/3/4
3924 
3925 // Implemented in generic_ops; we just overload StoreTransposedBlocks2/3/4.
3926 
3927 namespace detail {
3928 
3929 // Input (128-bit blocks):
3930 // 6 4 2 0 (LSB of i)
3931 // 7 5 3 1
3932 // Output:
3933 // 3 2 1 0
3934 // 7 6 5 4
3935 template <typename T>
3936 HWY_API void StoreTransposedBlocks2(const Vec512<T> i, const Vec512<T> j,
3937  const Full512<T> d,
3938  T* HWY_RESTRICT unaligned) {
3939  constexpr size_t N = 64 / sizeof(T);
3940  const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
3941  const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
3942  const auto j1_i1_j0_i0 =
3943  detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
3944  const auto j3_i3_j2_i2 =
3945  detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
3946  StoreU(j1_i1_j0_i0, d, unaligned + 0 * N);
3947  StoreU(j3_i3_j2_i2, d, unaligned + 1 * N);
3948 }
3949 
3950 // Input (128-bit blocks):
3951 // 9 6 3 0 (LSB of i)
3952 // a 7 4 1
3953 // b 8 5 2
3954 // Output:
3955 // 3 2 1 0
3956 // 7 6 5 4
3957 // b a 9 8
3958 template <typename T>
3959 HWY_API void StoreTransposedBlocks3(const Vec512<T> i, const Vec512<T> j,
3960  const Vec512<T> k, Full512<T> d,
3961  T* HWY_RESTRICT unaligned) {
3962  constexpr size_t N = 64 / sizeof(T);
3963  const Vec512<T> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
3964  const Vec512<T> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
3965  const Vec512<T> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
3966 
3967  const Vec512<T> out0 = // i1 k0 j0 i0
3968  detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
3969  const Vec512<T> out1 = // j2 i2 k1 j1
3970  detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
3971  const Vec512<T> out2 = // k3 j3 i3 k2
3972  detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
3973 
3974  StoreU(out0, d, unaligned + 0 * N);
3975  StoreU(out1, d, unaligned + 1 * N);
3976  StoreU(out2, d, unaligned + 2 * N);
3977 }
3978 
3979 // Input (128-bit blocks):
3980 // c 8 4 0 (LSB of i)
3981 // d 9 5 1
3982 // e a 6 2
3983 // f b 7 3
3984 // Output:
3985 // 3 2 1 0
3986 // 7 6 5 4
3987 // b a 9 8
3988 // f e d c
3989 template <typename T>
3990 HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j,
3991  const Vec512<T> k, const Vec512<T> l,
3992  Full512<T> d, T* HWY_RESTRICT unaligned) {
3993  constexpr size_t N = 64 / sizeof(T);
3994  const Vec512<T> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
3995  const Vec512<T> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
3996  const Vec512<T> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
3997  const Vec512<T> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
3998  const Vec512<T> out0 =
3999  detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
4000  const Vec512<T> out1 =
4001  detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
4002  const Vec512<T> out2 =
4003  detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
4004  const Vec512<T> out3 =
4005  detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
4006  StoreU(out0, d, unaligned + 0 * N);
4007  StoreU(out1, d, unaligned + 1 * N);
4008  StoreU(out2, d, unaligned + 2 * N);
4009  StoreU(out3, d, unaligned + 3 * N);
4010 }
4011 
4012 } // namespace detail
4013 
4014 // ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
4015 
4016 HWY_API Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
4017  const Vec512<uint64_t> b) {
4018  const DFromV<decltype(a)> du64;
4019  const RepartitionToNarrow<decltype(du64)> du32;
4020  const auto maskL = Set(du64, 0xFFFFFFFFULL);
4021  const auto a32 = BitCast(du32, a);
4022  const auto b32 = BitCast(du32, b);
4023  // Inputs for MulEven: we only need the lower 32 bits
4024  const auto aH = Shuffle2301(a32);
4025  const auto bH = Shuffle2301(b32);
4026 
4027  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
4028  // the even (lower 64 bits of every 128-bit block) results. See
4029  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
4030  const auto aLbL = MulEven(a32, b32);
4031  const auto w3 = aLbL & maskL;
4032 
4033  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
4034  const auto w2 = t2 & maskL;
4035  const auto w1 = ShiftRight<32>(t2);
4036 
4037  const auto t = MulEven(a32, bH) + w2;
4038  const auto k = ShiftRight<32>(t);
4039 
4040  const auto mulH = MulEven(aH, bH) + w1 + k;
4041  const auto mulL = ShiftLeft<32>(t) + w3;
4042  return InterleaveLower(mulL, mulH);
4043 }
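// Scalar model of the 64x64 -> 128-bit multiply above (illustrative sketch for
// one lane; here `a` and `b` are plain uint64_t, not vectors):
//   const uint64_t aL = a & 0xFFFFFFFF, aH = a >> 32;
//   const uint64_t bL = b & 0xFFFFFFFF, bH = b >> 32;
//   const uint64_t aLbL = aL * bL;
//   const uint64_t w3 = aLbL & 0xFFFFFFFF;
//   const uint64_t t2 = aH * bL + (aLbL >> 32);
//   const uint64_t w2 = t2 & 0xFFFFFFFF, w1 = t2 >> 32;
//   const uint64_t t = aL * bH + w2;
//   const uint64_t k = t >> 32;
//   const uint64_t hi = aH * bH + w1 + k;  // upper 64 bits of a * b
//   const uint64_t lo = (t << 32) + w3;    // lower 64 bits of a * b
// MulEven stores these (lo, hi) pairs for the even 64-bit lanes via
// InterleaveLower; MulOdd (below) does the same for the odd lanes via
// InterleaveUpper.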
4044 
4045 HWY_API Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
4046  const Vec512<uint64_t> b) {
4047  const DFromV<decltype(a)> du64;
4048  const RepartitionToNarrow<decltype(du64)> du32;
4049  const auto maskL = Set(du64, 0xFFFFFFFFULL);
4050  const auto a32 = BitCast(du32, a);
4051  const auto b32 = BitCast(du32, b);
4052  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
4053  const auto aH = Shuffle2301(a32);
4054  const auto bH = Shuffle2301(b32);
4055 
4056  // Same as above, but we're using the odd results (upper 64 bits per block).
4057  const auto aLbL = MulEven(a32, b32);
4058  const auto w3 = aLbL & maskL;
4059 
4060  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
4061  const auto w2 = t2 & maskL;
4062  const auto w1 = ShiftRight<32>(t2);
4063 
4064  const auto t = MulEven(a32, bH) + w2;
4065  const auto k = ShiftRight<32>(t);
4066 
4067  const auto mulH = MulEven(aH, bH) + w1 + k;
4068  const auto mulL = ShiftLeft<32>(t) + w3;
4069  return InterleaveUpper(du64, mulL, mulH);
4070 }
4071 
4072 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4073 
4074 HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
4075  Vec512<bfloat16_t> a,
4076  Vec512<bfloat16_t> b,
4077  const Vec512<float> sum0,
4078  Vec512<float>& sum1) {
4079  // TODO(janwas): _mm512_dpbf16_ps when available
4080  const Repartition<uint16_t, decltype(df32)> du16;
4081  const RebindToUnsigned<decltype(df32)> du32;
4082  const Vec512<uint16_t> zero = Zero(du16);
4083  // Lane order within sum0/1 is undefined, hence we can avoid the
4084  // longer-latency lane-crossing PromoteTo.
4085  const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
4086  const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4087  const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
4088  const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4089  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
4090  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
4091 }
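// Note on the widening above (illustrative): a bfloat16 value is the upper 16
// bits of the corresponding binary32, so zipping a zero uint16_t below each
// bf16 lane yields uint32_t lanes whose bit pattern is already the desired
// float, e.g. bf16 0x3F80 becomes u32 0x3F800000 == 1.0f. This is why the
// longer-latency lane-crossing PromoteTo can be avoided here.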
4092 
4093 // ------------------------------ Reductions
4094 
4095 // Returns the sum in each lane.
4096 HWY_API Vec512<int32_t> SumOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4097  return Set(d, _mm512_reduce_add_epi32(v.raw));
4098 }
4099 HWY_API Vec512<int64_t> SumOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4100  return Set(d, _mm512_reduce_add_epi64(v.raw));
4101 }
4102 HWY_API Vec512<uint32_t> SumOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4103  return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
4104 }
4105 HWY_API Vec512<uint64_t> SumOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4106  return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
4107 }
4108 HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
4109  return Set(d, _mm512_reduce_add_ps(v.raw));
4110 }
4111 HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
4112  return Set(d, _mm512_reduce_add_pd(v.raw));
4113 }
4114 
4115 // Returns the minimum in each lane.
4116 HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4117  return Set(d, _mm512_reduce_min_epi32(v.raw));
4118 }
4119 HWY_API Vec512<int64_t> MinOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4120  return Set(d, _mm512_reduce_min_epi64(v.raw));
4121 }
4122 HWY_API Vec512<uint32_t> MinOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4123  return Set(d, _mm512_reduce_min_epu32(v.raw));
4124 }
4125 HWY_API Vec512<uint64_t> MinOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4126  return Set(d, _mm512_reduce_min_epu64(v.raw));
4127 }
4128 HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
4129  return Set(d, _mm512_reduce_min_ps(v.raw));
4130 }
4131 HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
4132  return Set(d, _mm512_reduce_min_pd(v.raw));
4133 }
4134 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4135 HWY_API Vec512<T> MinOfLanes(Full512<T> d, Vec512<T> v) {
4136  const Repartition<int32_t, decltype(d)> d32;
4137  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4138  const auto odd = ShiftRight<16>(BitCast(d32, v));
4139  const auto min = MinOfLanes(d32, Min(even, odd));
4140  // Also broadcast into odd lanes.
4141  return BitCast(d, Or(min, ShiftLeft<16>(min)));
4142 }
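// How the 16-bit reduction above works (illustrative): there is no 16-bit
// reduce intrinsic, so the even 16-bit halves are masked into the low bits of
// each 32-bit lane and the odd halves are shifted down; Min combines the two,
// MinOfLanes reduces over the 32-bit lanes, and the final Or/ShiftLeft
// broadcasts the 16-bit result back into both halves of every 32-bit lane.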
4143 
4144 // Returns the maximum in each lane.
4145 HWY_API Vec512<int32_t> MaxOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4146  return Set(d, _mm512_reduce_max_epi32(v.raw));
4147 }
4148 HWY_API Vec512<int64_t> MaxOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4149  return Set(d, _mm512_reduce_max_epi64(v.raw));
4150 }
4151 HWY_API Vec512<uint32_t> MaxOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4152  return Set(d, _mm512_reduce_max_epu32(v.raw));
4153 }
4154 HWY_API Vec512<uint64_t> MaxOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4155  return Set(d, _mm512_reduce_max_epu64(v.raw));
4156 }
4157 HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
4158  return Set(d, _mm512_reduce_max_ps(v.raw));
4159 }
4160 HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
4161  return Set(d, _mm512_reduce_max_pd(v.raw));
4162 }
4163 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4164 HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
4165  const Repartition<int32_t, decltype(d)> d32;
4166  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4167  const auto odd = ShiftRight<16>(BitCast(d32, v));
4168  const auto max = MaxOfLanes(d32, Max(even, odd));
4169  // Also broadcast into odd lanes.
4170  return BitCast(d, Or(max, ShiftLeft<16>(max)));
4171 }
4172 
4173 // NOLINTNEXTLINE(google-readability-namespace-comments)
4174 } // namespace HWY_NAMESPACE
4175 } // namespace hwy
4176 HWY_AFTER_NAMESPACE();
4177 
4178 // Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
4179 // the warning seems to be issued at the call site of intrinsics, i.e. our code.
4180 HWY_DIAGNOSTICS(pop)