x86_128-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
17 // operations when compiling for those targets.
18 // External include guard in highway.h - see comment there.
19 
20 #include <emmintrin.h>
21 #include <stdio.h>
22 #if HWY_TARGET == HWY_SSSE3
23 #include <tmmintrin.h> // SSSE3
24 #else
25 #include <smmintrin.h> // SSE4
26 #include <wmmintrin.h> // CLMUL
27 #endif
28 #include <stddef.h>
29 #include <stdint.h>
30 
31 #include "hwy/base.h"
32 #include "hwy/ops/shared-inl.h"
33 
34 #if HWY_IS_MSAN
35 #include <sanitizer/msan_interface.h>
36 #endif
37 
38 HWY_BEFORE_NAMESPACE();
39 namespace hwy {
40 namespace HWY_NAMESPACE {
41 
42 #if HWY_TARGET <= HWY_AVX2
43 template <typename T>
44 using Full256 = Simd<T, 32 / sizeof(T), 0>;
45 #endif
46 
47 #if HWY_TARGET <= HWY_AVX3
48 template <typename T>
49 using Full512 = Simd<T, 64 / sizeof(T), 0>;
50 #endif
51 
52 namespace detail {
53 
54 template <typename T>
55 struct Raw128 {
56  using type = __m128i;
57 };
58 template <>
59 struct Raw128<float> {
60  using type = __m128;
61 };
62 template <>
63 struct Raw128<double> {
64  using type = __m128d;
65 };
66 
67 } // namespace detail
68 
69 template <typename T, size_t N = 16 / sizeof(T)>
70 class Vec128 {
71  using Raw = typename detail::Raw128<T>::type;
72 
73  public:
74  // Compound assignment. Only usable if there is a corresponding non-member
75  // binary operator overload. For example, only f32 and f64 support division.
76  HWY_INLINE Vec128& operator*=(const Vec128 other) {
77  return *this = (*this * other);
78  }
79  HWY_INLINE Vec128& operator/=(const Vec128 other) {
80  return *this = (*this / other);
81  }
82  HWY_INLINE Vec128& operator+=(const Vec128 other) {
83  return *this = (*this + other);
84  }
85  HWY_INLINE Vec128& operator-=(const Vec128 other) {
86  return *this = (*this - other);
87  }
88  HWY_INLINE Vec128& operator&=(const Vec128 other) {
89  return *this = (*this & other);
90  }
91  HWY_INLINE Vec128& operator|=(const Vec128 other) {
92  return *this = (*this | other);
93  }
94  HWY_INLINE Vec128& operator^=(const Vec128 other) {
95  return *this = (*this ^ other);
96  }
97 
98  Raw raw;
99 };
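// Illustrative usage sketch (not part of the header): vectors are created by
// the factory functions below (Set, Zero, ...) rather than constructed
// directly, and the compound assignments defer to the binary operators
// defined later in this file.
//   const Simd<float, 4, 0> d;
//   Vec128<float, 4> v = Set(d, 1.0f);
//   v += Set(d, 2.0f);  // same as v = v + Set(d, 2.0f)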
100 
101 template <typename T>
102 using Vec64 = Vec128<T, 8 / sizeof(T)>;
103 
104 template <typename T>
105 using Vec32 = Vec128<T, 4 / sizeof(T)>;
106 
107 #if HWY_TARGET <= HWY_AVX3
108 
109 // Forward-declare for use by DeduceD, see below.
110 template <typename T>
111 class Vec512;
112 
113 namespace detail {
114 
115 // Template arg: sizeof(lane type)
116 template <size_t size>
117 struct RawMask128 {};
118 template <>
119 struct RawMask128<1> {
120  using type = __mmask16;
121 };
122 template <>
123 struct RawMask128<2> {
124  using type = __mmask8;
125 };
126 template <>
127 struct RawMask128<4> {
128  using type = __mmask8;
129 };
130 template <>
131 struct RawMask128<8> {
132  using type = __mmask8;
133 };
134 
135 } // namespace detail
136 
137 template <typename T, size_t N = 16 / sizeof(T)>
138 struct Mask128 {
139  using Raw = typename detail::RawMask128<sizeof(T)>::type;
140 
141  static Mask128<T, N> FromBits(uint64_t mask_bits) {
142  return Mask128<T, N>{static_cast<Raw>(mask_bits)};
143  }
144 
145  Raw raw;
146 };
147 
148 #else // AVX2 or below
149 
150 // FF..FF or 0.
151 template <typename T, size_t N = 16 / sizeof(T)>
152 struct Mask128 {
153  typename detail::Raw128<T>::type raw;
154 };
155 
156 #endif // HWY_TARGET <= HWY_AVX3
157 
158 #if HWY_TARGET <= HWY_AVX2
159 // Forward-declare for use by DeduceD, see below.
160 template <typename T>
161 class Vec256;
162 #endif
163 
164 namespace detail {
165 
166 // Deduce Simd<T, N, 0> from Vec*<T, N> (pointers because Vec256/512 may be
167 // incomplete types at this point; this is simpler than avoiding multiple
168 // definitions of DFromV via #if)
169 struct DeduceD {
170  template <typename T, size_t N>
171  Simd<T, N, 0> operator()(const Vec128<T, N>*) const {
172  return Simd<T, N, 0>();
173  }
174 #if HWY_TARGET <= HWY_AVX2
175  template <typename T>
176  Full256<T> operator()(const Vec256<T>*) const {
177  return Full256<T>();
178  }
179 #endif
180 #if HWY_TARGET <= HWY_AVX3
181  template <typename T>
182  Full512<T> operator()(const Vec512<T>*) const {
183  return Full512<T>();
184  }
185 #endif
186 };
187 
188 // Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
189 template <class V>
190 struct ExpandDFromV {
191  using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
192 };
193 
194 } // namespace detail
195 
196 template <class V>
197 using DFromV = typename detail::ExpandDFromV<V>::type;
198 
199 template <class V>
200 using TFromV = TFromD<DFromV<V>>;
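// Informative example: DFromV<Vec128<int32_t, 4>> is Simd<int32_t, 4, 0> and
// TFromV<Vec128<int32_t, 4>> is int32_t; this is how the ops below recover
// the descriptor and lane type from a vector argument.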
201 
202 // ------------------------------ BitCast
203 
204 namespace detail {
205 
206 HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
207 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
208 HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
209 
210 template <typename T, size_t N>
211 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
212  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
213 }
214 
215 // Cannot rely on function overloading because return types differ.
216 template <typename T>
217 struct BitCastFromInteger128 {
218  HWY_INLINE __m128i operator()(__m128i v) { return v; }
219 };
220 template <>
221 struct BitCastFromInteger128<float> {
222  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
223 };
224 template <>
225 struct BitCastFromInteger128<double> {
226  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
227 };
228 
229 template <typename T, size_t N>
230 HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
231  Vec128<uint8_t, N * sizeof(T)> v) {
232  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
233 }
234 
235 } // namespace detail
236 
237 template <typename T, size_t N, typename FromT>
238 HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
239  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
240  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
241 }
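// Illustrative sketch: BitCast reinterprets the 128 bits without changing
// them. Assuming the usual descriptor usage:
//   const Simd<uint32_t, 4, 0> du;
//   const Simd<float, 4, 0> df;
//   const Vec128<uint32_t, 4> bits = BitCast(du, Set(df, 1.0f));
//   // each lane of `bits` now holds 0x3F800000, the encoding of 1.0f.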
242 
243 // ------------------------------ Zero
244 
245 // Returns an all-zero vector/part.
246 template <typename T, size_t N, HWY_IF_LE128(T, N)>
247 HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
248  return Vec128<T, N>{_mm_setzero_si128()};
249 }
250 template <size_t N, HWY_IF_LE128(float, N)>
251 HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
252  return Vec128<float, N>{_mm_setzero_ps()};
253 }
254 template <size_t N, HWY_IF_LE128(double, N)>
255 HWY_API Vec128<double, N> Zero(Simd<double, N, 0> /* tag */) {
256  return Vec128<double, N>{_mm_setzero_pd()};
257 }
258 
259 template <class D>
260 using VFromD = decltype(Zero(D()));
261 
262 // ------------------------------ Set
263 
264 // Returns a vector/part with all lanes set to "t".
265 template <size_t N, HWY_IF_LE128(uint8_t, N)>
266 HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
267  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
268 }
269 template <size_t N, HWY_IF_LE128(uint16_t, N)>
270 HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
271  const uint16_t t) {
272  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
273 }
274 template <size_t N, HWY_IF_LE128(uint32_t, N)>
275 HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
276  const uint32_t t) {
277  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
278 }
279 template <size_t N, HWY_IF_LE128(uint64_t, N)>
280 HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
281  const uint64_t t) {
282  return Vec128<uint64_t, N>{
283  _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
284 }
285 template <size_t N, HWY_IF_LE128(int8_t, N)>
286 HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
287  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
288 }
289 template <size_t N, HWY_IF_LE128(int16_t, N)>
290 HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
291  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
292 }
293 template <size_t N, HWY_IF_LE128(int32_t, N)>
294 HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
295  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
296 }
297 template <size_t N, HWY_IF_LE128(int64_t, N)>
298 HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
299  return Vec128<int64_t, N>{
300  _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
301 }
302 template <size_t N, HWY_IF_LE128(float, N)>
303 HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
304  return Vec128<float, N>{_mm_set1_ps(t)};
305 }
306 template <size_t N, HWY_IF_LE128(double, N)>
307 HWY_API Vec128<double, N> Set(Simd<double, N, 0> /* tag */, const double t) {
308  return Vec128<double, N>{_mm_set1_pd(t)};
309 }
310 
311 HWY_DIAGNOSTICS(push)
312 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
313 
314 // Returns a vector with uninitialized elements.
315 template <typename T, size_t N, HWY_IF_LE128(T, N)>
316 HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /* tag */) {
317  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
318  // generate an XOR instruction.
319  return Vec128<T, N>{_mm_undefined_si128()};
320 }
321 template <size_t N, HWY_IF_LE128(float, N)>
322 HWY_API Vec128<float, N> Undefined(Simd<float, N, 0> /* tag */) {
323  return Vec128<float, N>{_mm_undefined_ps()};
324 }
325 template <size_t N, HWY_IF_LE128(double, N)>
326 HWY_API Vec128<double, N> Undefined(Simd<double, N, 0> /* tag */) {
327  return Vec128<double, N>{_mm_undefined_pd()};
328 }
329 
330 HWY_DIAGNOSTICS(pop)
331 
332 // ------------------------------ GetLane
333 
334 // Gets the single value stored in a vector/part.
335 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
336 HWY_API T GetLane(const Vec128<T, N> v) {
337  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
338 }
339 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
340 HWY_API T GetLane(const Vec128<T, N> v) {
341  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
342 }
343 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
344 HWY_API T GetLane(const Vec128<T, N> v) {
345  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
346 }
347 template <size_t N>
348 HWY_API float GetLane(const Vec128<float, N> v) {
349  return _mm_cvtss_f32(v.raw);
350 }
351 template <size_t N>
352 HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
353 #if HWY_ARCH_X86_32
354  alignas(16) uint64_t lanes[2];
355  Store(v, Simd<uint64_t, N, 0>(), lanes);
356  return lanes[0];
357 #else
358  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
359 #endif
360 }
361 template <size_t N>
362 HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
363 #if HWY_ARCH_X86_32
364  alignas(16) int64_t lanes[2];
365  Store(v, Simd<int64_t, N, 0>(), lanes);
366  return lanes[0];
367 #else
368  return _mm_cvtsi128_si64(v.raw);
369 #endif
370 }
371 template <size_t N>
372 HWY_API double GetLane(const Vec128<double, N> v) {
373  return _mm_cvtsd_f64(v.raw);
374 }
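// Illustrative sketch: GetLane reads lane 0, so round-tripping a scalar is
//   const Simd<int32_t, 4, 0> d;
//   const int32_t first = GetLane(Set(d, 7));  // first == 7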
375 
376 // ================================================== LOGICAL
377 
378 // ------------------------------ And
379 
380 template <typename T, size_t N>
381 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
382  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
383 }
384 template <size_t N>
385 HWY_API Vec128<float, N> And(const Vec128<float, N> a,
386  const Vec128<float, N> b) {
387  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
388 }
389 template <size_t N>
390 HWY_API Vec128<double, N> And(const Vec128<double, N> a,
391  const Vec128<double, N> b) {
392  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
393 }
394 
395 // ------------------------------ AndNot
396 
397 // Returns ~not_mask & mask.
398 template <typename T, size_t N>
399 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
400  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
401 }
402 template <size_t N>
403 HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
404  const Vec128<float, N> mask) {
405  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
406 }
407 template <size_t N>
408 HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
409  const Vec128<double, N> mask) {
410  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
411 }
412 
413 // ------------------------------ Or
414 
415 template <typename T, size_t N>
416 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
417  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
418 }
419 
420 template <size_t N>
421 HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
422  const Vec128<float, N> b) {
423  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
424 }
425 template <size_t N>
426 HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
427  const Vec128<double, N> b) {
428  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
429 }
430 
431 // ------------------------------ Xor
432 
433 template <typename T, size_t N>
434 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
435  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
436 }
437 
438 template <size_t N>
439 HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
440  const Vec128<float, N> b) {
441  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
442 }
443 template <size_t N>
444 HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
445  const Vec128<double, N> b) {
446  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
447 }
448 
449 // ------------------------------ Not
450 
451 template <typename T, size_t N>
452 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
453  const DFromV<decltype(v)> d;
454  const RebindToUnsigned<decltype(d)> du;
455  using VU = VFromD<decltype(du)>;
456 #if HWY_TARGET <= HWY_AVX3
457  const __m128i vu = BitCast(du, v).raw;
458  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
459 #else
460  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
461 #endif
462 }
463 
464 // ------------------------------ Or3
465 
466 template <typename T, size_t N>
467 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
468 #if HWY_TARGET <= HWY_AVX3
469  const DFromV<decltype(o1)> d;
470  const RebindToUnsigned<decltype(d)> du;
471  using VU = VFromD<decltype(du)>;
472  const __m128i ret = _mm_ternarylogic_epi64(
473  BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
474  return BitCast(d, VU{ret});
475 #else
476  return Or(o1, Or(o2, o3));
477 #endif
478 }
479 
480 // ------------------------------ OrAnd
481 
482 template <typename T, size_t N>
483 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
484 #if HWY_TARGET <= HWY_AVX3
485  const DFromV<decltype(o)> d;
486  const RebindToUnsigned<decltype(d)> du;
487  using VU = VFromD<decltype(du)>;
488  const __m128i ret = _mm_ternarylogic_epi64(
489  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
490  return BitCast(d, VU{ret});
491 #else
492  return Or(o, And(a1, a2));
493 #endif
494 }
495 
496 // ------------------------------ IfVecThenElse
497 
498 template <typename T, size_t N>
499 HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
500  Vec128<T, N> no) {
501 #if HWY_TARGET <= HWY_AVX3
502  const DFromV<decltype(no)> d;
503  const RebindToUnsigned<decltype(d)> du;
504  using VU = VFromD<decltype(du)>;
505  return BitCast(
506  d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
507  BitCast(du, no).raw, 0xCA)});
508 #else
509  return IfThenElse(MaskFromVec(mask), yes, no);
510 #endif
511 }
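// Informative note on the _mm_ternarylogic_epi* constants used above: the
// immediate is an 8-entry truth table indexed by (a << 2) | (b << 1) | c for
// the three operand bits a, b, c. Hence 0x55 is NOT of the third operand
// (Not passes the same register three times), 0xFE is a | b | c (Or3),
// 0xF8 is a | (b & c) (OrAnd) and 0xCA selects a ? b : c (IfVecThenElse).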
512 
513 // ------------------------------ Operator overloads (internal-only if float)
514 
515 template <typename T, size_t N>
516 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
517  return And(a, b);
518 }
519 
520 template <typename T, size_t N>
521 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
522  return Or(a, b);
523 }
524 
525 template <typename T, size_t N>
526 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
527  return Xor(a, b);
528 }
529 
530 // ------------------------------ PopulationCount
531 
532 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
533 #if HWY_TARGET == HWY_AVX3_DL
534 
535 #ifdef HWY_NATIVE_POPCNT
536 #undef HWY_NATIVE_POPCNT
537 #else
538 #define HWY_NATIVE_POPCNT
539 #endif
540 
541 namespace detail {
542 
543 template <typename T, size_t N>
544 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
545  Vec128<T, N> v) {
546  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
547 }
548 template <typename T, size_t N>
549 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
550  Vec128<T, N> v) {
551  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
552 }
553 template <typename T, size_t N>
554 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
555  Vec128<T, N> v) {
556  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
557 }
558 template <typename T, size_t N>
559 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
560  Vec128<T, N> v) {
561  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
562 }
563 
564 } // namespace detail
565 
566 template <typename T, size_t N>
567 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
568  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
569 }
570 
571 #endif // HWY_TARGET == HWY_AVX3_DL
572 
573 // ================================================== SIGN
574 
575 // ------------------------------ Neg
576 
577 template <typename T, size_t N, HWY_IF_FLOAT(T)>
578 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
579  return Xor(v, SignBit(DFromV<decltype(v)>()));
580 }
581 
582 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
583 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
584  return Zero(DFromV<decltype(v)>()) - v;
585 }
586 
587 // ------------------------------ Abs
588 
589 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
590 template <size_t N>
591 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
592 #if HWY_COMPILER_MSVC
593  // Workaround for incorrect codegen? (reaches breakpoint)
594  const auto zero = Zero(DFromV<decltype(v)>());
595  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
596 #else
597  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
598 #endif
599 }
600 template <size_t N>
601 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
602  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
603 }
604 template <size_t N>
605 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
606  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
607 }
608 // i64 is implemented after BroadcastSignBit.
609 template <size_t N>
610 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
611  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
612  return v & BitCast(DFromV<decltype(v)>(), mask);
613 }
614 template <size_t N>
615 HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
616  const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
617  return v & BitCast(DFromV<decltype(v)>(), mask);
618 }
619 
620 // ------------------------------ CopySign
621 
622 template <typename T, size_t N>
623 HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
624  const Vec128<T, N> sign) {
625  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
626 
627  const DFromV<decltype(magn)> d;
628  const auto msb = SignBit(d);
629 
630 #if HWY_TARGET <= HWY_AVX3
631  const RebindToUnsigned<decltype(d)> du;
632  // Truth table for msb, magn, sign | bitwise msb ? sign : magn
633  // 0 0 0 | 0
634  // 0 0 1 | 0
635  // 0 1 0 | 1
636  // 0 1 1 | 1
637  // 1 0 0 | 0
638  // 1 0 1 | 1
639  // 1 1 0 | 0
640  // 1 1 1 | 1
641  // The lane size does not matter because we are not using predication.
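// Reading the output column from row (1,1,1) down to (0,0,0) gives
// 0b10101100 = 0xAC (truth-table index (msb << 2) | (magn << 1) | sign).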
642  const __m128i out = _mm_ternarylogic_epi32(
643  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
644  return BitCast(d, VFromD<decltype(du)>{out});
645 #else
646  return Or(AndNot(msb, magn), And(msb, sign));
647 #endif
648 }
649 
650 template <typename T, size_t N>
651 HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
652  const Vec128<T, N> sign) {
653 #if HWY_TARGET <= HWY_AVX3
654  // AVX3 can also handle abs < 0, so no extra action needed.
655  return CopySign(abs, sign);
656 #else
657  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
658 #endif
659 }
660 
661 // ================================================== MASK
662 
663 #if HWY_TARGET <= HWY_AVX3
664 
665 // ------------------------------ IfThenElse
666 
667 // Returns mask ? b : a.
668 
669 namespace detail {
670 
671 // Templates for signed/unsigned integer of a particular size.
672 template <typename T, size_t N>
673 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
674  Mask128<T, N> mask, Vec128<T, N> yes,
675  Vec128<T, N> no) {
676  return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
677 }
678 template <typename T, size_t N>
679 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
680  Mask128<T, N> mask, Vec128<T, N> yes,
681  Vec128<T, N> no) {
682  return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
683 }
684 template <typename T, size_t N>
685 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
686  Mask128<T, N> mask, Vec128<T, N> yes,
687  Vec128<T, N> no) {
688  return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
689 }
690 template <typename T, size_t N>
691 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
692  Mask128<T, N> mask, Vec128<T, N> yes,
693  Vec128<T, N> no) {
694  return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
695 }
696 
697 } // namespace detail
698 
699 template <typename T, size_t N>
700 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
701  Vec128<T, N> no) {
702  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
703 }
704 
705 template <size_t N>
706 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
707  Vec128<float, N> yes, Vec128<float, N> no) {
708  return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
709 }
710 
711 template <size_t N>
712 HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
713  Vec128<double, N> yes,
714  Vec128<double, N> no) {
715  return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
716 }
717 
718 namespace detail {
719 
720 template <typename T, size_t N>
721 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
722  Mask128<T, N> mask, Vec128<T, N> yes) {
723  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
724 }
725 template <typename T, size_t N>
726 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
727  Mask128<T, N> mask, Vec128<T, N> yes) {
728  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
729 }
730 template <typename T, size_t N>
731 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
732  Mask128<T, N> mask, Vec128<T, N> yes) {
733  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
734 }
735 template <typename T, size_t N>
736 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
737  Mask128<T, N> mask, Vec128<T, N> yes) {
738  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
739 }
740 
741 } // namespace detail
742 
743 template <typename T, size_t N>
744 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
745  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
746 }
747 
748 template <size_t N>
749 HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
750  Vec128<float, N> yes) {
751  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
752 }
753 
754 template <size_t N>
755 HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
756  Vec128<double, N> yes) {
757  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
758 }
759 
760 namespace detail {
761 
762 template <typename T, size_t N>
763 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
764  Mask128<T, N> mask, Vec128<T, N> no) {
765  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
766  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
767 }
768 template <typename T, size_t N>
769 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
770  Mask128<T, N> mask, Vec128<T, N> no) {
771  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
772 }
773 template <typename T, size_t N>
774 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
775  Mask128<T, N> mask, Vec128<T, N> no) {
776  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
777 }
778 template <typename T, size_t N>
779 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
780  Mask128<T, N> mask, Vec128<T, N> no) {
781  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
782 }
783 
784 } // namespace detail
785 
786 template <typename T, size_t N>
787 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
788  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
789 }
790 
791 template <size_t N>
792 HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
793  Vec128<float, N> no) {
794  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
795 }
796 
797 template <size_t N>
798 HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
799  Vec128<double, N> no) {
800  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
801 }
802 
803 // ------------------------------ Mask logical
804 
805 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
806 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
807 #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
808  HWY_COMPILER_CLANG >= 800
809 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
810 #else
811 #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
812 #endif
813 #endif // HWY_COMPILER_HAS_MASK_INTRINSICS
814 
815 namespace detail {
816 
817 template <typename T, size_t N>
818 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
819  const Mask128<T, N> b) {
820 #if HWY_COMPILER_HAS_MASK_INTRINSICS
821  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
822 #else
823  return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
824 #endif
825 }
826 template <typename T, size_t N>
827 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
828  const Mask128<T, N> b) {
829 #if HWY_COMPILER_HAS_MASK_INTRINSICS
830  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
831 #else
832  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
833 #endif
834 }
835 template <typename T, size_t N>
836 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
837  const Mask128<T, N> b) {
838 #if HWY_COMPILER_HAS_MASK_INTRINSICS
839  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
840 #else
841  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
842 #endif
843 }
844 template <typename T, size_t N>
845 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
846  const Mask128<T, N> b) {
847 #if HWY_COMPILER_HAS_MASK_INTRINSICS
848  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
849 #else
850  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
851 #endif
852 }
853 
854 template <typename T, size_t N>
855 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
856  const Mask128<T, N> b) {
857 #if HWY_COMPILER_HAS_MASK_INTRINSICS
858  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
859 #else
860  return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
861 #endif
862 }
863 template <typename T, size_t N>
864 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
865  const Mask128<T, N> b) {
866 #if HWY_COMPILER_HAS_MASK_INTRINSICS
867  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
868 #else
869  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
870 #endif
871 }
872 template <typename T, size_t N>
873 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
874  const Mask128<T, N> b) {
875 #if HWY_COMPILER_HAS_MASK_INTRINSICS
876  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
877 #else
878  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
879 #endif
880 }
881 template <typename T, size_t N>
882 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
883  const Mask128<T, N> b) {
884 #if HWY_COMPILER_HAS_MASK_INTRINSICS
885  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
886 #else
887  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
888 #endif
889 }
890 
891 template <typename T, size_t N>
892 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
893  const Mask128<T, N> b) {
894 #if HWY_COMPILER_HAS_MASK_INTRINSICS
895  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
896 #else
897  return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
898 #endif
899 }
900 template <typename T, size_t N>
901 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
902  const Mask128<T, N> b) {
903 #if HWY_COMPILER_HAS_MASK_INTRINSICS
904  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
905 #else
906  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
907 #endif
908 }
909 template <typename T, size_t N>
910 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
911  const Mask128<T, N> b) {
912 #if HWY_COMPILER_HAS_MASK_INTRINSICS
913  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
914 #else
915  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
916 #endif
917 }
918 template <typename T, size_t N>
919 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
920  const Mask128<T, N> b) {
921 #if HWY_COMPILER_HAS_MASK_INTRINSICS
922  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
923 #else
924  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
925 #endif
926 }
927 
928 template <typename T, size_t N>
929 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
930  const Mask128<T, N> b) {
931 #if HWY_COMPILER_HAS_MASK_INTRINSICS
932  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
933 #else
934  return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
935 #endif
936 }
937 template <typename T, size_t N>
938 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
939  const Mask128<T, N> b) {
940 #if HWY_COMPILER_HAS_MASK_INTRINSICS
941  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
942 #else
943  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
944 #endif
945 }
946 template <typename T, size_t N>
947 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
948  const Mask128<T, N> b) {
949 #if HWY_COMPILER_HAS_MASK_INTRINSICS
950  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
951 #else
952  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
953 #endif
954 }
955 template <typename T, size_t N>
956 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
957  const Mask128<T, N> b) {
958 #if HWY_COMPILER_HAS_MASK_INTRINSICS
959  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
960 #else
961  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
962 #endif
963 }
964 
965 } // namespace detail
966 
967 template <typename T, size_t N>
968 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
969  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
970 }
971 
972 template <typename T, size_t N>
973 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
974  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
975 }
976 
977 template <typename T, size_t N>
978 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
979  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
980 }
981 
982 template <typename T, size_t N>
983 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
984  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
985 }
986 
987 template <typename T, size_t N>
988 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
989  // Flip only the valid bits.
990  // TODO(janwas): use _knot intrinsics if N >= 8.
991  return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
992 }
993 
994 #else // AVX2 or below
995 
996 // ------------------------------ Mask
997 
998 // Mask and Vec are the same (true = FF..FF).
999 template <typename T, size_t N>
1000 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1001  return Mask128<T, N>{v.raw};
1002 }
1003 
1004 template <typename T, size_t N>
1005 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1006  return Vec128<T, N>{v.raw};
1007 }
1008 
1009 template <typename T, size_t N>
1010 HWY_API Vec128<T, N> VecFromMask(const Simd<T, N, 0> /* tag */,
1011  const Mask128<T, N> v) {
1012  return Vec128<T, N>{v.raw};
1013 }
1014 
1015 #if HWY_TARGET == HWY_SSSE3
1016 
1017 // mask ? yes : no
1018 template <typename T, size_t N>
1019 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1020  Vec128<T, N> no) {
1021  const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
1022  return Or(And(vmask, yes), AndNot(vmask, no));
1023 }
1024 
1025 #else // HWY_TARGET == HWY_SSSE3
1026 
1027 // mask ? yes : no
1028 template <typename T, size_t N>
1029 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1030  Vec128<T, N> no) {
1031  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
1032 }
1033 template <size_t N>
1034 HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
1035  const Vec128<float, N> yes,
1036  const Vec128<float, N> no) {
1037  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
1038 }
1039 template <size_t N>
1040 HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
1041  const Vec128<double, N> yes,
1042  const Vec128<double, N> no) {
1043  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
1044 }
1045 
1046 #endif // HWY_TARGET == HWY_SSSE3
1047 
1048 // mask ? yes : 0
1049 template <typename T, size_t N>
1050 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1051  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
1052 }
1053 
1054 // mask ? 0 : no
1055 template <typename T, size_t N>
1056 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1057  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
1058 }
1059 
1060 // ------------------------------ Mask logical
1061 
1062 template <typename T, size_t N>
1063 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1064  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
1065 }
1066 
1067 template <typename T, size_t N>
1068 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1069  const Simd<T, N, 0> d;
1070  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1071 }
1072 
1073 template <typename T, size_t N>
1074 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1075  const Simd<T, N, 0> d;
1076  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1077 }
1078 
1079 template <typename T, size_t N>
1080 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1081  const Simd<T, N, 0> d;
1082  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1083 }
1084 
1085 template <typename T, size_t N>
1086 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1087  const Simd<T, N, 0> d;
1088  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1089 }
1090 
1091 #endif // HWY_TARGET <= HWY_AVX3
1092 
1093 // ------------------------------ ShiftLeft
1094 
1095 template <int kBits, size_t N>
1096 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
1097  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1098 }
1099 
1100 template <int kBits, size_t N>
1101 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
1102  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1103 }
1104 
1105 template <int kBits, size_t N>
1106 HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
1107  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1108 }
1109 
1110 template <int kBits, size_t N>
1111 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
1112  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1113 }
1114 template <int kBits, size_t N>
1115 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
1116  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1117 }
1118 template <int kBits, size_t N>
1119 HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
1120  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1121 }
1122 
1123 template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1124 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
1125  const DFromV<decltype(v)> d8;
1126  // Use raw instead of BitCast to support N=1.
1127  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
1128  return kBits == 1
1129  ? (v + v)
1130  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1131 }
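// Informative example: for kBits = 3, each 8-bit lane is shifted as part of a
// 16-bit lane and then masked with (0xFF << 3) & 0xFF = 0xF8, which discards
// the bits that crossed into the neighboring byte.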
1132 
1133 // ------------------------------ ShiftRight
1134 
1135 template <int kBits, size_t N>
1136 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
1137  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
1138 }
1139 template <int kBits, size_t N>
1140 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
1141  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
1142 }
1143 template <int kBits, size_t N>
1144 HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
1145  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
1146 }
1147 
1148 template <int kBits, size_t N>
1149 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
1150  const DFromV<decltype(v)> d8;
1151  // Use raw instead of BitCast to support N=1.
1152  const Vec128<uint8_t, N> shifted{
1153  ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
1154  return shifted & Set(d8, 0xFF >> kBits);
1155 }
1156 
1157 template <int kBits, size_t N>
1158 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
1159  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
1160 }
1161 template <int kBits, size_t N>
1162 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
1163  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
1164 }
1165 
1166 template <int kBits, size_t N>
1167 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
1168  const DFromV<decltype(v)> di;
1169  const RebindToUnsigned<decltype(di)> du;
1170  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1171  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1172  return (shifted ^ shifted_sign) - shifted_sign;
1173 }
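// Informative example: for kBits = 2, an i8 lane 0x80 (-128) is first shifted
// logically to 0x20; with shifted_sign = 0x80 >> 2 = 0x20, the identity
// (0x20 ^ 0x20) - 0x20 = -0x20 = -32 restores the arithmetic-shift result.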
1174 
1175 // i64 is implemented after BroadcastSignBit.
1176 
1177 // ================================================== SWIZZLE (1)
1178 
1179 // ------------------------------ TableLookupBytes
1180 template <typename T, size_t N, typename TI, size_t NI>
1181 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
1182  const Vec128<TI, NI> from) {
1183  return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
1184 }
1185 
1186 // ------------------------------ TableLookupBytesOr0
1187 // For all vector widths; x86 anyway zeroes if >= 0x80.
1188 template <class V, class VI>
1189 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
1190  return TableLookupBytes(bytes, from);
1191 }
1192 
1193 // ------------------------------ Shuffles (ShiftRight, TableLookupBytes)
1194 
1195 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
1196 // Shuffle0321 rotates one lane to the right (the previous least-significant
1197 // lane is now most-significant). These could also be implemented via
1198 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
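// Informative example: Shuffle2301 applied to lanes {3,2,1,0} yields
// {2,3,0,1}; its control byte 0xB1 equals _MM_SHUFFLE(2, 3, 0, 1).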
1199 
1200 // Swap 32-bit halves in 64-bit halves.
1201 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1202 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
1203  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1204  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1205 }
1206 template <size_t N>
1207 HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
1208  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1209  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
1210 }
1211 
1212 // These are used by generic_ops-inl to implement LoadInterleaved3. As with
1213 // Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
1214 // comes from the first argument.
1215 namespace detail {
1216 
1217 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1218 HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1219  const Twice<DFromV<decltype(a)>> d2;
1220  const auto ba = Combine(d2, b, a);
1221  alignas(16) const T kShuffle[8] = {1, 0, 7, 6};
1222  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1223 }
1224 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1225 HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1226  const Twice<DFromV<decltype(a)>> d2;
1227  const auto ba = Combine(d2, b, a);
1228  alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
1229  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1230 }
1231 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1232 HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1233  const DFromV<decltype(a)> d;
1234  const RebindToFloat<decltype(d)> df;
1235  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
1236  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
1237  BitCast(df, b).raw, m)});
1238 }
1239 
1240 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1241 HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1242  const Twice<DFromV<decltype(a)>> d2;
1243  const auto ba = Combine(d2, b, a);
1244  alignas(16) const T kShuffle[8] = {0, 3, 6, 5};
1245  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1246 }
1247 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1248 HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1249  const Twice<DFromV<decltype(a)>> d2;
1250  const auto ba = Combine(d2, b, a);
1251  alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
1252  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1253 }
1254 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1255 HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1256  const DFromV<decltype(a)> d;
1257  const RebindToFloat<decltype(d)> df;
1258  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
1259  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
1260  BitCast(df, b).raw, m)});
1261 }
1262 
1263 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1264 HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1265  const Twice<DFromV<decltype(a)>> d2;
1266  const auto ba = Combine(d2, b, a);
1267  alignas(16) const T kShuffle[8] = {2, 1, 4, 7};
1268  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1269 }
1270 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1271 HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1272  const Twice<DFromV<decltype(a)>> d2;
1273  const auto ba = Combine(d2, b, a);
1274  alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
1275  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1276 }
1277 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1278 HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1279  const DFromV<decltype(a)> d;
1280  const RebindToFloat<decltype(d)> df;
1281  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
1282  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
1283  BitCast(df, b).raw, m)});
1284 }
1285 
1286 } // namespace detail
1287 
1288 // Swap 64-bit halves
1289 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
1290  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1291 }
1292 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
1293  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1294 }
1295 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
1296  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
1297 }
1298 HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
1299  return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1300 }
1301 HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
1302  return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1303 }
1304 HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
1305  return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
1306 }
1307 
1308 // Rotate right 32 bits
1309 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
1310  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1311 }
1312 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
1313  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1314 }
1315 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
1316  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
1317 }
1318 // Rotate left 32 bits
1319 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
1320  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1321 }
1322 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
1323  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1324 }
1325 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
1326  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
1327 }
1328 
1329 // Reverse
1330 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
1331  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1332 }
1333 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
1334  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1335 }
1336 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
1337  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
1338 }
1339 
1340 // ================================================== COMPARE
1341 
1342 #if HWY_TARGET <= HWY_AVX3
1343 
1344 // Comparisons set a mask bit to 1 if the condition is true, else 0.
1345 
1346 template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
1347 HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo, 0> /*tag*/,
1348  Mask128<TFrom, NFrom> m) {
1349  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1350  return Mask128<TTo, NTo>{m.raw};
1351 }
1352 
1353 namespace detail {
1354 
1355 template <typename T, size_t N>
1357  const Vec128<T, N> bit) {
1358  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
1359 }
1360 template <typename T, size_t N>
1361 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
1362  const Vec128<T, N> bit) {
1363  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
1364 }
1365 template <typename T, size_t N>
1366 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
1367  const Vec128<T, N> bit) {
1368  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
1369 }
1370 template <typename T, size_t N>
1371 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
1372  const Vec128<T, N> bit) {
1373  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
1374 }
1375 
1376 } // namespace detail
1377 
1378 template <typename T, size_t N>
1379 HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
1380  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1381  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1382 }
1383 
1384 // ------------------------------ Equality
1385 
1386 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1387 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1388  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
1389 }
1390 
1391 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1392 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1393  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
1394 }
1395 
1396 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1397 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1398  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
1399 }
1400 
1401 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1402 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1403  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
1404 }
1405 
1406 template <size_t N>
1407 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
1408  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1409 }
1410 
1411 template <size_t N>
1412 HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
1413  Vec128<double, N> b) {
1414  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1415 }
1416 
1417 // ------------------------------ Inequality
1418 
1419 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1420 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1421  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
1422 }
1423 
1424 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1425 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1426  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
1427 }
1428 
1429 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1430 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1431  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
1432 }
1433 
1434 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1435 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1436  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
1437 }
1438 
1439 template <size_t N>
1440 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
1441  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1442 }
1443 
1444 template <size_t N>
1445 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
1446  Vec128<double, N> b) {
1447  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1448 }
1449 
1450 // ------------------------------ Strict inequality
1451 
1452 // Signed/float <
1453 template <size_t N>
1454 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1455  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
1456 }
1457 template <size_t N>
1458 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1459  Vec128<int16_t, N> b) {
1460  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
1461 }
1462 template <size_t N>
1463 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1464  Vec128<int32_t, N> b) {
1465  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
1466 }
1467 template <size_t N>
1468 HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
1469  Vec128<int64_t, N> b) {
1470  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
1471 }
1472 
1473 template <size_t N>
1474 HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
1475  Vec128<uint8_t, N> b) {
1476  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
1477 }
1478 template <size_t N>
1479 HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
1480  Vec128<uint16_t, N> b) {
1481  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
1482 }
1483 template <size_t N>
1484 HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
1485  Vec128<uint32_t, N> b) {
1486  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
1487 }
1488 template <size_t N>
1489 HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
1490  Vec128<uint64_t, N> b) {
1491  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
1492 }
1493 
1494 template <size_t N>
1495 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1496  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1497 }
1498 template <size_t N>
1499 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1500  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1501 }
1502 
1503 // ------------------------------ Weak inequality
1504 
1505 template <size_t N>
1506 HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
1507  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1508 }
1509 template <size_t N>
1510 HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
1511  Vec128<double, N> b) {
1512  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1513 }
1514 
1515 // ------------------------------ Mask
1516 
1517 namespace detail {
1518 
1519 template <typename T, size_t N>
1520 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
1521  const Vec128<T, N> v) {
1522  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
1523 }
1524 template <typename T, size_t N>
1525 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
1526  const Vec128<T, N> v) {
1527  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
1528 }
1529 template <typename T, size_t N>
1530 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
1531  const Vec128<T, N> v) {
1532  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
1533 }
1534 template <typename T, size_t N>
1535 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
1536  const Vec128<T, N> v) {
1537  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
1538 }
1539 
1540 } // namespace detail
1541 
1542 template <typename T, size_t N>
1543 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1544  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1545 }
1546 // There do not seem to be native floating-point versions of these instructions.
1547 template <size_t N>
1548 HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
1549  const RebindToSigned<DFromV<decltype(v)>> di;
1550  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
1551 }
1552 template <size_t N>
1553 HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
1554  const RebindToSigned<DFromV<decltype(v)>> di;
1555  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
1556 }
1557 
1558 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1559 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1560  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
1561 }
1562 
1563 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1564 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1565  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1566 }
1567 
1568 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1569 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1570  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1571 }
1572 
1573 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1574 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1575  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1576 }
1577 
1578 template <size_t N>
1579 HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
1580  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
1581 }
1582 
1583 template <size_t N>
1584 HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
1585  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
1586 }
1587 
1588 template <typename T, size_t N>
1589 HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
1590  const Mask128<T, N> v) {
1591  return VecFromMask(v);
1592 }
1593 
1594 #else // AVX2 or below
1595 
1596 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1597 
1598 template <typename TFrom, typename TTo, size_t N>
1599 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
1600  Mask128<TFrom, N> m) {
1601  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1602  const Simd<TFrom, N, 0> d;
1603  return MaskFromVec(BitCast(Simd<TTo, N, 0>(), VecFromMask(d, m)));
1604 }
1605 
1606 template <typename T, size_t N>
1607 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1608  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1609  return (v & bit) == bit;
1610 }
1611 
1612 // ------------------------------ Equality
1613 
1614 // Unsigned
1615 template <size_t N>
1616 HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
1617  const Vec128<uint8_t, N> b) {
1618  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1619 }
1620 template <size_t N>
1621 HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
1622  const Vec128<uint16_t, N> b) {
1623  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1624 }
1625 template <size_t N>
1626 HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
1627  const Vec128<uint32_t, N> b) {
1628  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1629 }
1630 template <size_t N>
1631 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
1632  const Vec128<uint64_t, N> b) {
1633 #if HWY_TARGET == HWY_SSSE3
1634  const Simd<uint32_t, N * 2, 0> d32;
1635  const Simd<uint64_t, N, 0> d64;
1636  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1637  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1638  return MaskFromVec(BitCast(d64, cmp64));
1639 #else
1640  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
1641 #endif
1642 }
1643 
1644 // Signed
1645 template <size_t N>
1646 HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
1647  const Vec128<int8_t, N> b) {
1648  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1649 }
1650 template <size_t N>
1651 HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
1652  Vec128<int16_t, N> b) {
1653  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1654 }
1655 template <size_t N>
1656 HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
1657  const Vec128<int32_t, N> b) {
1658  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1659 }
1660 template <size_t N>
1661 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
1662  const Vec128<int64_t, N> b) {
1663  // Same as signed ==; avoid duplicating the SSSE3 version.
1664  const DFromV<decltype(a)> d;
1665  RebindToUnsigned<decltype(d)> du;
1666  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
1667 }
1668 
1669 // Float
1670 template <size_t N>
1671 HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
1672  const Vec128<float, N> b) {
1673  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
1674 }
1675 template <size_t N>
1676 HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
1677  const Vec128<double, N> b) {
1678  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
1679 }
1680 
1681 // ------------------------------ Inequality
1682 
1683 // This cannot have T as a template argument, otherwise it is not more
1684 // specialized than rewritten operator== in C++20, leading to compile
1685 // errors: https://gcc.godbolt.org/z/xsrPhPvPT.
1686 template <size_t N>
1687 HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
1688  Vec128<uint8_t, N> b) {
1689  return Not(a == b);
1690 }
1691 template <size_t N>
1692 HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
1693  Vec128<uint16_t, N> b) {
1694  return Not(a == b);
1695 }
1696 template <size_t N>
1697 HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
1698  Vec128<uint32_t, N> b) {
1699  return Not(a == b);
1700 }
1701 template <size_t N>
1702 HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
1703  Vec128<uint64_t, N> b) {
1704  return Not(a == b);
1705 }
1706 template <size_t N>
1707 HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
1708  Vec128<int8_t, N> b) {
1709  return Not(a == b);
1710 }
1711 template <size_t N>
1712 HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
1713  Vec128<int16_t, N> b) {
1714  return Not(a == b);
1715 }
1716 template <size_t N>
1717 HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
1718  Vec128<int32_t, N> b) {
1719  return Not(a == b);
1720 }
1721 template <size_t N>
1722 HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
1723  Vec128<int64_t, N> b) {
1724  return Not(a == b);
1725 }
1726 
1727 template <size_t N>
1728 HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
1729  const Vec128<float, N> b) {
1730  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
1731 }
1732 template <size_t N>
1733 HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
1734  const Vec128<double, N> b) {
1735  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
1736 }
1737 
1738 // ------------------------------ Strict inequality
1739 
1740 // Signed/float <
1741 template <size_t N>
1742 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1743  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
1744 }
1745 template <size_t N>
1746 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1747  Vec128<int16_t, N> b) {
1748  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
1749 }
1750 template <size_t N>
1751 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1752  Vec128<int32_t, N> b) {
1753  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
1754 }
1755 
1756 template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
1757 HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
1758  const DFromV<decltype(a)> du;
1759  const RebindToSigned<decltype(du)> di;
1760  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1761  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1762 }
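// Informative example: XOR-ing with the MSB maps unsigned order onto signed
// order. For uint8_t, 200 ^ 0x80 = 72 and 100 ^ 0x80 = -28 (as int8_t);
// 72 > -28 matches 200 > 100.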
1763 
1764 template <size_t N>
1765 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1766  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1767 }
1768 template <size_t N>
1769 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1770  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
1771 }
1772 
1773 template <size_t N>
1774 HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
1775  const Vec128<int64_t, N> b) {
1776 #if HWY_TARGET == HWY_SSSE3
1777  // See https://stackoverflow.com/questions/65166174/:
1778  const Simd<int64_t, N, 0> d;
1779  const RepartitionToNarrow<decltype(d)> d32;
1780  const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
1781  const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
1782  // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
1783  // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
1784  const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
1785  // Duplicate upper to lower half.
1786  return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
1787 #else
1788  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
1789 #endif
1790 }
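For reference, a scalar sketch of what the SSSE3 branch above assembles (the helper name is illustrative, not part of Highway): a signed 64-bit greater-than built from 32-bit pieces, where the equal-upper-half case falls back to an unsigned comparison of the lower halves; the vector code obtains that lower-half result from the sign of the 64-bit difference b - a and then broadcasts it to both 32-bit halves.

static inline bool SignedGreater64(int64_t a, int64_t b) {
  const int32_t a_hi = static_cast<int32_t>(static_cast<uint64_t>(a) >> 32);
  const int32_t b_hi = static_cast<int32_t>(static_cast<uint64_t>(b) >> 32);
  if (a_hi != b_hi) return a_hi > b_hi;  // upper halves decide (signed)
  // Equal upper halves: the lower halves compare as unsigned values.
  return static_cast<uint32_t>(a) > static_cast<uint32_t>(b);
}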
1791 
1792 // ------------------------------ Weak inequality
1793 
1794 template <size_t N>
1795 HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1796  const Vec128<float, N> b) {
1797  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1798 }
1799 template <size_t N>
1800 HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
1801  const Vec128<double, N> b) {
1802  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1803 }
1804 
1805 #endif // HWY_TARGET <= HWY_AVX3
1806 
1807 // ------------------------------ Reversed comparisons
1808 
1809 template <typename T, size_t N>
1810 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
1811  return b > a;
1812 }
1813 
1814 template <typename T, size_t N>
1815 HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
1816  return b >= a;
1817 }
1818 
1819 // ------------------------------ FirstN (Iota, Lt)
1820 
1821 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1822 HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
1823 #if HWY_TARGET <= HWY_AVX3
1824  (void)d;
1825  const uint64_t all = (1ull << N) - 1;
1826  // BZHI only looks at the lower 8 bits of num!
1827  const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
1828  return Mask128<T, N>::FromBits(bits);
1829 #else
1830  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1831  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1832 #endif
1833 }
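A scalar sketch of the AVX3 branch of FirstN (helper name illustrative): the mask is the low num bits set, clamped to all N lanes; the explicit num > 255 check above exists because BZHI only honors the low 8 bits of its count.

static inline uint64_t FirstNBits(size_t n_lanes, size_t num) {  // n_lanes < 64
  const uint64_t all = (1ull << n_lanes) - 1;  // all lanes active
  return (num >= n_lanes) ? all : ((1ull << num) - 1);
}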
1834 
1835 template <class D>
1836 using MFromD = decltype(FirstN(D(), 0));
1837 
1838 // ================================================== MEMORY (1)
1839 
1840 // Clang static analysis claims the memory immediately after a partial vector
1841 // store is uninitialized, and also flags the input to partial loads (at least
1842 // for loadl_pd) as "garbage". This is a false alarm because msan does not
1843 // raise errors. We work around this by using CopyBytes instead of intrinsics,
1844 // but only for the analyzer to avoid potentially bad code generation.
1845 // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
1846 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1847 #if defined(__clang_analyzer__) || \
1848  (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1849 #define HWY_SAFE_PARTIAL_LOAD_STORE 1
1850 #else
1851 #define HWY_SAFE_PARTIAL_LOAD_STORE 0
1852 #endif
1853 #endif // HWY_SAFE_PARTIAL_LOAD_STORE
1854 
1855 // ------------------------------ Load
1856 
1857 template <typename T>
1858 HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1859  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
1860 }
1861 HWY_API Vec128<float> Load(Full128<float> /* tag */,
1862  const float* HWY_RESTRICT aligned) {
1863  return Vec128<float>{_mm_load_ps(aligned)};
1864 }
1865 HWY_API Vec128<double> Load(Full128<double> /* tag */,
1866  const double* HWY_RESTRICT aligned) {
1867  return Vec128<double>{_mm_load_pd(aligned)};
1868 }
1869 
1870 template <typename T>
1871 HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
1872  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
1873 }
1874 HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
1875  const float* HWY_RESTRICT p) {
1876  return Vec128<float>{_mm_loadu_ps(p)};
1877 }
1878 HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
1879  const double* HWY_RESTRICT p) {
1880  return Vec128<double>{_mm_loadu_pd(p)};
1881 }
1882 
1883 template <typename T>
1884 HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
1885 #if HWY_SAFE_PARTIAL_LOAD_STORE
1886  __m128i v = _mm_setzero_si128();
1887  CopyBytes<8>(p, &v);
1888  return Vec64<T>{v};
1889 #else
1890  return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
1891 #endif
1892 }
1893 
1894 HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
1895  const float* HWY_RESTRICT p) {
1896 #if HWY_SAFE_PARTIAL_LOAD_STORE
1897  __m128 v = _mm_setzero_ps();
1898  CopyBytes<8>(p, &v);
1899  return Vec128<float, 2>{v};
1900 #else
1901  const __m128 hi = _mm_setzero_ps();
1902  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
1903 #endif
1904 }
1905 
1906 HWY_API Vec64<double> Load(Full64<double> /* tag */,
1907  const double* HWY_RESTRICT p) {
1908 #if HWY_SAFE_PARTIAL_LOAD_STORE
1909  __m128d v = _mm_setzero_pd();
1910  CopyBytes<8>(p, &v);
1911  return Vec64<double>{v};
1912 #else
1913  return Vec64<double>{_mm_load_sd(p)};
1914 #endif
1915 }
1916 
1917 HWY_API Vec128<float, 1> Load(Simd<float, 1, 0> /* tag */,
1918  const float* HWY_RESTRICT p) {
1919 #if HWY_SAFE_PARTIAL_LOAD_STORE
1920  __m128 v = _mm_setzero_ps();
1921  CopyBytes<4>(p, &v);
1922  return Vec128<float, 1>{v};
1923 #else
1924  return Vec128<float, 1>{_mm_load_ss(p)};
1925 #endif
1926 }
1927 
1928 // Any <= 32 bit except <float, 1>
1929 template <typename T, size_t N, HWY_IF_LE32(T, N)>
1930 HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
1931  constexpr size_t kSize = sizeof(T) * N;
1932 #if HWY_SAFE_PARTIAL_LOAD_STORE
1933  __m128 v = _mm_setzero_ps();
1934  CopyBytes<kSize>(p, &v);
1935  return Vec128<T, N>{v};
1936 #else
1937  int32_t bits = 0;
1938  CopyBytes<kSize>(p, &bits);
1939  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
1940 #endif
1941 }
1942 
1943 // For < 128 bit, LoadU == Load.
1944 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1945 HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1946  return Load(d, p);
1947 }
1948 
1949 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1950 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1951 HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1952  return LoadU(d, p);
1953 }
1954 
1955 // Returns a vector with lane i=[0, N) set to "first" + i.
1956 template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
1957 HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
1958  HWY_ALIGN T lanes[16 / sizeof(T)];
1959  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
1960  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
1961  }
1962  return Load(d, lanes);
1963 }
1964 
1965 // ------------------------------ MaskedLoad
1966 
1967 #if HWY_TARGET <= HWY_AVX3
1968 
1969 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1970 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1971  const T* HWY_RESTRICT p) {
1972  return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, p)};
1973 }
1974 
1975 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1976 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1977  const T* HWY_RESTRICT p) {
1978  return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
1979 }
1980 
1981 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1982 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1983  const T* HWY_RESTRICT p) {
1984  return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
1985 }
1986 
1987 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1988 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1989  const T* HWY_RESTRICT p) {
1990  return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
1991 }
1992 
1993 template <size_t N>
1994 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m,
1995  Simd<float, N, 0> /* tag */,
1996  const float* HWY_RESTRICT p) {
1997  return Vec128<float, N>{_mm_maskz_loadu_ps(m.raw, p)};
1998 }
1999 
2000 template <size_t N>
2001 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m,
2002  Simd<double, N, 0> /* tag */,
2003  const double* HWY_RESTRICT p) {
2004  return Vec128<double, N>{_mm_maskz_loadu_pd(m.raw, p)};
2005 }
2006 
2007 #elif HWY_TARGET == HWY_AVX2
2008 
2009 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2010 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2011  const T* HWY_RESTRICT p) {
2012  auto p_p = reinterpret_cast<const int*>(p); // NOLINT
2013  return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
2014 }
2015 
2016 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2017 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2018  const T* HWY_RESTRICT p) {
2019  auto p_p = reinterpret_cast<const long long*>(p); // NOLINT
2020  return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
2021 }
2022 
2023 template <size_t N>
2024 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N, 0> d,
2025  const float* HWY_RESTRICT p) {
2026  const Vec128<int32_t, N> mi =
2027  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2028  return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
2029 }
2030 
2031 template <size_t N>
2032 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, Simd<double, N, 0> d,
2033  const double* HWY_RESTRICT p) {
2034  const Vec128<int64_t, N> mi =
2035  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2036  return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
2037 }
2038 
2039 // There is no maskload_epi8/16, so blend instead.
2040 template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2041 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2042  const T* HWY_RESTRICT p) {
2043  return IfThenElseZero(m, Load(d, p));
2044 }
2045 
2046 #else // <= SSE4
2047 
2048 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
2049 template <typename T, size_t N>
2050 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2051  const T* HWY_RESTRICT p) {
2052  return IfThenElseZero(m, Load(d, p));
2053 }
2054 
2055 #endif
2056 
2057 // ------------------------------ Store
2058 
2059 template <typename T>
2060 HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
2061  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
2062 }
2063 HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */,
2064  float* HWY_RESTRICT aligned) {
2065  _mm_store_ps(aligned, v.raw);
2066 }
2067 HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */,
2068  double* HWY_RESTRICT aligned) {
2069  _mm_store_pd(aligned, v.raw);
2070 }
2071 
2072 template <typename T>
2073 HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) {
2074  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
2075 }
2076 HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
2077  float* HWY_RESTRICT p) {
2078  _mm_storeu_ps(p, v.raw);
2079 }
2080 HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2081  double* HWY_RESTRICT p) {
2082  _mm_storeu_pd(p, v.raw);
2083 }
2084 
2085 template <typename T>
2086 HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
2087 #if HWY_SAFE_PARTIAL_LOAD_STORE
2088  CopyBytes<8>(&v, p);
2089 #else
2090  _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
2091 #endif
2092 }
2093 HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
2094  float* HWY_RESTRICT p) {
2095 #if HWY_SAFE_PARTIAL_LOAD_STORE
2096  CopyBytes<8>(&v, p);
2097 #else
2098  _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
2099 #endif
2100 }
2101 HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
2102  double* HWY_RESTRICT p) {
2103 #if HWY_SAFE_PARTIAL_LOAD_STORE
2104  CopyBytes<8>(&v, p);
2105 #else
2106  _mm_storel_pd(p, v.raw);
2107 #endif
2108 }
2109 
2110 // Any <= 32 bit except <float, 1>
2111 template <typename T, size_t N, HWY_IF_LE32(T, N)>
2112 HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2113  CopyBytes<sizeof(T) * N>(&v, p);
2114 }
2115 HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
2116  float* HWY_RESTRICT p) {
2117 #if HWY_SAFE_PARTIAL_LOAD_STORE
2118  CopyBytes<4>(&v, p);
2119 #else
2120  _mm_store_ss(p, v.raw);
2121 #endif
2122 }
2123 
2124 // For < 128 bit, StoreU == Store.
2125 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2126 HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
2127  Store(v, d, p);
2128 }
2129 
2130 // ------------------------------ BlendedStore
2131 
2132 namespace detail {
2133 
2134 // There is no maskload_epi8/16 with which we could safely implement
2135 // BlendedStore. Manual blending is also unsafe because loading a full vector
2136 // that crosses the array end causes asan faults. Resort to scalar code; the
2137 // caller should instead use memcpy, assuming m is FirstN(d, n).
2138 template <typename T, size_t N>
2139 HWY_API void ScalarMaskedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2140  T* HWY_RESTRICT p) {
2141  const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t.
2142  using TI = TFromD<decltype(di)>;
2143  alignas(16) TI buf[N];
2144  alignas(16) TI mask[N];
2145  Store(BitCast(di, v), di, buf);
2146  Store(BitCast(di, VecFromMask(d, m)), di, mask);
2147  for (size_t i = 0; i < N; ++i) {
2148  if (mask[i]) {
2149  CopyBytes<sizeof(T)>(buf + i, p + i);
2150  }
2151  }
2152 }
2153 } // namespace detail
2154 
2155 #if HWY_TARGET <= HWY_AVX3
2156 
2157 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2158 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2159  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2160  _mm_mask_storeu_epi8(p, m.raw, v.raw);
2161 }
2162 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2163 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2164  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2165  _mm_mask_storeu_epi16(p, m.raw, v.raw);
2166 }
2167 
2168 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2169 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2170  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2171  auto pi = reinterpret_cast<int*>(p); // NOLINT
2172  _mm_mask_storeu_epi32(pi, m.raw, v.raw);
2173 }
2174 
2175 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2176 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2177  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2178  auto pi = reinterpret_cast<long long*>(p); // NOLINT
2179  _mm_mask_storeu_epi64(pi, m.raw, v.raw);
2180 }
2181 
2182 template <size_t N>
2183 HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
2184  Simd<float, N, 0>, float* HWY_RESTRICT p) {
2185  _mm_mask_storeu_ps(p, m.raw, v.raw);
2186 }
2187 
2188 template <size_t N>
2189 HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
2190  Simd<double, N, 0>, double* HWY_RESTRICT p) {
2191  _mm_mask_storeu_pd(p, m.raw, v.raw);
2192 }
2193 
2194 #elif HWY_TARGET == HWY_AVX2
2195 
2196 template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2197 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2198  T* HWY_RESTRICT p) {
2199  detail::ScalarMaskedStore(v, m, d, p);
2200 }
2201 
2202 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2203 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2204  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2205  // For partial vectors, avoid writing other lanes by zeroing their mask.
2206  if (N < 4) {
2207  const Full128<T> df;
2208  const Mask128<T> mf{m.raw};
2209  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2210  }
2211 
2212  auto pi = reinterpret_cast<int*>(p); // NOLINT
2213  _mm_maskstore_epi32(pi, m.raw, v.raw);
2214 }
2215 
2216 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2217 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2218  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2219  // For partial vectors, avoid writing other lanes by zeroing their mask.
2220  if (N < 2) {
2221  const Full128<T> df;
2222  const Mask128<T> mf{m.raw};
2223  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2224  }
2225 
2226  auto pi = reinterpret_cast<long long*>(p); // NOLINT
2227  _mm_maskstore_epi64(pi, m.raw, v.raw);
2228 }
2229 
2230 template <size_t N>
2231 HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
2232  Simd<float, N, 0> d, float* HWY_RESTRICT p) {
2233  using T = float;
2234  // For partial vectors, avoid writing other lanes by zeroing their mask.
2235  if (N < 4) {
2236  const Full128<T> df;
2237  const Mask128<T> mf{m.raw};
2238  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2239  }
2240 
2241  const Vec128<MakeSigned<T>, N> mi =
2242  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2243  _mm_maskstore_ps(p, mi.raw, v.raw);
2244 }
2245 
2246 template <size_t N>
2247 HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
2248  Simd<double, N, 0> d, double* HWY_RESTRICT p) {
2249  using T = double;
2250  // For partial vectors, avoid writing other lanes by zeroing their mask.
2251  if (N < 2) {
2252  const Full128<T> df;
2253  const Mask128<T> mf{m.raw};
2254  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2255  }
2256 
2257  const Vec128<MakeSigned<T>, N> mi =
2258  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2259  _mm_maskstore_pd(p, mi.raw, v.raw);
2260 }
2261 
2262 #else // <= SSE4
2263 
2264 template <typename T, size_t N>
2265 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2266  T* HWY_RESTRICT p) {
2267  // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
2268  detail::ScalarMaskedStore(v, m, d, p);
2269 }
2270 
2271 #endif // SSE4
2272 
2273 // ================================================== ARITHMETIC
2274 
2275 // ------------------------------ Addition
2276 
2277 // Unsigned
2278 template <size_t N>
2279 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
2280  const Vec128<uint8_t, N> b) {
2281  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2282 }
2283 template <size_t N>
2284 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
2285  const Vec128<uint16_t, N> b) {
2286  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2287 }
2288 template <size_t N>
2289 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
2290  const Vec128<uint32_t, N> b) {
2291  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2292 }
2293 template <size_t N>
2294 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
2295  const Vec128<uint64_t, N> b) {
2296  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2297 }
2298 
2299 // Signed
2300 template <size_t N>
2301 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
2302  const Vec128<int8_t, N> b) {
2303  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2304 }
2305 template <size_t N>
2306 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
2307  const Vec128<int16_t, N> b) {
2308  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2309 }
2310 template <size_t N>
2311 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
2312  const Vec128<int32_t, N> b) {
2313  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2314 }
2315 template <size_t N>
2316 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
2317  const Vec128<int64_t, N> b) {
2318  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2319 }
2320 
2321 // Float
2322 template <size_t N>
2323 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
2324  const Vec128<float, N> b) {
2325  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
2326 }
2327 template <size_t N>
2328 HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
2329  const Vec128<double, N> b) {
2330  return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
2331 }
2332 
2333 // ------------------------------ Subtraction
2334 
2335 // Unsigned
2336 template <size_t N>
2337 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
2338  const Vec128<uint8_t, N> b) {
2339  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2340 }
2341 template <size_t N>
2342 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
2343  Vec128<uint16_t, N> b) {
2344  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2345 }
2346 template <size_t N>
2347 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
2348  const Vec128<uint32_t, N> b) {
2349  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2350 }
2351 template <size_t N>
2352 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
2353  const Vec128<uint64_t, N> b) {
2354  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2355 }
2356 
2357 // Signed
2358 template <size_t N>
2359 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
2360  const Vec128<int8_t, N> b) {
2361  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2362 }
2363 template <size_t N>
2364 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
2365  const Vec128<int16_t, N> b) {
2366  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2367 }
2368 template <size_t N>
2369 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
2370  const Vec128<int32_t, N> b) {
2371  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2372 }
2373 template <size_t N>
2374 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
2375  const Vec128<int64_t, N> b) {
2376  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2377 }
2378 
2379 // Float
2380 template <size_t N>
2381 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
2382  const Vec128<float, N> b) {
2383  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
2384 }
2385 template <size_t N>
2386 HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
2387  const Vec128<double, N> b) {
2388  return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
2389 }
2390 
2391 // ------------------------------ SumsOf8
2392 template <size_t N>
2393 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
2394  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
2395 }
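A scalar sketch of the SumsOf8 contract (helper name illustrative): each 64-bit result lane holds the sum of its eight source bytes, which is what _mm_sad_epu8 against a zero vector produces.

static inline uint64_t SumOf8Bytes(const uint8_t bytes[8]) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; ++i) sum += bytes[i];  // one u64 lane of SumsOf8
  return sum;
}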
2396 
2397 // ------------------------------ SaturatedAdd
2398 
2399 // Returns a + b clamped to the destination range.
2400 
2401 // Unsigned
2402 template <size_t N>
2403 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
2404  const Vec128<uint8_t, N> b) {
2405  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
2406 }
2407 template <size_t N>
2408 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
2409  const Vec128<uint16_t, N> b) {
2410  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
2411 }
2412 
2413 // Signed
2414 template <size_t N>
2415 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
2416  const Vec128<int8_t, N> b) {
2417  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
2418 }
2419 template <size_t N>
2420 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
2421  const Vec128<int16_t, N> b) {
2422  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
2423 }
2424 
2425 // ------------------------------ SaturatedSub
2426 
2427 // Returns a - b clamped to the destination range.
2428 
2429 // Unsigned
2430 template <size_t N>
2431 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
2432  const Vec128<uint8_t, N> b) {
2433  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
2434 }
2435 template <size_t N>
2436 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
2437  const Vec128<uint16_t, N> b) {
2438  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
2439 }
2440 
2441 // Signed
2442 template <size_t N>
2443 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
2444  const Vec128<int8_t, N> b) {
2445  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
2446 }
2447 template <size_t N>
2448 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
2449  const Vec128<int16_t, N> b) {
2450  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
2451 }
2452 
2453 // ------------------------------ AverageRound
2454 
2455 // Returns (a + b + 1) / 2
2456 
2457 // Unsigned
2458 template <size_t N>
2459 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
2460  const Vec128<uint8_t, N> b) {
2461  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
2462 }
2463 template <size_t N>
2464 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
2465  const Vec128<uint16_t, N> b) {
2466  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
2467 }
2468 
2469 // ------------------------------ Integer multiplication
2470 
2471 template <size_t N>
2472 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
2473  const Vec128<uint16_t, N> b) {
2474  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2475 }
2476 template <size_t N>
2477 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
2478  const Vec128<int16_t, N> b) {
2479  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2480 }
2481 
2482 // Returns the upper 16 bits of a * b in each lane.
2483 template <size_t N>
2484 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
2485  const Vec128<uint16_t, N> b) {
2486  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
2487 }
2488 template <size_t N>
2489 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
2490  const Vec128<int16_t, N> b) {
2491  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2492 }
2493 
2494 template <size_t N>
2495 HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
2496  const Vec128<int16_t, N> b) {
2497  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
2498 }
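A scalar sketch of the _mm_mulhrs_epi16 semantics behind MulFixedPoint15 (helper name illustrative): multiply two Q15 fixed-point values and round the 30-bit product back to Q15 by keeping bits 16..1 of the rounded result.

static inline int16_t MulFixedPoint15Scalar(int16_t a, int16_t b) {
  const int32_t product = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  return static_cast<int16_t>(((product >> 14) + 1) >> 1);  // round to nearest
}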
2499 
2500 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
2501 // even and the upper half into its odd neighbor lane.
2502 template <size_t N>
2503 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
2504  const Vec128<uint32_t, N> b) {
2505  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
2506 }
2507 
2508 #if HWY_TARGET == HWY_SSSE3
2509 
2510 template <size_t N, HWY_IF_LE64(int32_t, N)> // N=1 or 2
2511 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2512  const Vec128<int32_t, N> b) {
2513  return Set(Simd<int64_t, (N + 1) / 2, 0>(),
2514  static_cast<int64_t>(GetLane(a)) * GetLane(b));
2515 }
2516 HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
2517  const Vec128<int32_t> b) {
2518  alignas(16) int32_t a_lanes[4];
2519  alignas(16) int32_t b_lanes[4];
2520  const Full128<int32_t> di32;
2521  Store(a, di32, a_lanes);
2522  Store(b, di32, b_lanes);
2523  alignas(16) int64_t mul[2];
2524  mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0];
2525  mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2];
2526  return Load(Full128<int64_t>(), mul);
2527 }
2528 
2529 #else // HWY_TARGET == HWY_SSSE3
2530 
2531 template <size_t N>
2532 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2533  const Vec128<int32_t, N> b) {
2534  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
2535 }
2536 
2537 #endif // HWY_TARGET == HWY_SSSE3
2538 
2539 template <size_t N>
2540 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
2541  const Vec128<uint32_t, N> b) {
2542 #if HWY_TARGET == HWY_SSSE3
2543  // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
2544  // 64-bit right shift would also work but also needs port 5, so no benefit.
2545  // Notation: x=don't care, z=0.
2546  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2547  const auto mullo_x2x0 = MulEven(a, b);
2548  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2549  const auto mullo_x3x1 =
2550  MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2551  // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
2552  // the latter requires one more instruction or a constant.
2553  const __m128i mul_20 =
2554  _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2555  const __m128i mul_31 =
2556  _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2557  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2558 #else
2559  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
2560 #endif
2561 }
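A scalar sketch of what the SSSE3 branch above assembles (names illustrative): each lane's 32-bit product is the low half of the full 64-bit product, and the even-lane widening multiplies supply those products for lanes {0, 2} and, after shuffling the odd lanes into even positions, for lanes {1, 3}.

static inline void MulLo32x4(const uint32_t a[4], const uint32_t b[4],
                             uint32_t out[4]) {
  for (int i = 0; i < 4; ++i) {
    // Full 64-bit product, truncated to the low 32 bits per lane.
    out[i] = static_cast<uint32_t>(static_cast<uint64_t>(a[i]) * b[i]);
  }
}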
2562 
2563 template <size_t N>
2564 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
2565  const Vec128<int32_t, N> b) {
2566  // Same as unsigned; avoid duplicating the SSSE3 code.
2567  const DFromV<decltype(a)> d;
2568  const RebindToUnsigned<decltype(d)> du;
2569  return BitCast(d, BitCast(du, a) * BitCast(du, b));
2570 }
2571 
2572 // ------------------------------ RotateRight (ShiftRight, Or)
2573 
2574 template <int kBits, size_t N>
2575 HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
2576  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
2577 #if HWY_TARGET <= HWY_AVX3
2578  return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
2579 #else
2580  if (kBits == 0) return v;
2581  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
2582 #endif
2583 }
2584 
2585 template <int kBits, size_t N>
2586 HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
2587  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
2588 #if HWY_TARGET <= HWY_AVX3
2589  return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
2590 #else
2591  if (kBits == 0) return v;
2592  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
2593 #endif
2594 }
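A scalar sketch of the pre-AVX3 fallback (helper name illustrative): a rotate is the OR of a right shift and a complementary left shift; the kBits == 0 early return above exists because shifting by the full lane width would be undefined.

static inline uint32_t RotateRight32(uint32_t x, int bits) {  // 0 < bits < 32
  return (x >> bits) | (x << (32 - bits));
}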
2595 
2596 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2597 
2598 template <size_t N>
2599 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
2600  const DFromV<decltype(v)> d;
2601  return VecFromMask(v < Zero(d));
2602 }
2603 
2604 template <size_t N>
2605 HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
2606  return ShiftRight<15>(v);
2607 }
2608 
2609 template <size_t N>
2610 HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
2611  return ShiftRight<31>(v);
2612 }
2613 
2614 template <size_t N>
2615 HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
2616  const DFromV<decltype(v)> d;
2617 #if HWY_TARGET <= HWY_AVX3
2618  (void)d;
2619  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
2620 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2621  return VecFromMask(v < Zero(d));
2622 #else
2623  // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
2624  // avoids generating a zero.
2625  const RepartitionToNarrow<decltype(d)> d32;
2626  const auto sign = ShiftRight<31>(BitCast(d32, v));
2627  return Vec128<int64_t, N>{
2628  _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2629 #endif
2630 }
2631 
2632 template <size_t N>
2633 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
2634 #if HWY_TARGET <= HWY_AVX3
2635  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
2636 #else
2637  const auto zero = Zero(DFromV<decltype(v)>());
2638  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2639 #endif
2640 }
2641 
2642 template <int kBits, size_t N>
2643 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
2644 #if HWY_TARGET <= HWY_AVX3
2645  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
2646 #else
2647  const DFromV<decltype(v)> di;
2648  const RebindToUnsigned<decltype(di)> du;
2649  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2650  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
2651  return right | sign;
2652 #endif
2653 }
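A scalar sketch of the pre-AVX3 path (helper name illustrative): an arithmetic right shift is a logical shift with the broadcast sign bits OR-ed back into the vacated upper positions.

static inline int64_t ShiftRightArith64(int64_t v, int bits) {  // 0 < bits < 64
  const uint64_t logical = static_cast<uint64_t>(v) >> bits;
  const uint64_t sign = (v < 0) ? ~uint64_t{0} : uint64_t{0};  // BroadcastSignBit
  return static_cast<int64_t>(logical | (sign << (64 - bits)));
}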
2654 
2655 // ------------------------------ ZeroIfNegative (BroadcastSignBit)
2656 template <typename T, size_t N, HWY_IF_FLOAT(T)>
2657 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2658  const DFromV<decltype(v)> d;
2659 #if HWY_TARGET == HWY_SSSE3
2660  const RebindToSigned<decltype(d)> di;
2661  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2662 #else
2663  const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS
2664 #endif
2665  return IfThenElse(mask, Zero(d), v);
2666 }
2667 
2668 // ------------------------------ IfNegativeThenElse
2669 template <size_t N>
2670 HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
2671  const Vec128<int8_t, N> yes,
2672  const Vec128<int8_t, N> no) {
2673  // int8: IfThenElse only looks at the MSB.
2674  return IfThenElse(MaskFromVec(v), yes, no);
2675 }
2676 
2677 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2678 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2679  Vec128<T, N> no) {
2680  static_assert(IsSigned<T>(), "Only works for signed/float");
2681  const DFromV<decltype(v)> d;
2682  const RebindToSigned<decltype(d)> di;
2683 
2684  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
2685  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
2686  return IfThenElse(MaskFromVec(v), yes, no);
2687 }
2688 
2689 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2690 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2691  Vec128<T, N> no) {
2692  static_assert(IsSigned<T>(), "Only works for signed/float");
2693  const DFromV<decltype(v)> d;
2694  const RebindToFloat<decltype(d)> df;
2695 
2696  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
2697  return BitCast(d, IfThenElse(MaskFromVec(BitCast(df, v)), BitCast(df, yes),
2698  BitCast(df, no)));
2699 }
2700 
2701 // ------------------------------ ShiftLeftSame
2702 
2703 template <size_t N>
2704 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
2705  const int bits) {
2706  return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2707 }
2708 template <size_t N>
2709 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
2710  const int bits) {
2711  return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2712 }
2713 template <size_t N>
2714 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
2715  const int bits) {
2716  return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2717 }
2718 
2719 template <size_t N>
2720 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
2721  const int bits) {
2722  return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2723 }
2724 
2725 template <size_t N>
2726 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
2727  const int bits) {
2728  return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2729 }
2730 
2731 template <size_t N>
2732 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
2733  const int bits) {
2734  return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2735 }
2736 
2737 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2738 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
2739  const DFromV<decltype(v)> d8;
2740  // Use raw instead of BitCast to support N=1.
2741  const Vec128<T, N> shifted{
2742  ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
2743  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
2744 }
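A scalar sketch of the 8-bit path (helper name illustrative): the underlying shift operates on 16-bit lanes, so bits leaving one byte would land in its neighbor; masking each byte with (0xFF << bits) & 0xFF removes those stray bits. Packing two bytes into a uint16_t makes the leak visible:

static inline uint16_t ShiftLeftTwoBytes(uint16_t packed, int bits) {  // 0 <= bits < 8
  const uint16_t wide = static_cast<uint16_t>(packed << bits);  // crosses the byte boundary
  const uint8_t byte_mask = static_cast<uint8_t>((0xFF << bits) & 0xFF);
  const uint16_t mask = static_cast<uint16_t>(byte_mask * 0x0101);  // both bytes
  return static_cast<uint16_t>(wide & mask);  // now a per-byte left shift
}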
2745 
2746 // ------------------------------ ShiftRightSame (BroadcastSignBit)
2747 
2748 template <size_t N>
2749 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
2750  const int bits) {
2751  return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2752 }
2753 template <size_t N>
2754 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
2755  const int bits) {
2756  return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2757 }
2758 template <size_t N>
2759 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
2760  const int bits) {
2761  return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2762 }
2763 
2764 template <size_t N>
2765 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
2766  const int bits) {
2767  const DFromV<decltype(v)> d8;
2768  // Use raw instead of BitCast to support N=1.
2769  const Vec128<uint8_t, N> shifted{
2770  ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
2771  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
2772 }
2773 
2774 template <size_t N>
2775 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
2776  const int bits) {
2777  return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2778 }
2779 
2780 template <size_t N>
2781 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
2782  const int bits) {
2783  return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2784 }
2785 template <size_t N>
2786 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
2787  const int bits) {
2788 #if HWY_TARGET <= HWY_AVX3
2789  return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2790 #else
2791  const DFromV<decltype(v)> di;
2792  const RebindToUnsigned<decltype(di)> du;
2793  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2794  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
2795  return right | sign;
2796 #endif
2797 }
2798 
2799 template <size_t N>
2800 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
2801  const DFromV<decltype(v)> di;
2802  const RebindToUnsigned<decltype(di)> du;
2803  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2804  const auto shifted_sign =
2805  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
2806  return (shifted ^ shifted_sign) - shifted_sign;
2807 }
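A scalar sketch of the trick above (helper name illustrative): after a logical right shift, the original sign bit sits at position 7 - bits; XOR-ing with 0x80 >> bits and then subtracting it propagates that bit through the upper positions, turning the logical shift into an arithmetic one.

static inline int8_t ShiftRightArith8(int8_t v, int bits) {  // 0 <= bits < 8
  const int shifted = static_cast<uint8_t>(v) >> bits;  // logical shift
  const int sign_pos = 0x80 >> bits;                    // sign bit's new position
  return static_cast<int8_t>((shifted ^ sign_pos) - sign_pos);
}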
2808 
2809 // ------------------------------ Floating-point mul / div
2810 
2811 template <size_t N>
2812 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
2813  return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2814 }
2815 HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
2816  const Vec128<float, 1> b) {
2817  return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
2818 }
2819 template <size_t N>
2820 HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
2821  const Vec128<double, N> b) {
2822  return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
2823 }
2824 HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
2825  return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
2826 }
2827 
2828 template <size_t N>
2829 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
2830  const Vec128<float, N> b) {
2831  return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2832 }
2833 HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
2834  const Vec128<float, 1> b) {
2835  return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
2836 }
2837 template <size_t N>
2838 HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
2839  const Vec128<double, N> b) {
2840  return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
2841 }
2842 HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) {
2843  return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
2844 }
2845 
2846 // Approximate reciprocal
2847 template <size_t N>
2848 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
2849  return Vec128<float, N>{_mm_rcp_ps(v.raw)};
2850 }
2851 HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
2852  return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
2853 }
2854 
2855 // Absolute value of difference.
2856 template <size_t N>
2857 HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
2858  const Vec128<float, N> b) {
2859  return Abs(a - b);
2860 }
2861 
2862 // ------------------------------ Floating-point multiply-add variants
2863 
2864 // Returns mul * x + add
2865 template <size_t N>
2866 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
2867  const Vec128<float, N> x,
2868  const Vec128<float, N> add) {
2869 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2870  return mul * x + add;
2871 #else
2872  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2873 #endif
2874 }
2875 template <size_t N>
2876 HWY_API Vec128<double, N> MulAdd(const Vec128<double, N> mul,
2877  const Vec128<double, N> x,
2878  const Vec128<double, N> add) {
2879 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2880  return mul * x + add;
2881 #else
2882  return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
2883 #endif
2884 }
2885 
2886 // Returns add - mul * x
2887 template <size_t N>
2888 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
2889  const Vec128<float, N> x,
2890  const Vec128<float, N> add) {
2891 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2892  return add - mul * x;
2893 #else
2894  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2895 #endif
2896 }
2897 template <size_t N>
2898 HWY_API Vec128<double, N> NegMulAdd(const Vec128<double, N> mul,
2899  const Vec128<double, N> x,
2900  const Vec128<double, N> add) {
2901 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2902  return add - mul * x;
2903 #else
2904  return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
2905 #endif
2906 }
2907 
2908 // Returns mul * x - sub
2909 template <size_t N>
2910 HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
2911  const Vec128<float, N> x,
2912  const Vec128<float, N> sub) {
2913 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2914  return mul * x - sub;
2915 #else
2916  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2917 #endif
2918 }
2919 template <size_t N>
2920 HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
2921  const Vec128<double, N> x,
2922  const Vec128<double, N> sub) {
2923 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2924  return mul * x - sub;
2925 #else
2926  return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
2927 #endif
2928 }
2929 
2930 // Returns -mul * x - sub
2931 template <size_t N>
2932 HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
2933  const Vec128<float, N> x,
2934  const Vec128<float, N> sub) {
2935 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2936  return Neg(mul) * x - sub;
2937 #else
2938  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2939 #endif
2940 }
2941 template <size_t N>
2942 HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
2943  const Vec128<double, N> x,
2944  const Vec128<double, N> sub) {
2945 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2946  return Neg(mul) * x - sub;
2947 #else
2948  return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2949 #endif
2950 }
2951 
2952 // ------------------------------ Floating-point square root
2953 
2954 // Full precision square root
2955 template <size_t N>
2956 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
2957  return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
2958 }
2959 HWY_API Vec128<float, 1> Sqrt(const Vec128<float, 1> v) {
2960  return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
2961 }
2962 template <size_t N>
2963 HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
2964  return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
2965 }
2966 HWY_API Vec64<double> Sqrt(const Vec64<double> v) {
2967  return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
2968 }
2969 
2970 // Approximate reciprocal square root
2971 template <size_t N>
2972 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
2973  return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
2974 }
2975 HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(const Vec128<float, 1> v) {
2976  return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
2977 }
2978 
2979 // ------------------------------ Min (Gt, IfThenElse)
2980 
2981 namespace detail {
2982 
2983 template <typename T, size_t N>
2984 HWY_INLINE Vec128<T, N> MinU(const Vec128<T, N> a,
2985  const Vec128<T, N> b) {
2986  const DFromV<decltype(a)> d;
2987  const RebindToUnsigned<decltype(d)> du;
2988  const RebindToSigned<decltype(d)> di;
2989  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2990  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2991  return IfThenElse(gt, b, a);
2992 }
2993 
2994 } // namespace detail
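A scalar sketch of MinU (helper name illustrative): XOR-ing both operands with the sign bit maps unsigned order onto signed order, so the signed comparison available on SSSE3 can select the unsigned minimum.

static inline uint32_t MinU32(uint32_t a, uint32_t b) {
  const int32_t sa = static_cast<int32_t>(a ^ 0x80000000u);
  const int32_t sb = static_cast<int32_t>(b ^ 0x80000000u);
  return (sa > sb) ? b : a;  // same selection as IfThenElse(gt, b, a) above
}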
2995 
2996 // Unsigned
2997 template <size_t N>
2998 HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
2999  const Vec128<uint8_t, N> b) {
3000  return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
3001 }
3002 template <size_t N>
3003 HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
3004  const Vec128<uint16_t, N> b) {
3005 #if HWY_TARGET == HWY_SSSE3
3006  return detail::MinU(a, b);
3007 #else
3008  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
3009 #endif
3010 }
3011 template <size_t N>
3012 HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
3013  const Vec128<uint32_t, N> b) {
3014 #if HWY_TARGET == HWY_SSSE3
3015  return detail::MinU(a, b);
3016 #else
3017  return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
3018 #endif
3019 }
3020 template <size_t N>
3021 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
3022  const Vec128<uint64_t, N> b) {
3023 #if HWY_TARGET <= HWY_AVX3
3024  return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
3025 #else
3026  return detail::MinU(a, b);
3027 #endif
3028 }
3029 
3030 // Signed
3031 template <size_t N>
3032 HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
3033  const Vec128<int8_t, N> b) {
3034 #if HWY_TARGET == HWY_SSSE3
3035  return IfThenElse(a < b, a, b);
3036 #else
3037  return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
3038 #endif
3039 }
3040 template <size_t N>
3041 HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
3042  const Vec128<int16_t, N> b) {
3043  return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
3044 }
3045 template <size_t N>
3046 HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
3047  const Vec128<int32_t, N> b) {
3048 #if HWY_TARGET == HWY_SSSE3
3049  return IfThenElse(a < b, a, b);
3050 #else
3051  return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
3052 #endif
3053 }
3054 template <size_t N>
3055 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
3056  const Vec128<int64_t, N> b) {
3057 #if HWY_TARGET <= HWY_AVX3
3058  return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
3059 #else
3060  return IfThenElse(a < b, a, b);
3061 #endif
3062 }
3063 
3064 // Float
3065 template <size_t N>
3066 HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
3067  const Vec128<float, N> b) {
3068  return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
3069 }
3070 template <size_t N>
3071 HWY_API Vec128<double, N> Min(const Vec128<double, N> a,
3072  const Vec128<double, N> b) {
3073  return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
3074 }
3075 
3076 // ------------------------------ Max (Gt, IfThenElse)
3077 
3078 namespace detail {
3079 template <typename T, size_t N>
3080 HWY_INLINE Vec128<T, N> MaxU(const Vec128<T, N> a,
3081  const Vec128<T, N> b) {
3082  const DFromV<decltype(a)> d;
3083  const RebindToUnsigned<decltype(d)> du;
3084  const RebindToSigned<decltype(d)> di;
3085  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
3086  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
3087  return IfThenElse(gt, a, b);
3088 }
3089 
3090 } // namespace detail
3091 
3092 // Unsigned
3093 template <size_t N>
3094 HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
3095  const Vec128<uint8_t, N> b) {
3096  return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
3097 }
3098 template <size_t N>
3099 HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
3100  const Vec128<uint16_t, N> b) {
3101 #if HWY_TARGET == HWY_SSSE3
3102  return detail::MaxU(a, b);
3103 #else
3104  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
3105 #endif
3106 }
3107 template <size_t N>
3108 HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
3109  const Vec128<uint32_t, N> b) {
3110 #if HWY_TARGET == HWY_SSSE3
3111  return detail::MaxU(a, b);
3112 #else
3113  return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
3114 #endif
3115 }
3116 template <size_t N>
3117 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
3118  const Vec128<uint64_t, N> b) {
3119 #if HWY_TARGET <= HWY_AVX3
3120  return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
3121 #else
3122  return detail::MaxU(a, b);
3123 #endif
3124 }
3125 
3126 // Signed
3127 template <size_t N>
3128 HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
3129  const Vec128<int8_t, N> b) {
3130 #if HWY_TARGET == HWY_SSSE3
3131  return IfThenElse(a < b, b, a);
3132 #else
3133  return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
3134 #endif
3135 }
3136 template <size_t N>
3137 HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
3138  const Vec128<int16_t, N> b) {
3139  return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
3140 }
3141 template <size_t N>
3142 HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
3143  const Vec128<int32_t, N> b) {
3144 #if HWY_TARGET == HWY_SSSE3
3145  return IfThenElse(a < b, b, a);
3146 #else
3147  return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
3148 #endif
3149 }
3150 template <size_t N>
3151 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
3152  const Vec128<int64_t, N> b) {
3153 #if HWY_TARGET <= HWY_AVX3
3154  return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
3155 #else
3156  return IfThenElse(a < b, b, a);
3157 #endif
3158 }
3159 
3160 // Float
3161 template <size_t N>
3162 HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
3163  const Vec128<float, N> b) {
3164  return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
3165 }
3166 template <size_t N>
3167 HWY_API Vec128<double, N> Max(const Vec128<double, N> a,
3168  const Vec128<double, N> b) {
3169  return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
3170 }
3171 
3172 // ================================================== MEMORY (2)
3173 
3174 // ------------------------------ Non-temporal stores
3175 
3176 // On clang6, we see incorrect code generated for _mm_stream_pi, so
3177 // round even partial vectors up to 16 bytes.
3178 template <typename T, size_t N>
3179 HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
3180  T* HWY_RESTRICT aligned) {
3181  _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
3182 }
3183 template <size_t N>
3184 HWY_API void Stream(const Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3185  float* HWY_RESTRICT aligned) {
3186  _mm_stream_ps(aligned, v.raw);
3187 }
3188 template <size_t N>
3189 HWY_API void Stream(const Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3190  double* HWY_RESTRICT aligned) {
3191  _mm_stream_pd(aligned, v.raw);
3192 }
3193 
3194 // ------------------------------ Scatter
3195 
3196 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3197 HWY_DIAGNOSTICS(push)
3198 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3199 
3200 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
3201 using GatherIndex64 = long long int; // NOLINT(runtime/int)
3202 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
3203 
3204 #if HWY_TARGET <= HWY_AVX3
3205 namespace detail {
3206 
3207 template <typename T, size_t N>
3208 HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
3209  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3210  const Vec128<int32_t, N> offset) {
3211  if (N == 4) {
3212  _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
3213  } else {
3214  const __mmask8 mask = (1u << N) - 1;
3215  _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
3216  }
3217 }
3218 template <typename T, size_t N>
3219 HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
3220  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3221  const Vec128<int32_t, N> index) {
3222  if (N == 4) {
3223  _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
3224  } else {
3225  const __mmask8 mask = (1u << N) - 1;
3226  _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
3227  }
3228 }
3229 
3230 template <typename T, size_t N>
3231 HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
3232  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3233  const Vec128<int64_t, N> offset) {
3234  if (N == 2) {
3235  _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
3236  } else {
3237  const __mmask8 mask = (1u << N) - 1;
3238  _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
3239  }
3240 }
3241 template <typename T, size_t N>
3242 HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
3243  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3244  const Vec128<int64_t, N> index) {
3245  if (N == 2) {
3246  _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
3247  } else {
3248  const __mmask8 mask = (1u << N) - 1;
3249  _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
3250  }
3251 }
3252 
3253 } // namespace detail
3254 
3255 template <typename T, size_t N, typename Offset>
3256 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
3257  T* HWY_RESTRICT base,
3258  const Vec128<Offset, N> offset) {
3259  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3260  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
3261 }
3262 template <typename T, size_t N, typename Index>
3263 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
3264  const Vec128<Index, N> index) {
3265  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3266  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
3267 }
3268 
3269 template <size_t N>
3270 HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3271  float* HWY_RESTRICT base,
3272  const Vec128<int32_t, N> offset) {
3273  if (N == 4) {
3274  _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
3275  } else {
3276  const __mmask8 mask = (1u << N) - 1;
3277  _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
3278  }
3279 }
3280 template <size_t N>
3281 HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3282  float* HWY_RESTRICT base,
3283  const Vec128<int32_t, N> index) {
3284  if (N == 4) {
3285  _mm_i32scatter_ps(base, index.raw, v.raw, 4);
3286  } else {
3287  const __mmask8 mask = (1u << N) - 1;
3288  _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
3289  }
3290 }
3291 
3292 template <size_t N>
3293 HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3294  double* HWY_RESTRICT base,
3295  const Vec128<int64_t, N> offset) {
3296  if (N == 2) {
3297  _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
3298  } else {
3299  const __mmask8 mask = (1u << N) - 1;
3300  _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
3301  }
3302 }
3303 template <size_t N>
3304 HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3305  double* HWY_RESTRICT base,
3306  const Vec128<int64_t, N> index) {
3307  if (N == 2) {
3308  _mm_i64scatter_pd(base, index.raw, v.raw, 8);
3309  } else {
3310  const __mmask8 mask = (1u << N) - 1;
3311  _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
3312  }
3313 }
3314 #else // HWY_TARGET <= HWY_AVX3
3315 
3316 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
3317 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
3318  T* HWY_RESTRICT base,
3319  const Vec128<Offset, N> offset) {
3320  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3321 
3322  alignas(16) T lanes[N];
3323  Store(v, d, lanes);
3324 
3325  alignas(16) Offset offset_lanes[N];
3326  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3327 
3328  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
3329  for (size_t i = 0; i < N; ++i) {
3330  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
3331  }
3332 }
3333 
3334 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
3335 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
3336  const Vec128<Index, N> index) {
3337  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3338 
3339  alignas(16) T lanes[N];
3340  Store(v, d, lanes);
3341 
3342  alignas(16) Index index_lanes[N];
3343  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3344 
3345  for (size_t i = 0; i < N; ++i) {
3346  base[index_lanes[i]] = lanes[i];
3347  }
3348 }
3349 
3350 #endif
3351 
3352 // ------------------------------ Gather (Load/Store)
3353 
3354 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3355 
3356 template <typename T, size_t N, typename Offset>
3357 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
3358  const T* HWY_RESTRICT base,
3359  const Vec128<Offset, N> offset) {
3360  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3361 
3362  alignas(16) Offset offset_lanes[N];
3363  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3364 
3365  alignas(16) T lanes[N];
3366  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
3367  for (size_t i = 0; i < N; ++i) {
3368  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
3369  }
3370  return Load(d, lanes);
3371 }
3372 
3373 template <typename T, size_t N, typename Index>
3374 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
3375  const T* HWY_RESTRICT base,
3376  const Vec128<Index, N> index) {
3377  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3378 
3379  alignas(16) Index index_lanes[N];
3380  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3381 
3382  alignas(16) T lanes[N];
3383  for (size_t i = 0; i < N; ++i) {
3384  lanes[i] = base[index_lanes[i]];
3385  }
3386  return Load(d, lanes);
3387 }
3388 
3389 #else
3390 
3391 namespace detail {
3392 
3393 template <typename T, size_t N>
3394 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
3395  Simd<T, N, 0> /* d */,
3396  const T* HWY_RESTRICT base,
3397  const Vec128<int32_t, N> offset) {
3398  return Vec128<T, N>{_mm_i32gather_epi32(
3399  reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
3400 }
3401 template <typename T, size_t N>
3402 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
3403  Simd<T, N, 0> /* d */,
3404  const T* HWY_RESTRICT base,
3405  const Vec128<int32_t, N> index) {
3406  return Vec128<T, N>{_mm_i32gather_epi32(
3407  reinterpret_cast<const int32_t*>(base), index.raw, 4)};
3408 }
3409 
3410 template <typename T, size_t N>
3411 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
3412  Simd<T, N, 0> /* d */,
3413  const T* HWY_RESTRICT base,
3414  const Vec128<int64_t, N> offset) {
3415  return Vec128<T, N>{_mm_i64gather_epi64(
3416  reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
3417 }
3418 template <typename T, size_t N>
3419 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
3420  Simd<T, N, 0> /* d */,
3421  const T* HWY_RESTRICT base,
3422  const Vec128<int64_t, N> index) {
3423  return Vec128<T, N>{_mm_i64gather_epi64(
3424  reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
3425 }
3426 
3427 } // namespace detail
3428 
3429 template <typename T, size_t N, typename Offset>
3430 HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3431  const Vec128<Offset, N> offset) {
3432  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
3433 }
3434 template <typename T, size_t N, typename Index>
3435 HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3436  const Vec128<Index, N> index) {
3437  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
3438 }
3439 
3440 template <size_t N>
3441 HWY_API Vec128<float, N> GatherOffset(Simd<float, N, 0> /* tag */,
3442  const float* HWY_RESTRICT base,
3443  const Vec128<int32_t, N> offset) {
3444  return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
3445 }
3446 template <size_t N>
3447 HWY_API Vec128<float, N> GatherIndex(Simd<float, N, 0> /* tag */,
3448  const float* HWY_RESTRICT base,
3449  const Vec128<int32_t, N> index) {
3450  return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
3451 }
3452 
3453 template <size_t N>
3454 HWY_API Vec128<double, N> GatherOffset(Simd<double, N, 0> /* tag */,
3455  const double* HWY_RESTRICT base,
3456  const Vec128<int64_t, N> offset) {
3457  return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
3458 }
3459 template <size_t N>
3460 HWY_API Vec128<double, N> GatherIndex(Simd<double, N, 0> /* tag */,
3461  const double* HWY_RESTRICT base,
3462  const Vec128<int64_t, N> index) {
3463  return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3464 }
3465 
3466 #endif // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3467 
3468 HWY_DIAGNOSTICS(pop)
3469 
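// --- Illustrative usage sketch (not part of the original header) ---
// Gathers four floats from a lookup table via GatherIndex; lane i of the
// result is kTable[kIdx[i]]. The function name and the kTable/kIdx constants
// are assumptions made for this example only.
HWY_API Vec128<float> GatherIndexExample() {
  const Full128<float> d;
  const Full128<int32_t> di;
  alignas(16) static const float kTable[8] = {0.f,  10.f, 20.f, 30.f,
                                              40.f, 50.f, 60.f, 70.f};
  alignas(16) static const int32_t kIdx[4] = {7, 2, 5, 0};
  return GatherIndex(d, kTable, Load(di, kIdx));  // lanes: 70, 20, 50, 0
}
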
3470 // ================================================== SWIZZLE (2)
3471 
3472 // ------------------------------ LowerHalf
3473 
3474 // Returns upper/lower half of a vector.
3475 template <typename T, size_t N>
3476 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
3477  Vec128<T, N> v) {
3478  return Vec128<T, N / 2>{v.raw};
3479 }
3480 
3481 template <typename T, size_t N>
3482 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
3483  return LowerHalf(Simd<T, N / 2, 0>(), v);
3484 }
3485 
3486 // ------------------------------ ShiftLeftBytes
3487 
3488 template <int kBytes, typename T, size_t N>
3489 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3490  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3491  return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3492 }
3493 
3494 template <int kBytes, typename T, size_t N>
3495 HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3496  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
3497 }
3498 
3499 // ------------------------------ ShiftLeftLanes
3500 
3501 template <int kLanes, typename T, size_t N>
3502 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3503  const Repartition<uint8_t, decltype(d)> d8;
3504  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3505 }
3506 
3507 template <int kLanes, typename T, size_t N>
3508 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3509  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
3510 }
3511 
3512 // ------------------------------ ShiftRightBytes
3513 template <int kBytes, typename T, size_t N>
3514 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3515  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3516  // For partial vectors, clear upper lanes so we shift in zeros.
3517  if (N != 16 / sizeof(T)) {
3518  const Vec128<T> vfull{v.raw};
3519  v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
3520  }
3521  return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3522 }
3523 
3524 // ------------------------------ ShiftRightLanes
3525 template <int kLanes, typename T, size_t N>
3526 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3527  const Repartition<uint8_t, decltype(d)> d8;
3528  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3529 }
3530 
3531 // ------------------------------ UpperHalf (ShiftRightBytes)
3532 
3533 // Full input: copy hi into lo (smaller instruction encoding than shifts).
3534 template <typename T>
3535 HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, Vec128<T> v) {
3536  return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)};
3537 }
3538 HWY_API Vec128<float, 2> UpperHalf(Full64<float> /* tag */, Vec128<float> v) {
3539  return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3540 }
3541 HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */, Vec128<double> v) {
3542  return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
3543 }
3544 
3545 // Partial
3546 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3547 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3548  Vec128<T, N> v) {
3549  const DFromV<decltype(v)> d;
3550  const RebindToUnsigned<decltype(d)> du;
3551  const auto vu = BitCast(du, v);
3552  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3553  return Vec128<T, (N + 1) / 2>{upper.raw};
3554 }
3555 
3556 // ------------------------------ ExtractLane (UpperHalf)
3557 
3558 namespace detail {
3559 
3560 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3561 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3562  static_assert(kLane < N, "Lane index out of bounds");
3563 #if HWY_TARGET == HWY_SSSE3
3564  const int pair = _mm_extract_epi16(v.raw, kLane / 2);
3565  constexpr int kShift = kLane & 1 ? 8 : 0;
3566  return static_cast<T>((pair >> kShift) & 0xFF);
3567 #else
3568  return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
3569 #endif
3570 }
3571 
3572 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3573 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3574  static_assert(kLane < N, "Lane index out of bounds");
3575  return static_cast<T>(_mm_extract_epi16(v.raw, kLane) & 0xFFFF);
3576 }
3577 
3578 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3579 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3580  static_assert(kLane < N, "Lane index out of bounds");
3581 #if HWY_TARGET == HWY_SSSE3
3582  alignas(16) T lanes[4];
3583  Store(v, DFromV<decltype(v)>(), lanes);
3584  return lanes[kLane];
3585 #else
3586  return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
3587 #endif
3588 }
3589 
3590 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3591 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3592  static_assert(kLane < N, "Lane index out of bounds");
3593 #if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3594  alignas(16) T lanes[2];
3595  Store(v, DFromV<decltype(v)>(), lanes);
3596  return lanes[kLane];
3597 #else
3598  return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
3599 #endif
3600 }
3601 
3602 template <size_t kLane, size_t N>
3603 HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
3604  static_assert(kLane < N, "Lane index out of bounds");
3605 #if HWY_TARGET == HWY_SSSE3
3606  alignas(16) float lanes[4];
3607  Store(v, DFromV<decltype(v)>(), lanes);
3608  return lanes[kLane];
3609 #else
3610  // Bug in the intrinsic, returns int but should be float.
3611  const int bits = _mm_extract_ps(v.raw, kLane);
3612  float ret;
3613  CopyBytes<4>(&bits, &ret);
3614  return ret;
3615 #endif
3616 }
3617 
3618 // There is no extract_pd; two overloads because there is no UpperHalf for N=1.
3619 template <size_t kLane>
3620 HWY_INLINE double ExtractLane(const Vec128<double, 1> v) {
3621  static_assert(kLane == 0, "Lane index out of bounds");
3622  return GetLane(v);
3623 }
3624 
3625 template <size_t kLane>
3626 HWY_INLINE double ExtractLane(const Vec128<double> v) {
3627  static_assert(kLane < 2, "Lane index out of bounds");
3628  const Half<DFromV<decltype(v)>> dh;
3629  return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v));
3630 }
3631 
3632 } // namespace detail
3633 
3634 // Requires one overload per vector length because ExtractLane<3> may be a
3635 // compile error if it calls _mm_extract_epi64.
3636 template <typename T>
3637 HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
3638  HWY_DASSERT(i == 0);
3639  (void)i;
3640  return GetLane(v);
3641 }
3642 
3643 template <typename T>
3644 HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
3645 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3646  if (__builtin_constant_p(i)) {
3647  switch (i) {
3648  case 0:
3649  return detail::ExtractLane<0>(v);
3650  case 1:
3651  return detail::ExtractLane<1>(v);
3652  }
3653  }
3654 #endif
3655  alignas(16) T lanes[2];
3656  Store(v, DFromV<decltype(v)>(), lanes);
3657  return lanes[i];
3658 }
3659 
3660 template <typename T>
3661 HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
3662 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3663  if (__builtin_constant_p(i)) {
3664  switch (i) {
3665  case 0:
3666  return detail::ExtractLane<0>(v);
3667  case 1:
3668  return detail::ExtractLane<1>(v);
3669  case 2:
3670  return detail::ExtractLane<2>(v);
3671  case 3:
3672  return detail::ExtractLane<3>(v);
3673  }
3674  }
3675 #endif
3676  alignas(16) T lanes[4];
3677  Store(v, DFromV<decltype(v)>(), lanes);
3678  return lanes[i];
3679 }
3680 
3681 template <typename T>
3682 HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
3683 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3684  if (__builtin_constant_p(i)) {
3685  switch (i) {
3686  case 0:
3687  return detail::ExtractLane<0>(v);
3688  case 1:
3689  return detail::ExtractLane<1>(v);
3690  case 2:
3691  return detail::ExtractLane<2>(v);
3692  case 3:
3693  return detail::ExtractLane<3>(v);
3694  case 4:
3695  return detail::ExtractLane<4>(v);
3696  case 5:
3697  return detail::ExtractLane<5>(v);
3698  case 6:
3699  return detail::ExtractLane<6>(v);
3700  case 7:
3701  return detail::ExtractLane<7>(v);
3702  }
3703  }
3704 #endif
3705  alignas(16) T lanes[8];
3706  Store(v, DFromV<decltype(v)>(), lanes);
3707  return lanes[i];
3708 }
3709 
3710 template <typename T>
3711 HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
3712 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3713  if (__builtin_constant_p(i)) {
3714  switch (i) {
3715  case 0:
3716  return detail::ExtractLane<0>(v);
3717  case 1:
3718  return detail::ExtractLane<1>(v);
3719  case 2:
3720  return detail::ExtractLane<2>(v);
3721  case 3:
3722  return detail::ExtractLane<3>(v);
3723  case 4:
3724  return detail::ExtractLane<4>(v);
3725  case 5:
3726  return detail::ExtractLane<5>(v);
3727  case 6:
3728  return detail::ExtractLane<6>(v);
3729  case 7:
3730  return detail::ExtractLane<7>(v);
3731  case 8:
3732  return detail::ExtractLane<8>(v);
3733  case 9:
3734  return detail::ExtractLane<9>(v);
3735  case 10:
3736  return detail::ExtractLane<10>(v);
3737  case 11:
3738  return detail::ExtractLane<11>(v);
3739  case 12:
3740  return detail::ExtractLane<12>(v);
3741  case 13:
3742  return detail::ExtractLane<13>(v);
3743  case 14:
3744  return detail::ExtractLane<14>(v);
3745  case 15:
3746  return detail::ExtractLane<15>(v);
3747  }
3748  }
3749 #endif
3750  alignas(16) T lanes[16];
3751  Store(v, DFromV<decltype(v)>(), lanes);
3752  return lanes[i];
3753 }
3754 
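// --- Illustrative usage sketch (not part of the original header) ---
// Reads a lane whose index is only known at runtime. When the index is a
// compile-time constant, the __builtin_constant_p dispatch above selects the
// single-instruction detail::ExtractLane; otherwise it falls back to
// Store + scalar load. The function name is an assumption for this example.
HWY_API int32_t ExtractLaneExample(size_t i) {
  const Full128<int32_t> d;
  const Vec128<int32_t> v = Iota(d, 100);  // lanes: 100, 101, 102, 103
  return ExtractLane(v, i);                // e.g. 102 for i == 2
}
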
3755 // ------------------------------ InsertLane (UpperHalf)
3756 
3757 namespace detail {
3758 
3759 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3760 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3761  static_assert(kLane < N, "Lane index out of bounds");
3762 #if HWY_TARGET == HWY_SSSE3
3763  const DFromV<decltype(v)> d;
3764  alignas(16) T lanes[16];
3765  Store(v, d, lanes);
3766  lanes[kLane] = t;
3767  return Load(d, lanes);
3768 #else
3769  return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
3770 #endif
3771 }
3772 
3773 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3774 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3775  static_assert(kLane < N, "Lane index out of bounds");
3776  return Vec128<T, N>{_mm_insert_epi16(v.raw, t, kLane)};
3777 }
3778 
3779 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3780 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3781  static_assert(kLane < N, "Lane index out of bounds");
3782 #if HWY_TARGET == HWY_SSSE3
3783  alignas(16) T lanes[4];
3784  const DFromV<decltype(v)> d;
3785  Store(v, d, lanes);
3786  lanes[kLane] = t;
3787  return Load(d, lanes);
3788 #else
3789  MakeSigned<T> ti;
3790  CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
3791  return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
3792 #endif
3793 }
3794 
3795 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3796 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3797  static_assert(kLane < N, "Lane index out of bounds");
3798 #if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3799  const DFromV<decltype(v)> d;
3800  alignas(16) T lanes[2];
3801  Store(v, d, lanes);
3802  lanes[kLane] = t;
3803  return Load(d, lanes);
3804 #else
3805  MakeSigned<T> ti;
3806  CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
3807  return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
3808 #endif
3809 }
3810 
3811 template <size_t kLane, size_t N>
3812 HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
3813  static_assert(kLane < N, "Lane index out of bounds");
3814 #if HWY_TARGET == HWY_SSSE3
3815  const DFromV<decltype(v)> d;
3816  alignas(16) float lanes[4];
3817  Store(v, d, lanes);
3818  lanes[kLane] = t;
3819  return Load(d, lanes);
3820 #else
3821  return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
3822 #endif
3823 }
3824 
3825 // There is no insert_pd; two overloads because there is no UpperHalf for N=1.
3826 template <size_t kLane>
3827 HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) {
3828  static_assert(kLane == 0, "Lane index out of bounds");
3829  return Set(DFromV<decltype(v)>(), t);
3830 }
3831 
3832 template <size_t kLane>
3833 HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
3834  static_assert(kLane < 2, "Lane index out of bounds");
3835  const DFromV<decltype(v)> d;
3836  const Vec128<double> vt = Set(d, t);
3837  if (kLane == 0) {
3838  return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)};
3839  }
3840  return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)};
3841 }
3842 
3843 } // namespace detail
3844 
3845 // Requires one overload per vector length because InsertLane<3> may be a
3846 // compile error if it calls _mm_insert_epi64.
3847 
3848 template <typename T>
3849 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
3850  HWY_DASSERT(i == 0);
3851  (void)i;
3852  return Set(DFromV<decltype(v)>(), t);
3853 }
3854 
3855 template <typename T>
3856 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
3857 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3858  if (__builtin_constant_p(i)) {
3859  switch (i) {
3860  case 0:
3861  return detail::InsertLane<0>(v, t);
3862  case 1:
3863  return detail::InsertLane<1>(v, t);
3864  }
3865  }
3866 #endif
3867  const DFromV<decltype(v)> d;
3868  alignas(16) T lanes[2];
3869  Store(v, d, lanes);
3870  lanes[i] = t;
3871  return Load(d, lanes);
3872 }
3873 
3874 template <typename T>
3875 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
3876 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3877  if (__builtin_constant_p(i)) {
3878  switch (i) {
3879  case 0:
3880  return detail::InsertLane<0>(v, t);
3881  case 1:
3882  return detail::InsertLane<1>(v, t);
3883  case 2:
3884  return detail::InsertLane<2>(v, t);
3885  case 3:
3886  return detail::InsertLane<3>(v, t);
3887  }
3888  }
3889 #endif
3890  const DFromV<decltype(v)> d;
3891  alignas(16) T lanes[4];
3892  Store(v, d, lanes);
3893  lanes[i] = t;
3894  return Load(d, lanes);
3895 }
3896 
3897 template <typename T>
3898 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
3899 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3900  if (__builtin_constant_p(i)) {
3901  switch (i) {
3902  case 0:
3903  return detail::InsertLane<0>(v, t);
3904  case 1:
3905  return detail::InsertLane<1>(v, t);
3906  case 2:
3907  return detail::InsertLane<2>(v, t);
3908  case 3:
3909  return detail::InsertLane<3>(v, t);
3910  case 4:
3911  return detail::InsertLane<4>(v, t);
3912  case 5:
3913  return detail::InsertLane<5>(v, t);
3914  case 6:
3915  return detail::InsertLane<6>(v, t);
3916  case 7:
3917  return detail::InsertLane<7>(v, t);
3918  }
3919  }
3920 #endif
3921  const DFromV<decltype(v)> d;
3922  alignas(16) T lanes[8];
3923  Store(v, d, lanes);
3924  lanes[i] = t;
3925  return Load(d, lanes);
3926 }
3927 
3928 template <typename T>
3929 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
3930 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3931  if (__builtin_constant_p(i)) {
3932  switch (i) {
3933  case 0:
3934  return detail::InsertLane<0>(v, t);
3935  case 1:
3936  return detail::InsertLane<1>(v, t);
3937  case 2:
3938  return detail::InsertLane<2>(v, t);
3939  case 3:
3940  return detail::InsertLane<3>(v, t);
3941  case 4:
3942  return detail::InsertLane<4>(v, t);
3943  case 5:
3944  return detail::InsertLane<5>(v, t);
3945  case 6:
3946  return detail::InsertLane<6>(v, t);
3947  case 7:
3948  return detail::InsertLane<7>(v, t);
3949  case 8:
3950  return detail::InsertLane<8>(v, t);
3951  case 9:
3952  return detail::InsertLane<9>(v, t);
3953  case 10:
3954  return detail::InsertLane<10>(v, t);
3955  case 11:
3956  return detail::InsertLane<11>(v, t);
3957  case 12:
3958  return detail::InsertLane<12>(v, t);
3959  case 13:
3960  return detail::InsertLane<13>(v, t);
3961  case 14:
3962  return detail::InsertLane<14>(v, t);
3963  case 15:
3964  return detail::InsertLane<15>(v, t);
3965  }
3966  }
3967 #endif
3968  const DFromV<decltype(v)> d;
3969  alignas(16) T lanes[16];
3970  Store(v, d, lanes);
3971  lanes[i] = t;
3972  return Load(d, lanes);
3973 }
3974 
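// --- Illustrative usage sketch (not part of the original header) ---
// Overwrites a single lane chosen at runtime; the other lanes are unchanged.
// With a constant index, the dispatch above uses detail::InsertLane, which is
// a single insert instruction on SSE4+. The function name and the value 3.5f
// are assumptions for this example.
HWY_API Vec128<float> InsertLaneExample(Vec128<float> v, size_t i) {
  return InsertLane(v, i, 3.5f);  // lane i becomes 3.5f
}
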
3975 // ------------------------------ CombineShiftRightBytes
3976 
3977 template <int kBytes, typename T, class V = Vec128<T>>
3978 HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
3979  const Repartition<uint8_t, decltype(d)> d8;
3980  return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
3981  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
3982 }
3983 
3984 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
3985  class V = Vec128<T, N>>
3986 HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
3987  constexpr size_t kSize = N * sizeof(T);
3988  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3989  const Repartition<uint8_t, decltype(d)> d8;
3990  const Full128<uint8_t> d_full8;
3991  using V8 = VFromD<decltype(d_full8)>;
3992  const V8 hi8{BitCast(d8, hi).raw};
3993  // Move into most-significant bytes
3994  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
3995  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
3996  return V{BitCast(Full128<T>(), r).raw};
3997 }
3998 
3999 // ------------------------------ Broadcast/splat any lane
4000 
4001 // Unsigned
4002 template <int kLane, size_t N>
4003 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
4004  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4005  if (kLane < 4) {
4006  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
4007  return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
4008  } else {
4009  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
4010  return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
4011  }
4012 }
4013 template <int kLane, size_t N>
4014 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
4015  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4016  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
4017 }
4018 template <int kLane, size_t N>
4019 HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
4020  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4021  return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
4022 }
4023 
4024 // Signed
4025 template <int kLane, size_t N>
4026 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
4027  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4028  if (kLane < 4) {
4029  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
4030  return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
4031  } else {
4032  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
4033  return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
4034  }
4035 }
4036 template <int kLane, size_t N>
4037 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
4038  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4039  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
4040 }
4041 template <int kLane, size_t N>
4042 HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
4043  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4044  return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
4045 }
4046 
4047 // Float
4048 template <int kLane, size_t N>
4049 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
4050  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4051  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
4052 }
4053 template <int kLane, size_t N>
4054 HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
4055  static_assert(0 <= kLane && kLane < N, "Invalid lane");
4056  return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
4057 }
4058 
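// --- Illustrative usage sketch (not part of the original header) ---
// Splats lane 2 of a u32 vector to all lanes via Broadcast. The function name
// is an assumption for this example.
HWY_API Vec128<uint32_t> BroadcastExample() {
  const Full128<uint32_t> d;
  const Vec128<uint32_t> v = Iota(d, 0);  // lanes: 0, 1, 2, 3
  return Broadcast<2>(v);                 // lanes: 2, 2, 2, 2
}
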
4059 // ------------------------------ TableLookupLanes (Shuffle01)
4060 
4061 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
4062 template <typename T, size_t N = 16 / sizeof(T)>
4063 struct Indices128 {
4064  __m128i raw;
4065 };
4066 
4067 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
4068  HWY_IF_LANE_SIZE(T, 4)>
4069 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
4070  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
4071 #if HWY_IS_DEBUG_BUILD
4072  const Rebind<TI, decltype(d)> di;
4073  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4074  AllTrue(di, Lt(vec, Set(di, N))));
4075 #endif
4076 
4077 #if HWY_TARGET <= HWY_AVX2
4078  (void)d;
4079  return Indices128<T, N>{vec.raw};
4080 #else
4081  const Repartition<uint8_t, decltype(d)> d8;
4082  using V8 = VFromD<decltype(d8)>;
4083  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
4084  0, 1, 2, 3, 0, 1, 2, 3};
4085 
4086  // Broadcast each lane index to all 4 bytes of T
4087  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
4088  0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
4089  const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
4090 
4091  // Shift to bytes
4092  const Repartition<uint16_t, decltype(d)> d16;
4093  const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
4094 
4095  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
4096 #endif
4097 }
4098 
4099 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
4100  HWY_IF_LANE_SIZE(T, 8)>
4101 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
4102  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
4103 #if HWY_IS_DEBUG_BUILD
4104  const Rebind<TI, decltype(d)> di;
4105  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4106  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
4107 #else
4108  (void)d;
4109 #endif
4110 
4111  // No change - even without AVX3, we can shuffle+blend.
4112  return Indices128<T, N>{vec.raw};
4113 }
4114 
4115 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
4116 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
4117  const Rebind<TI, decltype(d)> di;
4118  return IndicesFromVec(d, LoadU(di, idx));
4119 }
4120 
4121 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4122 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
4123 #if HWY_TARGET <= HWY_AVX2
4124  const DFromV<decltype(v)> d;
4125  const RebindToFloat<decltype(d)> df;
4126  const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
4127  return BitCast(d, perm);
4128 #else
4129  return TableLookupBytes(v, Vec128<T, N>{idx.raw});
4130 #endif
4131 }
4132 
4133 template <size_t N, HWY_IF_GE64(float, N)>
4134 HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
4135  Indices128<float, N> idx) {
4136 #if HWY_TARGET <= HWY_AVX2
4137  return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
4138 #else
4139  const DFromV<decltype(v)> df;
4140  const RebindToSigned<decltype(df)> di;
4141  return BitCast(df,
4142  TableLookupLanes(BitCast(di, v), Indices128<int32_t, N>{idx.raw}));
4143 #endif
4144 }
4145 
4146 // Single lane: no change
4147 template <typename T>
4148 HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
4149  Indices128<T, 1> /* idx */) {
4150  return v;
4151 }
4152 
4153 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4154 HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
4155  const Full128<T> d;
4156  Vec128<int64_t> vidx{idx.raw};
4157 #if HWY_TARGET <= HWY_AVX2
4158  // There is no _mm_permute[x]var_epi64.
4159  vidx += vidx; // bit1 is the decider (unusual)
4160  const Full128<double> df;
4161  return BitCast(
4162  d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
4163 #else
4164  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
4165  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
4166  // to obtain an all-zero or all-one mask.
4167  const Full128<int64_t> di;
4168  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
4169  const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
4170  return IfThenElse(mask_same, v, Shuffle01(v));
4171 #endif
4172 }
4173 
4174 HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
4175  Indices128<double> idx) {
4176  Vec128<int64_t> vidx{idx.raw};
4177 #if HWY_TARGET <= HWY_AVX2
4178  vidx += vidx; // bit1 is the decider (unusual)
4179  return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
4180 #else
4181  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
4182  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
4183  // to obtain an all-zero or all-one mask.
4184  const Full128<double> d;
4185  const Full128<int64_t> di;
4186  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
4187  const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
4188  return IfThenElse(mask_same, v, Shuffle01(v));
4189 #endif
4190 }
4191 
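// --- Illustrative usage sketch (not part of the original header) ---
// Reverses the four u32 lanes with a runtime permutation: SetTableIndices
// validates/encodes the indices and TableLookupLanes applies them. The
// function name and kRev are assumptions for this example.
HWY_API Vec128<uint32_t> TableLookupLanesExample(Vec128<uint32_t> v) {
  const Full128<uint32_t> d;
  alignas(16) static const int32_t kRev[4] = {3, 2, 1, 0};
  const Indices128<uint32_t> idx = SetTableIndices(d, kRev);
  return TableLookupLanes(v, idx);  // lanes: v3, v2, v1, v0
}
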
4192 // ------------------------------ ReverseBlocks
4193 
4194 // Single block: no change
4195 template <typename T>
4196 HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
4197  return v;
4198 }
4199 
4200 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
4201 
4202 // Single lane: no change
4203 template <typename T>
4204 HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
4205  return v;
4206 }
4207 
4208 // Two lanes: shuffle
4209 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4210 HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
4211  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
4212 }
4213 
4214 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4215 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4216  return Shuffle01(v);
4217 }
4218 
4219 // Four lanes: shuffle
4220 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4221 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4222  return Shuffle0123(v);
4223 }
4224 
4225 // 16-bit
4226 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4227 HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
4228 #if HWY_TARGET <= HWY_AVX3
4229  if (N == 1) return v;
4230  if (N == 2) {
4231  const Repartition<uint32_t, decltype(d)> du32;
4232  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
4233  }
4234  const RebindToSigned<decltype(d)> di;
4235  alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4236  const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
4237  return BitCast(d, Vec128<int16_t, N>{
4238  _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4239 #else
4240  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
4241  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
4242 #endif
4243 }
4244 
4245 // ------------------------------ Reverse2
4246 
4247 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4248 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
4249  const Repartition<uint32_t, decltype(d)> du32;
4250  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
4251 }
4252 
4253 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4254 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4255  return Shuffle2301(v);
4256 }
4257 
4258 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4259 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4260  return Shuffle01(v);
4261 }
4262 
4263 // ------------------------------ Reverse4
4264 
4265 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4266 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
4267  const RebindToSigned<decltype(d)> di;
4268  // 4x 16-bit: a single shufflelo suffices.
4269  if (N == 4) {
4270  return BitCast(d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
4271  BitCast(di, v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
4272  }
4273 
4274 #if HWY_TARGET <= HWY_AVX3
4275  alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
4276  const Vec128<int16_t, N> idx = Load(di, kReverse4);
4277  return BitCast(d, Vec128<int16_t, N>{
4278  _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4279 #else
4280  const RepartitionToWide<decltype(di)> dw;
4281  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
4282 #endif
4283 }
4284 
4285 // 4x 32-bit: use Shuffle0123
4286 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4287 HWY_API Vec128<T> Reverse4(Full128<T> /* tag */, const Vec128<T> v) {
4288  return Shuffle0123(v);
4289 }
4290 
4291 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4292 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
4293  HWY_ASSERT(0); // don't have 4 u64 lanes
4294 }
4295 
4296 // ------------------------------ Reverse8
4297 
4298 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4299 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
4300 #if HWY_TARGET <= HWY_AVX3
4301  const RebindToSigned<decltype(d)> di;
4302  alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
4303  15, 14, 13, 12, 11, 10, 9, 8};
4304  const Vec128<int16_t, N> idx = Load(di, kReverse8);
4305  return BitCast(d, Vec128<int16_t, N>{
4306  _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4307 #else
4308  const RepartitionToWide<decltype(d)> dw;
4309  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
4310 #endif
4311 }
4312 
4313 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4314 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
4315  HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
4316 }
4317 
4318 // ------------------------------ InterleaveLower
4319 
4320 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
4321 // the least-significant lane) and "b". To concatenate two half-width integers
4322 // into one, use ZipLower/Upper instead (also works with scalar).
4323 
4324 template <size_t N, HWY_IF_LE128(uint8_t, N)>
4325 HWY_API Vec128<uint8_t, N> InterleaveLower(const Vec128<uint8_t, N> a,
4326  const Vec128<uint8_t, N> b) {
4327  return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
4328 }
4329 template <size_t N, HWY_IF_LE128(uint16_t, N)>
4330 HWY_API Vec128<uint16_t, N> InterleaveLower(const Vec128<uint16_t, N> a,
4331  const Vec128<uint16_t, N> b) {
4332  return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
4333 }
4334 template <size_t N, HWY_IF_LE128(uint32_t, N)>
4335 HWY_API Vec128<uint32_t, N> InterleaveLower(const Vec128<uint32_t, N> a,
4336  const Vec128<uint32_t, N> b) {
4337  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
4338 }
4339 template <size_t N, HWY_IF_LE128(uint64_t, N)>
4340 HWY_API Vec128<uint64_t, N> InterleaveLower(const Vec128<uint64_t, N> a,
4341  const Vec128<uint64_t, N> b) {
4342  return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
4343 }
4344 
4345 template <size_t N, HWY_IF_LE128(int8_t, N)>
4346 HWY_API Vec128<int8_t, N> InterleaveLower(const Vec128<int8_t, N> a,
4347  const Vec128<int8_t, N> b) {
4348  return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
4349 }
4350 template <size_t N, HWY_IF_LE128(int16_t, N)>
4351 HWY_API Vec128<int16_t, N> InterleaveLower(const Vec128<int16_t, N> a,
4352  const Vec128<int16_t, N> b) {
4353  return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
4354 }
4355 template <size_t N, HWY_IF_LE128(int32_t, N)>
4356 HWY_API Vec128<int32_t, N> InterleaveLower(const Vec128<int32_t, N> a,
4357  const Vec128<int32_t, N> b) {
4358  return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
4359 }
4360 template <size_t N, HWY_IF_LE128(int64_t, N)>
4361 HWY_API Vec128<int64_t, N> InterleaveLower(const Vec128<int64_t, N> a,
4362  const Vec128<int64_t, N> b) {
4363  return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
4364 }
4365 
4366 template <size_t N, HWY_IF_LE128(float, N)>
4367 HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
4368  const Vec128<float, N> b) {
4369  return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
4370 }
4371 template <size_t N, HWY_IF_LE128(double, N)>
4372 HWY_API Vec128<double, N> InterleaveLower(const Vec128<double, N> a,
4373  const Vec128<double, N> b) {
4374  return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
4375 }
4376 
4377 // Additional overload for the optional tag (also for 256/512).
4378 template <class V>
4379 HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
4380  return InterleaveLower(a, b);
4381 }
4382 
4383 // ------------------------------ InterleaveUpper (UpperHalf)
4384 
4385 // All functions inside detail lack the required D parameter.
4386 namespace detail {
4387 
4388 HWY_API Vec128<uint8_t> InterleaveUpper(const Vec128<uint8_t> a,
4389  const Vec128<uint8_t> b) {
4390  return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
4391 }
4392 HWY_API Vec128<uint16_t> InterleaveUpper(const Vec128<uint16_t> a,
4393  const Vec128<uint16_t> b) {
4394  return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
4395 }
4396 HWY_API Vec128<uint32_t> InterleaveUpper(const Vec128<uint32_t> a,
4397  const Vec128<uint32_t> b) {
4398  return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
4399 }
4400 HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4401  const Vec128<uint64_t> b) {
4402  return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
4403 }
4404 
4405 HWY_API Vec128<int8_t> InterleaveUpper(const Vec128<int8_t> a,
4406  const Vec128<int8_t> b) {
4407  return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
4408 }
4409 HWY_API Vec128<int16_t> InterleaveUpper(const Vec128<int16_t> a,
4410  const Vec128<int16_t> b) {
4411  return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
4412 }
4413 HWY_API Vec128<int32_t> InterleaveUpper(const Vec128<int32_t> a,
4414  const Vec128<int32_t> b) {
4415  return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
4416 }
4417 HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
4418  const Vec128<int64_t> b) {
4419  return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
4420 }
4421 
4422 HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
4423  const Vec128<float> b) {
4424  return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
4425 }
4426 HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
4427  const Vec128<double> b) {
4428  return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
4429 }
4430 
4431 } // namespace detail
4432 
4433 // Full
4434 template <typename T, class V = Vec128<T>>
4435 HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
4436  return detail::InterleaveUpper(a, b);
4437 }
4438 
4439 // Partial
4440 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
4441 HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
4442  const Half<decltype(d)> d2;
4443  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
4444 }
4445 
4446 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
4447 
4448 // Same as Interleave*, except that the return lanes are double-width integers;
4449 // this is necessary because the single-lane scalar cannot return two values.
4450 template <class V, class DW = RepartitionToWide<DFromV<V>>>
4451 HWY_API VFromD<DW> ZipLower(V a, V b) {
4452  return BitCast(DW(), InterleaveLower(a, b));
4453 }
4454 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4455 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4456  return BitCast(dw, InterleaveLower(D(), a, b));
4457 }
4458 
4459 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4460 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4461  return BitCast(dw, InterleaveUpper(D(), a, b));
4462 }
4463 
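// --- Illustrative usage sketch (not part of the original header) ---
// ZipLower interleaves the lower halves of two u8 vectors and reinterprets
// the result as u16: lane i holds lo[i] in its low byte and hi[i] in its high
// byte (0x8000, 0x8101, ... below). The function name is an assumption.
HWY_API Vec128<uint16_t> ZipLowerExample() {
  const Full128<uint8_t> d8;
  const Full128<uint16_t> d16;
  const Vec128<uint8_t> lo = Iota(d8, 0);     // 0, 1, ..., 15
  const Vec128<uint8_t> hi = Iota(d8, 0x80);  // 0x80, 0x81, ...
  return ZipLower(d16, lo, hi);
}
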
4464 // ================================================== COMBINE
4465 
4466 // ------------------------------ Combine (InterleaveLower)
4467 
4468 // N = N/2 + N/2 (upper half undefined)
4469 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4470 HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
4471  Vec128<T, N / 2> lo_half) {
4472  const Half<decltype(d)> d2;
4473  const RebindToUnsigned<decltype(d2)> du2;
4474  // Treat half-width input as one lane, and expand to two lanes.
4475  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
4476  const VU lo{BitCast(du2, lo_half).raw};
4477  const VU hi{BitCast(du2, hi_half).raw};
4478  return BitCast(d, InterleaveLower(lo, hi));
4479 }
4480 
4481 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
4482 
4483 template <typename T, HWY_IF_NOT_FLOAT(T)>
4484 HWY_API Vec128<T> ZeroExtendVector(Full128<T> /* tag */, Vec64<T> lo) {
4485  return Vec128<T>{_mm_move_epi64(lo.raw)};
4486 }
4487 
4488 template <typename T, HWY_IF_FLOAT(T)>
4489 HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
4490  const RebindToUnsigned<decltype(d)> du;
4491  return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
4492 }
4493 
4494 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4495 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
4496  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
4497 }
4498 
4499 // ------------------------------ Concat full (InterleaveLower)
4500 
4501 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
4502 template <typename T>
4503 HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4504  const Repartition<uint64_t, decltype(d)> d64;
4505  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
4506 }
4507 
4508 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
4509 template <typename T>
4510 HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4511  const Repartition<uint64_t, decltype(d)> d64;
4512  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
4513 }
4514 
4515 // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
4516 template <typename T>
4517 HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
4518  const Vec128<T> lo) {
4519  return CombineShiftRightBytes<8>(d, hi, lo);
4520 }
4521 
4522 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
4523 template <typename T>
4524 HWY_API Vec128<T> ConcatUpperLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4525  const Repartition<double, decltype(d)> dd;
4526 #if HWY_TARGET == HWY_SSSE3
4527  return BitCast(
4528  d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw,
4529  _MM_SHUFFLE2(1, 0))});
4530 #else
4531  // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle.
4532  return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
4533  BitCast(dd, lo).raw, 1)});
4534 #endif
4535 }
4536 HWY_API Vec128<float> ConcatUpperLower(Full128<float> d, Vec128<float> hi,
4537  Vec128<float> lo) {
4538 #if HWY_TARGET == HWY_SSSE3
4539  (void)d;
4540  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
4541 #else
4542  // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4543  const RepartitionToWide<decltype(d)> dd;
4544  return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
4545  BitCast(dd, lo).raw, 1)});
4546 #endif
4547 }
4548 HWY_API Vec128<double> ConcatUpperLower(Full128<double> /* tag */,
4549  Vec128<double> hi, Vec128<double> lo) {
4550 #if HWY_TARGET == HWY_SSSE3
4551  return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
4552 #else
4553  // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4554  return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)};
4555 #endif
4556 }
4557 
4558 // ------------------------------ Concat partial (Combine, LowerHalf)
4559 
4560 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4561 HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4562  Vec128<T, N> lo) {
4563  const Half<decltype(d)> d2;
4564  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
4565 }
4566 
4567 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4568 HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, Vec128<T, N> hi,
4569  Vec128<T, N> lo) {
4570  const Half<decltype(d)> d2;
4571  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
4572 }
4573 
4574 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4575 HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
4576  const Vec128<T, N> lo) {
4577  const Half<decltype(d)> d2;
4578  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
4579 }
4580 
4581 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4582 HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4583  Vec128<T, N> lo) {
4584  const Half<decltype(d)> d2;
4585  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
4586 }
4587 
4588 // ------------------------------ ConcatOdd
4589 
4590 // 8-bit full
4591 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4592 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4593  const Repartition<uint16_t, decltype(d)> dw;
4594  // Right-shift 8 bits per u16 so we can pack.
4595  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
4596  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
4597  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4598 }
4599 
4600 // 8-bit x8
4601 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4602 HWY_API Vec64<T> ConcatOdd(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4603  const Repartition<uint32_t, decltype(d)> du32;
4604  // Don't care about upper half, no need to zero.
4605  alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
4606  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8));
4607  const Vec64<T> L = TableLookupBytes(lo, shuf);
4608  const Vec64<T> H = TableLookupBytes(hi, shuf);
4609  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4610 }
4611 
4612 // 8-bit x4
4613 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4614 HWY_API Vec32<T> ConcatOdd(Full32<T> d, Vec32<T> hi, Vec32<T> lo) {
4615  const Repartition<uint16_t, decltype(d)> du16;
4616  // Don't care about upper half, no need to zero.
4617  alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
4618  const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8));
4619  const Vec32<T> L = TableLookupBytes(lo, shuf);
4620  const Vec32<T> H = TableLookupBytes(hi, shuf);
4621  return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
4622 }
4623 
4624 // 16-bit full
4625 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4626 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4627  const Repartition<uint32_t, decltype(d)> dw;
4628  // Right-shift 16 bits per u32 so we can pack.
4629  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
4630  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
4631  return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4632 }
4633 
4634 // 16-bit x4
4635 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4636 HWY_API Vec64<T> ConcatOdd(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4637  const Repartition<uint32_t, decltype(d)> du32;
4638  // Don't care about upper half, no need to zero.
4639  alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
4640  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16));
4641  const Vec64<T> L = TableLookupBytes(lo, shuf);
4642  const Vec64<T> H = TableLookupBytes(hi, shuf);
4643  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4644 }
4645 
4646 // 32-bit full
4647 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4648 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4649  const RebindToFloat<decltype(d)> df;
4650  return BitCast(
4651  d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4652  _MM_SHUFFLE(3, 1, 3, 1))});
4653 }
4654 template <size_t N>
4655 HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
4656  Vec128<float> lo) {
4657  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
4658 }
4659 
4660 // Any type x2
4661 template <typename T>
4662 HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4663  Vec128<T, 2> lo) {
4664  return InterleaveUpper(d, lo, hi);
4665 }
4666 
4667 // ------------------------------ ConcatEven (InterleaveLower)
4668 
4669 // 8-bit full
4670 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4671 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4672  const Repartition<uint16_t, decltype(d)> dw;
4673  // Isolate lower 8 bits per u16 so we can pack.
4674  const Vec128<uint16_t> mask = Set(dw, 0x00FF);
4675  const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
4676  const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
4677  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4678 }
4679 
4680 // 8-bit x8
4681 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4682 HWY_API Vec64<T> ConcatEven(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4683  const Repartition<uint32_t, decltype(d)> du32;
4684  // Don't care about upper half, no need to zero.
4685  alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
4686  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8));
4687  const Vec64<T> L = TableLookupBytes(lo, shuf);
4688  const Vec64<T> H = TableLookupBytes(hi, shuf);
4689  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4690 }
4691 
4692 // 8-bit x4
4693 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4694 HWY_API Vec32<T> ConcatEven(Full32<T> d, Vec32<T> hi, Vec32<T> lo) {
4695  const Repartition<uint16_t, decltype(d)> du16;
4696  // Don't care about upper half, no need to zero.
4697  alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
4698  const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8));
4699  const Vec32<T> L = TableLookupBytes(lo, shuf);
4700  const Vec32<T> H = TableLookupBytes(hi, shuf);
4701  return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
4702 }
4703 
4704 // 16-bit full
4705 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4706 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4707  const Repartition<uint32_t, decltype(d)> dw;
4708  // Isolate lower 16 bits per u32 so we can pack.
4709  const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
4710  const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
4711  const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
4712  return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4713 }
4714 
4715 // 16-bit x4
4716 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4717 HWY_API Vec64<T> ConcatEven(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4718  const Repartition<uint32_t, decltype(d)> du32;
4719  // Don't care about upper half, no need to zero.
4720  alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
4721  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16));
4722  const Vec64<T> L = TableLookupBytes(lo, shuf);
4723  const Vec64<T> H = TableLookupBytes(hi, shuf);
4724  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4725 }
4726 
4727 // 32-bit full
4728 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4729 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4730  const RebindToFloat<decltype(d)> df;
4731  return BitCast(
4732  d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4733  _MM_SHUFFLE(2, 0, 2, 0))});
4734 }
4735 HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
4736  Vec128<float> lo) {
4737  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
4738 }
4739 
4740 // Any T x2
4741 template <typename T>
4742 HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4743  Vec128<T, 2> lo) {
4744  return InterleaveLower(d, lo, hi);
4745 }
4746 
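// --- Illustrative usage sketch (not part of the original header) ---
// De-interleaves u32 lanes: ConcatEven keeps lanes 0 and 2 of each input
// (lower half from `lo`, upper half from `hi`); ConcatOdd would keep lanes 1
// and 3. Useful for splitting interleaved {even, odd} or {re, im} pairs. The
// function and parameter names are assumptions for this example.
HWY_API Vec128<uint32_t> ConcatEvenExample(Vec128<uint32_t> hi,
                                           Vec128<uint32_t> lo) {
  const Full128<uint32_t> d;
  return ConcatEven(d, hi, lo);  // lanes: lo0, lo2, hi0, hi2
}
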
4747 // ------------------------------ DupEven (InterleaveLower)
4748 
4749 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4750 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
4751  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4752 }
4753 template <size_t N>
4754 HWY_API Vec128<float, N> DupEven(Vec128<float, N> v) {
4755  return Vec128<float, N>{
4756  _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4757 }
4758 
4759 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4760 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4761  return InterleaveLower(DFromV<decltype(v)>(), v, v);
4762 }
4763 
4764 // ------------------------------ DupOdd (InterleaveUpper)
4765 
4766 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4767 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
4768  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4769 }
4770 template <size_t N>
4771 HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) {
4772  return Vec128<float, N>{
4773  _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4774 }
4775 
4776 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4777 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4778  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
4779 }
4780 
4781 // ------------------------------ OddEven (IfThenElse)
4782 
4783 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4784 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4785  const DFromV<decltype(a)> d;
4786  const Repartition<uint8_t, decltype(d)> d8;
4787  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
4788  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4789  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4790 }
4791 
4792 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4793 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4794 #if HWY_TARGET == HWY_SSSE3
4795  const DFromV<decltype(a)> d;
4796  const Repartition<uint8_t, decltype(d)> d8;
4797  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
4798  0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
4799  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4800 #else
4801  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
4802 #endif
4803 }
4804 
4805 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4806 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4807 #if HWY_TARGET == HWY_SSSE3
4808  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4809  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4810  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
4811 #else
4812  // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle.
4813  const DFromV<decltype(a)> d;
4814  const RebindToFloat<decltype(d)> df;
4815  return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,
4816  BitCast(df, b).raw, 5)});
4817 #endif
4818 }
4819 
4820 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4821 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4822  // Same as ConcatUpperLower for full vectors; do not call that because this
4823  // is more efficient for 64x1 vectors.
4824  const DFromV<decltype(a)> d;
4825  const RebindToFloat<decltype(d)> dd;
4826 #if HWY_TARGET == HWY_SSSE3
4827  return BitCast(
4828  d, Vec128<double, N>{_mm_shuffle_pd(
4829  BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))});
4830 #else
4831  // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4832  return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,
4833  BitCast(dd, b).raw, 1)});
4834 #endif
4835 }
4836 
4837 template <size_t N>
4838 HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
4839 #if HWY_TARGET == HWY_SSSE3
4840  // SHUFPS must fill the lower half of the output from one input, so we
4841  // need another shuffle. Unpack avoids another immediate byte.
4842  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4843  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4844  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
4845 #else
4846  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
4847 #endif
4848 }
4849 
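// --- Illustrative usage sketch (not part of the original header) ---
// OddEven keeps the odd lanes of `a` and the even lanes of `b` (lane 0 is
// even). The function name is an assumption for this example.
HWY_API Vec128<int32_t> OddEvenExample() {
  const Full128<int32_t> d;
  return OddEven(Set(d, 1), Set(d, 0));  // lanes: 0, 1, 0, 1
}
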
4850 // ------------------------------ OddEvenBlocks
4851 template <typename T, size_t N>
4852 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4853  return even;
4854 }
4855 
4856 // ------------------------------ SwapAdjacentBlocks
4857 
4858 template <typename T, size_t N>
4859 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4860  return v;
4861 }
4862 
4863 // ------------------------------ Shl (ZipLower, Mul)
4864 
4865 // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
4866 // two from loading float exponents, which is considerably faster (according
4867 // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
4868 
4869 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
4870 namespace detail {
4871 
4872 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
4873 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4874 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4875  const DFromV<decltype(v)> d;
4876  const RepartitionToWide<decltype(d)> dw;
4877  const Rebind<float, decltype(dw)> df;
4878  const auto zero = Zero(d);
4879  // Move into exponent (this u16 will become the upper half of an f32)
4880  const auto exp = ShiftLeft<23 - 16>(v);
4881  const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
4882  // Insert 0 into lower halves for reinterpreting as binary32.
4883  const auto f0 = ZipLower(dw, zero, upper);
4884  const auto f1 = ZipUpper(dw, zero, upper);
4885  // See comment below.
4886  const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
4887  const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
4888  return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
4889 }
4890 
4891 // Same, for 32-bit shifts.
4892 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4893 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4894  const DFromV<decltype(v)> d;
4895  const auto exp = ShiftLeft<23>(v);
4896  const auto f = exp + Set(d, 0x3F800000); // 1.0f
4897  // Do not use ConvertTo because we rely on the native 0x80..00 overflow
4898  // behavior. cvt instead of cvtt should be equivalent, but avoids test
4899  // failure under GCC 10.2.1.
4900  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
4901 }
4902 
4903 } // namespace detail
4904 #endif // HWY_TARGET > HWY_AVX3
4905 
4906 template <size_t N>
4907 HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
4908  const Vec128<uint16_t, N> bits) {
4909 #if HWY_TARGET <= HWY_AVX3
4910  return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
4911 #else
4912  return v * detail::Pow2(bits);
4913 #endif
4914 }
4915 HWY_API Vec128<uint16_t, 1> operator<<(const Vec128<uint16_t, 1> v,
4916  const Vec128<uint16_t, 1> bits) {
4917  return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
4918 }
4919 
4920 template <size_t N>
4921 HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
4922  const Vec128<uint32_t, N> bits) {
4923 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4924  return v * detail::Pow2(bits);
4925 #else
4926  return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
4927 #endif
4928 }
4929 HWY_API Vec128<uint32_t, 1> operator<<(const Vec128<uint32_t, 1> v,
4930  const Vec128<uint32_t, 1> bits) {
4931  return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
4932 }
4933 
4934 HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
4935  const Vec128<uint64_t> bits) {
4936 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4937  // Individual shifts and combine
4938  const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
4939  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4940  const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
4941  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
4942 #else
4943  return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
4944 #endif
4945 }
4946 HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
4947  const Vec64<uint64_t> bits) {
4948  return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
4949 }
4950 
4951 // Signed left shift is the same as unsigned.
4952 template <typename T, size_t N, HWY_IF_SIGNED(T)>
4953 HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
4954  const DFromV<decltype(v)> di;
4955  const RebindToUnsigned<decltype(di)> du;
4956  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
4957 }
4958 
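// --- Illustrative usage sketch (not part of the original header) ---
// Per-lane variable left shift. On AVX2/AVX3 this lowers to a single variable
// shift instruction; on SSSE3/SSE4 it uses the Pow2 multiplier trick above.
// The function name is an assumption for this example.
HWY_API Vec128<uint32_t> VariableShiftLeftExample() {
  const Full128<uint32_t> d;
  const Vec128<uint32_t> v = Set(d, 1);
  const Vec128<uint32_t> bits = Iota(d, 0);  // shift counts 0, 1, 2, 3
  return v << bits;                          // lanes: 1, 2, 4, 8
}
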
4959 // ------------------------------ Shr (mul, mask, BroadcastSignBit)
4960 
4961 // Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
4962 // widening multiplication by powers of two obtained by loading float exponents,
4963 // followed by a constant right-shift. This is still faster than a scalar or
4964 // bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
4965 
4966 template <size_t N>
4967 HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> in,
4968  const Vec128<uint16_t, N> bits) {
4969 #if HWY_TARGET <= HWY_AVX3
4970  return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
4971 #else
4972  const Simd<uint16_t, N, 0> d;
4973  // For bits=0, we cannot mul by 2^16, so fix the result later.
4974  const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
4975  // Replace output with input where bits == 0.
4976  return IfThenElse(bits == Zero(d), in, out);
4977 #endif
4978 }
4979 HWY_API Vec128<uint16_t, 1> operator>>(const Vec128<uint16_t, 1> in,
4980  const Vec128<uint16_t, 1> bits) {
4981  return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
4982 }
4983 
4984 template <size_t N>
4985 HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
4986  const Vec128<uint32_t, N> bits) {
4987 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4988  // 32x32 -> 64 bit mul, then shift right by 32.
4989  const Simd<uint32_t, N, 0> d32;
4990  // Move odd lanes into position for the second mul. Shuffle more gracefully
4991  // handles N=1 than repartitioning to u64 and shifting 32 bits right.
4992  const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
4993  // For bits=0, we cannot mul by 2^32, so fix the result later.
4994  const auto mul = detail::Pow2(Set(d32, 32) - bits);
4995  const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0
4996  const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
4997  // No need to shift right, already in the correct position.
4998  const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ?
4999  const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
5000  // Replace output with input where bits == 0.
5001  return IfThenElse(bits == Zero(d32), in, out);
5002 #else
5003  return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
5004 #endif
5005 }
5006 HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
5007  const Vec128<uint32_t, 1> bits) {
5008  return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
5009 }
5010 
5011 HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
5012  const Vec128<uint64_t> bits) {
5013 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5014  // Individual shifts and combine
5015  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
5016  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
5017  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
5018  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
5019 #else
5020  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
5021 #endif
5022 }
5023 HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
5024  const Vec64<uint64_t> bits) {
5025  return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
5026 }
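// Usage sketch (illustrative, not part of the header): each lane is shifted
// independently by its own count.
//   const Full128<uint32_t> d;
//   const auto v = Iota(d, 8);      // 8, 9, 10, 11
//   const auto bits = Iota(d, 0);   // shift counts 0, 1, 2, 3
//   const auto r = v >> bits;       // 8, 4, 2, 1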
5027 
5028 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
5029 namespace detail {
5030 
5031 // Also used in x86_256-inl.h.
5032 template <class DI, class V>
5033 HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
5034  const RebindToUnsigned<DI> du;
5035  const auto count = BitCast(du, count_i); // same type as value to shift
5036  // Clear sign and restore afterwards. This is preferable to shifting the MSB
5037  // downwards because Shr is somewhat more expensive than Shl.
5038  const auto sign = BroadcastSignBit(v);
5039  const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below
5040  return BitCast(di, abs >> count) ^ sign;
5041 }
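// Worked example (illustrative, int8 lanes, count=1): v = -5 = 0xFB gives
// sign = 0xFF, so v ^ sign = 0x04 (= ~v = -v - 1, the "off by one" above).
// The logical shift yields 0x02, and xor with sign restores 0xFD = -3, which
// matches the arithmetic (sign-extending) shift -5 >> 1.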
5042 
5043 } // namespace detail
5044 #endif // HWY_TARGET > HWY_AVX3
5045 
5046 template <size_t N>
5047 HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
5048  const Vec128<int16_t, N> bits) {
5049 #if HWY_TARGET <= HWY_AVX3
5050  return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
5051 #else
5052  return detail::SignedShr(Simd<int16_t, N, 0>(), v, bits);
5053 #endif
5054 }
5055 HWY_API Vec128<int16_t, 1> operator>>(const Vec128<int16_t, 1> v,
5056  const Vec128<int16_t, 1> bits) {
5057  return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
5058 }
5059 
5060 template <size_t N>
5061 HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
5062  const Vec128<int32_t, N> bits) {
5063 #if HWY_TARGET <= HWY_AVX3
5064  return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
5065 #else
5066  return detail::SignedShr(Simd<int32_t, N, 0>(), v, bits);
5067 #endif
5068 }
5069 HWY_API Vec128<int32_t, 1> operator>>(const Vec128<int32_t, 1> v,
5070  const Vec128<int32_t, 1> bits) {
5071  return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
5072 }
5073 
5074 template <size_t N>
5075 HWY_API Vec128<int64_t, N> operator>>(const Vec128<int64_t, N> v,
5076  const Vec128<int64_t, N> bits) {
5077 #if HWY_TARGET <= HWY_AVX3
5078  return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
5079 #else
5080  return detail::SignedShr(Simd<int64_t, N, 0>(), v, bits);
5081 #endif
5082 }
5083 
5084 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
5085 
5086 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
5087  const Vec128<uint64_t> b) {
5088  alignas(16) uint64_t mul[2];
5089  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
5090  return Load(Full128<uint64_t>(), mul);
5091 }
5092 
5093 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
5094  const Vec128<uint64_t> b) {
5095  alignas(16) uint64_t mul[2];
5096  const Half<Full128<uint64_t>> d2;
5097  mul[0] =
5098  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
5099  return Load(Full128<uint64_t>(), mul);
5100 }
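// Usage sketch (illustrative): MulEven returns the full 128-bit product of the
// lower u64 lanes (low half in lane 0, high half in lane 1); MulOdd does the
// same for the upper lanes.
//   const auto lo128 = MulEven(a, b);  // lane 1 : lane 0 = a[0] * b[0]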
5101 
5102 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5103 
5104 template <size_t N>
5105 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
5106  Vec128<bfloat16_t, 2 * N> a,
5107  Vec128<bfloat16_t, 2 * N> b,
5108  const Vec128<float, N> sum0,
5109  Vec128<float, N>& sum1) {
5110  // TODO(janwas): _mm_dpbf16_ps when available
5111  const Repartition<uint16_t, decltype(df32)> du16;
5112  const RebindToUnsigned<decltype(df32)> du32;
5113  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
5114  // Lane order within sum0/1 is undefined, hence we can avoid the
5115  // longer-latency lane-crossing PromoteTo.
5116  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
5117  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
5118  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
5119  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
5120  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
5121  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
5122 }
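// Usage sketch (illustrative): accumulate a bf16 dot product of vectors a and
// b, then combine the two partial sums (their lane order is unspecified) and
// reduce:
//   Vec128<float> sum0 = Zero(df32), sum1 = Zero(df32);
//   sum0 = ReorderWidenMulAccumulate(df32, a, b, sum0, sum1);
//   const float dot = GetLane(SumOfLanes(df32, Add(sum0, sum1)));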
5123 
5124 // ================================================== CONVERT
5125 
5126 // ------------------------------ Promotions (part w/ narrow lanes -> full)
5127 
5128 // Unsigned: zero-extend.
5129 template <size_t N>
5130 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
5131  const Vec128<uint8_t, N> v) {
5132 #if HWY_TARGET == HWY_SSSE3
5133  const __m128i zero = _mm_setzero_si128();
5134  return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
5135 #else
5136  return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
5137 #endif
5138 }
5139 template <size_t N>
5140 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
5141  const Vec128<uint16_t, N> v) {
5142 #if HWY_TARGET == HWY_SSSE3
5143  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
5144 #else
5145  return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
5146 #endif
5147 }
5148 template <size_t N>
5149 HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
5150  const Vec128<uint32_t, N> v) {
5151 #if HWY_TARGET == HWY_SSSE3
5152  return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
5153 #else
5154  return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
5155 #endif
5156 }
5157 template <size_t N>
5158 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
5159  const Vec128<uint8_t, N> v) {
5160 #if HWY_TARGET == HWY_SSSE3
5161  const __m128i zero = _mm_setzero_si128();
5162  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
5163  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
5164 #else
5165  return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
5166 #endif
5167 }
5168 
5169 // Unsigned to signed: same plus cast.
5170 template <size_t N>
5171 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> di,
5172  const Vec128<uint8_t, N> v) {
5173  return BitCast(di, PromoteTo(Simd<uint16_t, N, 0>(), v));
5174 }
5175 template <size_t N>
5176 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
5177  const Vec128<uint16_t, N> v) {
5178  return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
5179 }
5180 template <size_t N>
5181 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
5182  const Vec128<uint8_t, N> v) {
5183  return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
5184 }
5185 
5186 // Signed: replicate sign bit.
5187 template <size_t N>
5188 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
5189  const Vec128<int8_t, N> v) {
5190 #if HWY_TARGET == HWY_SSSE3
5191  return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
5192 #else
5193  return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
5194 #endif
5195 }
5196 template <size_t N>
5197 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
5198  const Vec128<int16_t, N> v) {
5199 #if HWY_TARGET == HWY_SSSE3
5200  return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
5201 #else
5202  return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
5203 #endif
5204 }
5205 template <size_t N>
5206 HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
5207  const Vec128<int32_t, N> v) {
5208 #if HWY_TARGET == HWY_SSSE3
5209  return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
5210 #else
5211  return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
5212 #endif
5213 }
5214 template <size_t N>
5215 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
5216  const Vec128<int8_t, N> v) {
5217 #if HWY_TARGET == HWY_SSSE3
5218  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
5219  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
5220  return ShiftRight<24>(Vec128<int32_t, N>{x4});
5221 #else
5222  return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
5223 #endif
5224 }
5225 
5226 // Workaround for origin tracking bug in Clang msan prior to 11.0
5227 // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
5228 #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
5229 #define HWY_INLINE_F16 HWY_NOINLINE
5230 #else
5231 #define HWY_INLINE_F16 HWY_INLINE
5232 #endif
5233 template <size_t N>
5234 HWY_INLINE_F16 Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
5235  const Vec128<float16_t, N> v) {
5236 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5237  const RebindToSigned<decltype(df32)> di32;
5238  const RebindToUnsigned<decltype(df32)> du32;
5239  // Expand to u32 so we can shift.
5240  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
5241  const auto sign = ShiftRight<15>(bits16);
5242  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
5243  const auto mantissa = bits16 & Set(du32, 0x3FF);
5244  const auto subnormal =
5245  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
5246  Set(df32, 1.0f / 16384 / 1024));
5247 
5248  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
5249  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
5250  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
5251  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
5252  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
5253 #else
5254  (void)df32;
5255  return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
5256 #endif
5257 }
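// Worked example (illustrative): f16 has 1 sign, 5 exponent (bias 15) and 10
// mantissa bits. For 0x3C00 (1.0f16): sign=0, biased_exp=15, mantissa=0, so
// biased_exp32 = 15 + (127 - 15) = 127 and the result is 127 << 23 =
// 0x3F800000, i.e. 1.0f. Subnormal inputs (biased_exp == 0) instead take the
// float path above, scaling the mantissa by 2^-24 = 1.0f / 16384 / 1024.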
5258 
5259 template <size_t N>
5260 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
5261  const Vec128<bfloat16_t, N> v) {
5262  const Rebind<uint16_t, decltype(df32)> du16;
5263  const RebindToSigned<decltype(df32)> di32;
5264  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
5265 }
5266 
5267 template <size_t N>
5268 HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
5269  const Vec128<float, N> v) {
5270  return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
5271 }
5272 
5273 template <size_t N>
5274 HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
5275  const Vec128<int32_t, N> v) {
5276  return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
5277 }
5278 
5279 // ------------------------------ Demotions (full -> part w/ narrow lanes)
5280 
5281 template <size_t N>
5282 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
5283  const Vec128<int32_t, N> v) {
5284 #if HWY_TARGET == HWY_SSSE3
5285  const Simd<int32_t, N, 0> di32;
5286  const Simd<uint16_t, N * 2, 0> du16;
5287  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
5288  const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
5289  const auto clamped = Or(zero_if_neg, too_big);
5290  // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
5291  alignas(16) constexpr uint16_t kLower2Bytes[16] = {
5292  0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
5293  const auto lo2 = Load(du16, kLower2Bytes);
5294  return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
5295 #else
5296  return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
5297 #endif
5298 }
5299 
5300 template <size_t N>
5301 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
5302  const Vec128<int32_t, N> v) {
5303  return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
5304 }
5305 
5306 template <size_t N>
5307 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
5308  const Vec128<int32_t, N> v) {
5309  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5310  return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
5311 }
5312 
5313 template <size_t N>
5314 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
5315  const Vec128<int16_t, N> v) {
5316  return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
5317 }
5318 
5319 template <size_t N>
5320 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
5321  const Vec128<int32_t, N> v) {
5322  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5323  return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
5324 }
5325 
5326 template <size_t N>
5327 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
5328  const Vec128<int16_t, N> v) {
5329  return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
5330 }
5331 
5332 // Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate).
5333 // clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain.
5334 HWY_DIAGNOSTICS(push)
5335 HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
5336 
5337 template <size_t N>
5338 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
5339  const Vec128<float, N> v) {
5340 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5341  const RebindToUnsigned<decltype(df16)> du16;
5342  const Rebind<uint32_t, decltype(df16)> du;
5343  const RebindToSigned<decltype(du)> di;
5344  const auto bits32 = BitCast(du, v);
5345  const auto sign = ShiftRight<31>(bits32);
5346  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
5347  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
5348 
5349  const auto k15 = Set(di, 15);
5350  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
5351  const auto is_tiny = exp < Set(di, -24);
5352 
5353  const auto is_subnormal = exp < Set(di, -14);
5354  const auto biased_exp16 =
5355  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
5356  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
5357  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
5358  (mantissa32 >> (Set(du, 13) + sub_exp));
5359  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
5360  ShiftRight<13>(mantissa32)); // <1024
5361 
5362  const auto sign16 = ShiftLeft<15>(sign);
5363  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
5364  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
5365  return BitCast(df16, DemoteTo(du16, bits16));
5366 #else
5367  (void)df16;
5368  return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
5369 #endif
5370 }
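// Worked example (illustrative): v = 1.0f has biased_exp32 = 127 and zero
// mantissa, so exp = 0, biased_exp16 = 15 and the result is 15 << 10 = 0x3C00,
// the f16 encoding of 1.0. Inputs with magnitude below 2^-24 (is_tiny) flush
// to zero.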
5371 
5372 HWY_DIAGNOSTICS(pop)
5373 
5374 template <size_t N>
5375 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
5376  const Vec128<float, N> v) {
5377  // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
5378  const Rebind<int32_t, decltype(dbf16)> di32;
5379  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
5380  const Rebind<uint16_t, decltype(dbf16)> du16;
5381  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
5382  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
5383 }
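// Note (illustrative): bfloat16 is just the upper 16 bits of the binary32
// encoding, so 1.0f = 0x3F800000 demotes to 0x3F80; the low mantissa bits are
// truncated rather than rounded to nearest.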
5384 
5385 template <size_t N>
5386 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
5387  Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
5388  // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
5389  const RebindToUnsigned<decltype(dbf16)> du16;
5390  const Repartition<uint32_t, decltype(dbf16)> du32;
5391  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
5392  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
5393 }
5394 
5395 template <size_t N>
5396 HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
5397  const Vec128<double, N> v) {
5398  return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
5399 }
5400 
5401 namespace detail {
5402 
5403 // For well-defined float->int demotion in all x86_*-inl.h.
5404 
5405 template <size_t N>
5406 HWY_INLINE auto ClampF64ToI32Max(Simd<double, N, 0> d, decltype(Zero(d)) v)
5407  -> decltype(Zero(d)) {
5408  // The max can be exactly represented in binary64, so clamping beforehand
5409  // prevents x86 conversion from raising an exception and returning 80..00.
5410  return Min(v, Set(d, 2147483647.0));
5411 }
5412 
5413 // For ConvertTo float->int of same size, clamping before conversion would
5414 // change the result because the max integer value is not exactly representable.
5415 // Instead detect the overflow result after conversion and fix it.
5416 template <class DI, class DF = RebindToFloat<DI>>
5417 HWY_INLINE auto FixConversionOverflow(DI di, VFromD<DF> original,
5418  decltype(Zero(di).raw) converted_raw)
5419  -> VFromD<DI> {
5420  // Combinations of original and output sign:
5421  // --: normal <0 or -huge_val to 80..00: OK
5422  // -+: -0 to 0 : OK
5423  // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
5424  // ++: normal >0 : OK
5425  const auto converted = VFromD<DI>{converted_raw};
5426  const auto sign_wrong = AndNot(BitCast(di, original), converted);
5427 #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
5428  // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
5429  // Add() if using that instead. Work around with one more instruction.
5430  const RebindToUnsigned<DI> du;
5431  const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
5432  const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
5433  return IfVecThenElse(mask, max, converted);
5434 #else
5435  return Xor(converted, BroadcastSignBit(sign_wrong));
5436 #endif
5437 }
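// Worked example (illustrative, i32): converting 3e9f overflows, so the CPU
// returns 80..00. The original sign bit is 0 and the converted sign bit is 1,
// hence sign_wrong has its MSB set; xor with its broadcast sign (FF..FF)
// turns 0x80000000 into 0x7FFFFFFF. In-range and negative inputs leave
// sign_wrong zero and pass through unchanged.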
5438 
5439 } // namespace detail
5440 
5441 template <size_t N>
5442 HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
5443  const Vec128<double, N> v) {
5444  const auto clamped = detail::ClampF64ToI32Max(Simd<double, N, 0>(), v);
5445  return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
5446 }
5447 
5448 // For already range-limited input [0, 255].
5449 template <size_t N>
5450 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
5451  const Simd<uint32_t, N, 0> d32;
5452  const Simd<uint8_t, N * 4, 0> d8;
5453  alignas(16) static constexpr uint32_t k8From32[4] = {
5454  0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
5455  // Also replicate bytes into all 32 bit lanes for safety.
5456  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
5457  return LowerHalf(LowerHalf(BitCast(d8, quad)));
5458 }
5459 
5460 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
5461 
5462 template <size_t N>
5463 HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
5464  const Vec128<int32_t, N> v) {
5465  return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
5466 }
5467 
5468 template <size_t N>
5469 HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
5470  const Vec128<int64_t, N> v) {
5471 #if HWY_TARGET <= HWY_AVX3
5472  (void)dd;
5473  return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
5474 #else
5475  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
5476  const Repartition<uint32_t, decltype(dd)> d32;
5477  const Repartition<uint64_t, decltype(dd)> d64;
5478 
5479  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
5480  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
5481  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
5482 
5483  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
5484  const auto k52 = Set(d32, 0x43300000);
5485  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
5486 
5487  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
5488  return (v_upper - k84_63_52) + v_lower; // order matters!
5489 #endif
5490 }
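// How the non-AVX3 path works (sketch): the upper 32 bits of v land in the
// mantissa of a double with exponent 2^84 (toggling their MSB adds a 2^63
// offset that handles the sign), and the lower 32 bits in one with exponent
// 2^52. Subtracting the combined bias 2^84 + 2^63 + 2^52 (= k84_63_52) and
// then adding the low part reconstructs the value with one final rounding.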
5491 
5492 // Truncates (rounds toward zero).
5493 template <size_t N>
5494 HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
5495  const Vec128<float, N> v) {
5496  return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
5497 }
5498 
5499 // Full (partial handled below)
5500 HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> di, const Vec128<double> v) {
5501 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
5502  return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
5503 #elif HWY_ARCH_X86_64
5504  const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
5505  const Half<Full128<double>> dd2;
5506  const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
5507  return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
5508 #else
5509  using VI = VFromD<decltype(di)>;
5510  const VI k0 = Zero(di);
5511  const VI k1 = Set(di, 1);
5512  const VI k51 = Set(di, 51);
5513 
5514  // Exponent indicates whether the number can be represented as int64_t.
5515  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
5516  const VI exp = biased_exp - Set(di, 0x3FF);
5517  const auto in_range = exp < Set(di, 63);
5518 
5519  // If we were to cap the exponent at 51 and add 2^52, the number would be in
5520  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
5521  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
5522  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
5523  // manually shift the mantissa into place (we already have many of the
5524  // inputs anyway).
5525  const VI shift_mnt = Max(k51 - exp, k0);
5526  const VI shift_int = Max(exp - k51, k0);
5527  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
5528  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
5529  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
5530  // For inputs larger than 2^52, insert zeros at the bottom.
5531  const VI shifted = int52 << shift_int;
5532  // Restore the one bit lost when shifting in the implicit 1-bit.
5533  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
5534 
5535  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
5536  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
5537  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
5538  const VI magnitude = IfThenElse(in_range, restored, limit);
5539 
5540  // If the input was negative, negate the integer (two's complement).
5541  return (magnitude ^ sign_mask) - sign_mask;
5542 #endif
5543 }
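// Worked example for the portable path (illustrative): v = 3.0 has biased_exp
// 0x400, so exp = 1, shift_mnt = 50 and shift_int = 0. Or-ing the implicit
// bit into the mantissa (2^51) and shifting right by shift_mnt + 1 = 51 gives
// 3, i.e. trunc(3.0). Inputs with exp >= 63 saturate to LimitsMin/LimitsMax.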
5544 HWY_API Vec64<int64_t> ConvertTo(Simd<int64_t, 1, 0> di, const Vec64<double> v) {
5545  // Only need to specialize for non-AVX3, 64-bit (single scalar op)
5546 #if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
5547  const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
5548  return detail::FixConversionOverflow(di, v, i0.raw);
5549 #else
5550  (void)di;
5551  const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
5552  return Vec64<int64_t>{full.raw};
5553 #endif
5554 }
5555 
5556 template <size_t N>
5557 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
5558  const Simd<int32_t, N, 0> di;
5559  return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
5560 }
5561 
5562 // ------------------------------ Floating-point rounding (ConvertTo)
5563 
5564 #if HWY_TARGET == HWY_SSSE3
5565 
5566 // Toward nearest integer, ties to even
5567 template <typename T, size_t N, HWY_IF_FLOAT(T)>
5568 HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
5569  // Rely on rounding after addition with a large value such that no mantissa
5570  // bits remain (assuming the current mode is nearest-even). We may need a
5571  // compiler flag for precise floating-point to prevent "optimizing" this out.
5572  const Simd<T, N, 0> df;
5573  const auto max = Set(df, MantissaEnd<T>());
5574  const auto large = CopySignToAbs(max, v);
5575  const auto added = large + v;
5576  const auto rounded = added - large;
5577  // Keep original if NaN or the magnitude is large (already an int).
5578  return IfThenElse(Abs(v) < max, rounded, v);
5579 }
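// Worked example (illustrative, f32): MantissaEnd is 2^23 = 8388608. For
// v = 2.5f, large + v = 8388610.5 rounds (ties to even) to 8388610 because
// values of that magnitude have no fractional bits; subtracting large again
// yields 2.0f. Inputs with |v| >= 2^23 are already integral and returned as-is.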
5580 
5581 namespace detail {
5582 
5583 // Truncating to integer and converting back to float is correct except when the
5584 // input magnitude is large, in which case the input was already an integer
5585 // (because mantissa >> exponent is zero).
5586 template <typename T, size_t N, HWY_IF_FLOAT(T)>
5587 HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
5588  return Abs(v) < Set(Simd<T, N, 0>(), MantissaEnd<T>());
5589 }
5590 
5591 } // namespace detail
5592 
5593 // Toward zero, aka truncate
5594 template <typename T, size_t N, HWY_IF_FLOAT(T)>
5595 HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
5596  const Simd<T, N, 0> df;
5597  const RebindToSigned<decltype(df)> di;
5598 
5599  const auto integer = ConvertTo(di, v); // round toward 0
5600  const auto int_f = ConvertTo(df, integer);
5601 
5602  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
5603 }
5604 
5605 // Toward +infinity, aka ceiling
5606 template <typename T, size_t N, HWY_IF_FLOAT(T)>
5607 HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
5608  const Simd<T, N, 0> df;
5609  const RebindToSigned<decltype(df)> di;
5610 
5611  const auto integer = ConvertTo(di, v); // round toward 0
5612  const auto int_f = ConvertTo(df, integer);
5613 
5614  // Truncating a positive non-integer ends up smaller; if so, add 1.
5615  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
5616 
5617  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
5618 }
5619 
5620 // Toward -infinity, aka floor
5621 template <typename T, size_t N, HWY_IF_FLOAT(T)>
5622 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
5623  const Simd<T, N, 0> df;
5624  const RebindToSigned<decltype(df)> di;
5625 
5626  const auto integer = ConvertTo(di, v); // round toward 0
5627  const auto int_f = ConvertTo(df, integer);
5628 
5629  // Truncating a negative non-integer ends up larger; if so, subtract 1.
5630  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
5631 
5632  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
5633 }
5634 
5635 #else
5636 
5637 // Toward nearest integer, ties to even
5638 template <size_t N>
5639 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
5640  return Vec128<float, N>{
5641  _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5642 }
5643 template <size_t N>
5644 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
5645  return Vec128<double, N>{
5646  _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5647 }
5648 
5649 // Toward zero, aka truncate
5650 template <size_t N>
5651 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
5652  return Vec128<float, N>{
5653  _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5654 }
5655 template <size_t N>
5656 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
5657  return Vec128<double, N>{
5658  _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5659 }
5660 
5661 // Toward +infinity, aka ceiling
5662 template <size_t N>
5663 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
5664  return Vec128<float, N>{
5665  _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5666 }
5667 template <size_t N>
5668 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
5669  return Vec128<double, N>{
5670  _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5671 }
5672 
5673 // Toward -infinity, aka floor
5674 template <size_t N>
5675 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
5676  return Vec128<float, N>{
5677  _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5678 }
5679 template <size_t N>
5680 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
5681  return Vec128<double, N>{
5682  _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5683 }
5684 
5685 #endif // !HWY_SSSE3
5686 
5687 // ------------------------------ Floating-point classification
5688 
5689 template <size_t N>
5690 HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
5691 #if HWY_TARGET <= HWY_AVX3
5692  return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x81)};
5693 #else
5694  return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)};
5695 #endif
5696 }
5697 template <size_t N>
5698 HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
5699 #if HWY_TARGET <= HWY_AVX3
5700  return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x81)};
5701 #else
5702  return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)};
5703 #endif
5704 }
5705 
5706 #if HWY_TARGET <= HWY_AVX3
5707 
5708 template <size_t N>
5709 HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
5710  return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x18)};
5711 }
5712 template <size_t N>
5713 HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) {
5714  return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x18)};
5715 }
5716 
5717 // Returns whether normal/subnormal/zero.
5718 template <size_t N>
5719 HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) {
5720  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
5721  // and negate the mask.
5722  return Not(Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x99)});
5723 }
5724 template <size_t N>
5725 HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
5726  return Not(Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x99)});
5727 }
5728 
5729 #else
5730 
5731 template <typename T, size_t N, HWY_IF_FLOAT(T)>
5732 HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
5733  const Simd<T, N, 0> d;
5734  const RebindToSigned<decltype(d)> di;
5735  const VFromD<decltype(di)> vi = BitCast(di, v);
5736  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
5737  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
5738 }
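// Worked example (illustrative, f32): +inf is 0x7F800000; vi + vi shifts out
// the sign bit, so both +inf and -inf become 0xFF000000 = MaxExponentTimes2,
// whereas NaNs keep nonzero mantissa bits and compare not-equal.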
5739 
5740 // Returns whether normal/subnormal/zero.
5741 template <typename T, size_t N, HWY_IF_FLOAT(T)>
5742 HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
5743  const Simd<T, N, 0> d;
5744  const RebindToUnsigned<decltype(d)> du;
5745  const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
5746  const VFromD<decltype(du)> vu = BitCast(du, v);
5747  // Shift left to clear the sign bit, then right so we can compare with the
5748  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
5749  // negative and non-negative floats would be greater). MSVC seems to generate
5750  // incorrect code if we instead add vu + vu.
5751  const VFromD<decltype(di)> exp =
5752  BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
5753  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
5754 }
5755 
5756 #endif // HWY_TARGET <= HWY_AVX3
5757 
5758 // ================================================== CRYPTO
5759 
5760 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5761 
5762 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
5763 #ifdef HWY_NATIVE_AES
5764 #undef HWY_NATIVE_AES
5765 #else
5766 #define HWY_NATIVE_AES
5767 #endif
5768 
5769 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
5770  Vec128<uint8_t> round_key) {
5771  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
5772 }
5773 
5774 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
5775  Vec128<uint8_t> round_key) {
5776  return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
5777 }
5778 
5779 template <size_t N, HWY_IF_LE128(uint64_t, N)>
5780 HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
5781  Vec128<uint64_t, N> b) {
5782  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
5783 }
5784 
5785 template <size_t N, HWY_IF_LE128(uint64_t, N)>
5786 HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
5787  Vec128<uint64_t, N> b) {
5788  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
5789 }
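// Usage note (illustrative): CLMulLower multiplies the lower 64-bit halves of
// each 128-bit block as GF(2) polynomials (xor instead of carries), producing
// the full 128-bit product; CLMulUpper does the same for the upper halves.
// These are the building blocks of CRC and GHASH-style reductions.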
5790 
5791 #endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5792 
5793 // ================================================== MISC
5794 
5795 template <typename T>
5796 struct CompressIsPartition {
5797 #if HWY_TARGET <= HWY_AVX3
5798  // AVX3 supports native compress, but a table-based approach allows
5799  // 'partitioning' (also moving mask=false lanes to the top), which helps
5800  // vqsort. This is only feasible for eight or fewer lanes, i.e. sizeof(T) == 8
5801  // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
5802  // u32x8 etc.).
5803  enum { value = (sizeof(T) == 8) };
5804 #else
5805  enum { value = 1 };
5806 #endif
5807 };
5808 
5809 #if HWY_TARGET <= HWY_AVX3
5810 
5811 // ------------------------------ LoadMaskBits
5812 
5813 // `p` points to at least 8 readable bytes, not all of which need be valid.
5814 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5815 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
5816  const uint8_t* HWY_RESTRICT bits) {
5817  uint64_t mask_bits = 0;
5818  constexpr size_t kNumBytes = (N + 7) / 8;
5819  CopyBytes<kNumBytes>(bits, &mask_bits);
5820  if (N < 8) {
5821  mask_bits &= (1ull << N) - 1;
5822  }
5823 
5824  return Mask128<T, N>::FromBits(mask_bits);
5825 }
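// Usage sketch (illustrative): bits are packed LSB-first, one bit per lane.
//   const Full128<int32_t> d;
//   alignas(8) const uint8_t packed[8] = {0b0101};  // select lanes 0 and 2
//   const auto m = LoadMaskBits(d, packed);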
5826 
5827 // ------------------------------ StoreMaskBits
5828 
5829 // `p` points to at least 8 writable bytes.
5830 template <typename T, size_t N>
5831 HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
5832  const Mask128<T, N> mask, uint8_t* bits) {
5833  constexpr size_t kNumBytes = (N + 7) / 8;
5834  CopyBytes<kNumBytes>(&mask.raw, bits);
5835 
5836  // Non-full byte, need to clear the undefined upper bits.
5837  if (N < 8) {
5838  const int mask = (1 << N) - 1;
5839  bits[0] = static_cast<uint8_t>(bits[0] & mask);
5840  }
5841 
5842  return kNumBytes;
5843 }
5844 
5845 // ------------------------------ Mask testing
5846 
5847 // Beware: the suffix indicates the number of mask bits, not lane size!
5848 
5849 template <typename T, size_t N>
5850 HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
5851  const Mask128<T, N> mask) {
5852  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5853  return PopCount(mask_bits);
5854 }
5855 
5856 template <typename T, size_t N>
5857 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
5858  const Mask128<T, N> mask) {
5859  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
5860  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
5861 }
5862 
5863 template <typename T, size_t N>
5864 HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5865  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5866  return mask_bits == 0;
5867 }
5868 
5869 template <typename T, size_t N>
5870 HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5871  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5872  // Cannot use _kortestc because we may have less than 8 mask bits.
5873  return mask_bits == (1u << N) - 1;
5874 }
5875 
5876 // ------------------------------ Compress
5877 
5878 #if HWY_TARGET != HWY_AVX3_DL
5879 namespace detail {
5880 
5881 // Returns _mm_permutexvar_epi16 indices for 16-bit Compress. Also used by x86_256.
5882 HWY_INLINE Vec128<uint16_t> IndicesForCompress16(uint64_t mask_bits) {
5883  Full128<uint16_t> du16;
5884  // Table of u16 indices packed into bytes to reduce L1 usage; unpacked to
5885  // u16 on load. Ideally we would pack eight 3-bit indices (under half of the
5886  // 8 bytes used per row) and varshift per lane, but 24 bits exceed a u16 lane.
5887  Rebind<uint8_t, decltype(du16)> du8;
5888  alignas(16) constexpr uint8_t tbl[2048] = {
5889  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
5890  1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
5891  0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
5892  0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
5893  0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
5894  0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
5895  0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
5896  0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
5897  0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
5898  3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
5899  2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
5900  0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
5901  0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
5902  0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
5903  0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
5904  0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
5905  1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
5906  2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
5907  5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
5908  4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
5909  5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
5910  0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
5911  0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
5912  0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
5913  0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
5914  2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
5915  6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
5916  0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
5917  6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
5918  0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
5919  0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
5920  0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
5921  2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
5922  1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
5923  5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
5924  5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
5925  0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
5926  0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
5927  0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
5928  0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
5929  0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
5930  0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
5931  7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
5932  0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
5933  0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
5934  0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
5935  0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
5936  0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
5937  1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
5938  3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
5939  4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
5940  3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
5941  0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
5942  0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
5943  0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
5944  0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
5945  0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
5946  4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
5947  4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
5948  7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
5949  5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
5950  7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
5951  0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
5952  0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
5953  3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
5954  1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
5955  3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
5956  7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
5957  0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
5958  7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
5959  0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
5960  0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
5961  0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
5962  5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
5963  2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
5964  6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
5965  6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
5966  0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
5967  0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
5968  0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
5969  1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
5970  2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
5971  return PromoteTo(du16, Load(du8, tbl + mask_bits * 8));
5972 }
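// Example (illustrative): mask_bits = 0b100110 selects lanes 1, 2 and 5, so
// the 8-byte row at tbl + 0b100110 * 8 holds {1, 2, 5, 0, ...}; PromoteTo
// widens these bytes to the u16 lane indices consumed by
// _mm_permutexvar_epi16 in Compress below.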
5973 
5974 } // namespace detail
5975 #endif // HWY_TARGET != HWY_AVX3_DL
5976 
5977 // Single lane: no-op
5978 template <typename T>
5979 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
5980  return v;
5981 }
5982 
5983 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5984 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5985  const Simd<T, N, 0> d;
5986  const Rebind<uint16_t, decltype(d)> du;
5987  const auto vu = BitCast(du, v); // (required for float16_t inputs)
5988 
5989 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
5990  const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
5991 #else
5992  const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
5993  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
5994 #endif // HWY_TARGET != HWY_AVX3_DL
5995  return BitCast(d, cu);
5996 }
5997 
5998 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5999 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6000  return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
6001 }
6002 
6003 template <size_t N, HWY_IF_GE64(float, N)>
6004 HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
6005  return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
6006 }
6007 
6008 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6009 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
6010  HWY_DASSERT(mask.raw < 4);
6011 
6012  // There are only 2 lanes, so we can afford to load the index vector directly.
6013  alignas(16) constexpr uint8_t u8_indices[64] = {
6014  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6015  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6016  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6017  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6018 
6019  const Full128<T> d;
6020  const Repartition<uint8_t, decltype(d)> d8;
6021  const auto index = Load(d8, u8_indices + 16 * mask.raw);
6022  return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
6023 }
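// Example (illustrative): mask.raw is the 2-bit lane mask, so rows 0..3 of
// u8_indices keep {none}, {lane 0}, {lane 1} and {both} in the low lanes;
// e.g. mask.raw == 2 selects the row that moves lane 1 down to lane 0.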
6024 
6025 // ------------------------------ CompressNot (Compress)
6026 
6027 // Single lane: no-op
6028 template <typename T>
6029 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6030  return v;
6031 }
6032 
6033 template <typename T, size_t N>
6034 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6035  return Compress(v, Not(mask));
6036 }
6037 
6038 // ------------------------------ CompressBlocksNot
6039 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6040  Mask128<uint64_t> /* m */) {
6041  return v;
6042 }
6043 
6044 // ------------------------------ CompressBits (LoadMaskBits)
6045 
6046 template <typename T, size_t N>
6047 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6048  const uint8_t* HWY_RESTRICT bits) {
6049  return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
6050 }
6051 
6052 // ------------------------------ CompressStore
6053 
6054 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6055 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6056  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6057  const Rebind<uint16_t, decltype(d)> du;
6058  const auto vu = BitCast(du, v); // (required for float16_t inputs)
6059 
6060  const uint64_t mask_bits{mask.raw};
6061 
6062 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
6063  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
6064 #else
6065  const auto idx = detail::IndicesForCompress16(mask_bits);
6066  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
6067  StoreU(BitCast(d, cu), d, unaligned);
6068 #endif // HWY_TARGET == HWY_AVX3_DL
6069 
6070  const size_t count = PopCount(mask_bits & ((1ull << N) - 1));
6071  // Workaround for MSAN not marking output as initialized (b/233326619)
6072 #if HWY_IS_MSAN
6073  __msan_unpoison(unaligned, count * sizeof(T));
6074 #endif
6075  return count;
6076 }
6077 
6078 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6079 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6080  Simd<T, N, 0> /* tag */,
6081  T* HWY_RESTRICT unaligned) {
6082  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6083  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6084  // Workaround for MSAN not marking output as initialized (b/233326619)
6085 #if HWY_IS_MSAN
6086  __msan_unpoison(unaligned, count * sizeof(T));
6087 #endif
6088  return count;
6089 }
6090 
6091 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6092 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6093  Simd<T, N, 0> /* tag */,
6094  T* HWY_RESTRICT unaligned) {
6095  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6096  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6097  // Workaround for MSAN not marking output as initialized (b/233326619)
6098 #if HWY_IS_MSAN
6099  __msan_unpoison(unaligned, count * sizeof(T));
6100 #endif
6101  return count;
6102 }
6103 
6104 template <size_t N, HWY_IF_LE128(float, N)>
6105 HWY_API size_t CompressStore(Vec128<float, N> v, Mask128<float, N> mask,
6106  Simd<float, N, 0> /* tag */,
6107  float* HWY_RESTRICT unaligned) {
6108  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6109  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6110  // Workaround for MSAN not marking output as initialized (b/233326619)
6111 #if HWY_IS_MSAN
6112  __msan_unpoison(unaligned, count * sizeof(float));
6113 #endif
6114  return count;
6115 }
6116 
6117 template <size_t N, HWY_IF_LE128(double, N)>
6118 HWY_API size_t CompressStore(Vec128<double, N> v, Mask128<double, N> mask,
6119  Simd<double, N, 0> /* tag */,
6120  double* HWY_RESTRICT unaligned) {
6121  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6122  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6123  // Workaround for MSAN not marking output as initialized (b/233326619)
6124 #if HWY_IS_MSAN
6125  __msan_unpoison(unaligned, count * sizeof(double));
6126 #endif
6127  return count;
6128 }
6129 
6130 // ------------------------------ CompressBlendedStore (CompressStore)
6131 template <typename T, size_t N>
6132 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
6133  Simd<T, N, 0> d,
6134  T* HWY_RESTRICT unaligned) {
6135  // AVX-512 already does the blending at no extra cost (latency 11,
6136  // reciprocal throughput 2 - the same as compress plus store).
6137  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
6138  // We're relying on the mask to blend. Clear the undefined upper bits.
6139  if (N != 16 / sizeof(T)) {
6140  m = And(m, FirstN(d, N));
6141  }
6142  return CompressStore(v, m, d, unaligned);
6143  } else {
6144  const size_t count = CountTrue(d, m);
6145  const Vec128<T, N> compressed = Compress(v, m);
6146 #if HWY_MEM_OPS_MIGHT_FAULT
6147  // BlendedStore tests mask for each lane, but we know that the mask is
6148  // FirstN, so we can just copy.
6149  alignas(16) T buf[N];
6150  Store(compressed, d, buf);
6151  memcpy(unaligned, buf, count * sizeof(T));
6152 #else
6153  BlendedStore(compressed, FirstN(d, count), d, unaligned);
6154 #endif
6155  // Workaround: as of 2022-02-23 MSAN does not mark the output as
6156  // initialized.
6157 #if HWY_IS_MSAN
6158  __msan_unpoison(unaligned, count * sizeof(T));
6159 #endif
6160  return count;
6161  }
6162 }
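// Usage sketch (illustrative): only CountTrue(d, m) lanes are written starting
// at `unaligned`; memory past them is left untouched, so stores can be
// appended back to back:
//   pos += CompressBlendedStore(v, m, d, out + pos);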
6163 
6164 // ------------------------------ CompressBitsStore (LoadMaskBits)
6165 
6166 template <typename T, size_t N>
6167 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
6168  const uint8_t* HWY_RESTRICT bits,
6169  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6170  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
6171 }
6172 
6173 #else // AVX2 or below
6174 
6175 // ------------------------------ LoadMaskBits (TestBit)
6176 
6177 namespace detail {
6178 
6179 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
6180 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6181  const RebindToUnsigned<decltype(d)> du;
6182  // Easier than Set(), which would require a wider-than-8-bit type and thus
6183  // would not compile for T=uint8_t, N=1.
6184  const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
6185 
6186  // Replicate bytes 8x such that each byte contains the bit that governs it.
6187  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
6188  1, 1, 1, 1, 1, 1, 1, 1};
6189  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
6190 
6191  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
6192  1, 2, 4, 8, 16, 32, 64, 128};
6193  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
6194 }
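// Worked example (illustrative): mask_bits = 0x8001 selects lanes 0 and 15.
// kRep8 copies byte 0 of vbits into lanes 0..7 and byte 1 into lanes 8..15;
// TestBit with kBit then checks bit (i % 8) per lane, so lane 0 tests
// 0x01 & 1 and lane 15 tests 0x80 & 128, reproducing exactly the two set bits.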
6195 
6196 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6197 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6198  const RebindToUnsigned<decltype(d)> du;
6199  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
6200  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
6201  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
6202 }
6203 
6204 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6205 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6206  const RebindToUnsigned<decltype(d)> du;
6207  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
6208  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
6209  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
6210 }
6211 
6212 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6213 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6214  const RebindToUnsigned<decltype(d)> du;
6215  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
6216  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
6217 }
6218 
6219 } // namespace detail
6220 
6221 // `p` points to at least 8 readable bytes, not all of which need be valid.
6222 template <typename T, size_t N, HWY_IF_LE128(T, N)>
6223 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
6224  const uint8_t* HWY_RESTRICT bits) {
6225  uint64_t mask_bits = 0;
6226  constexpr size_t kNumBytes = (N + 7) / 8;
6227  CopyBytes<kNumBytes>(bits, &mask_bits);
6228  if (N < 8) {
6229  mask_bits &= (1ull << N) - 1;
6230  }
6231 
6232  return detail::LoadMaskBits(d, mask_bits);
6233 }
6234 
6235 // ------------------------------ StoreMaskBits
6236 
6237 namespace detail {
6238 
6239 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
6240  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
6241 }
6242 
6243 template <typename T, size_t N>
6244 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
6245  const Mask128<T, N> mask) {
6246  const Simd<T, N, 0> d;
6247  const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
6248  return U64FromInt(_mm_movemask_epi8(sign_bits));
6249 }
6250 
6251 template <typename T, size_t N>
6252 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
6253  const Mask128<T, N> mask) {
6254  // Remove useless lower half of each u16 while preserving the sign bit.
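 // (The saturating pack maps 0xFFFF to 0xFF and 0 to 0 in the lower 8 bytes,
 // so movemask_epi8 then yields one bit per original 16-bit lane.)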
6255  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
6256  return U64FromInt(_mm_movemask_epi8(sign_bits));
6257 }
6258 
6259 template <typename T, size_t N>
6260 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
6261  const Mask128<T, N> mask) {
6262  const Simd<T, N, 0> d;
6263  const Simd<float, N, 0> df;
6264  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
6265  return U64FromInt(_mm_movemask_ps(sign_bits.raw));
6266 }
6267 
6268 template <typename T, size_t N>
6269 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
6270  const Mask128<T, N> mask) {
6271  const Simd<T, N, 0> d;
6272  const Simd<double, N, 0> df;
6273  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
6274  return U64FromInt(_mm_movemask_pd(sign_bits.raw));
6275 }
6276 
6277 // Returns the lowest N of the _mm_movemask* bits.
6278 template <typename T, size_t N>
6279 constexpr uint64_t OnlyActive(uint64_t mask_bits) {
6280  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
6281 }
6282 
6283 template <typename T, size_t N>
6284 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
6285  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
6286 }
6287 
6288 } // namespace detail
6289 
6290 // `p` points to at least 8 writable bytes.
6291 template <typename T, size_t N>
6292 HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
6293  const Mask128<T, N> mask, uint8_t* bits) {
6294  constexpr size_t kNumBytes = (N + 7) / 8;
6295  const uint64_t mask_bits = detail::BitsFromMask(mask);
6296  CopyBytes<kNumBytes>(&mask_bits, bits);
6297  return kNumBytes;
6298 }
6299 
6300 // ------------------------------ Mask testing
6301 
6302 template <typename T, size_t N>
6303 HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6304  // Cheaper than PTEST, which is 2 uop / 3L.
6305  return detail::BitsFromMask(mask) == 0;
6306 }
6307 
6308 template <typename T, size_t N>
6309 HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6310  constexpr uint64_t kAllBits =
6311  detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
6312  return detail::BitsFromMask(mask) == kAllBits;
6313 }
6314 
6315 template <typename T, size_t N>
6316 HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
6317  const Mask128<T, N> mask) {
6318  return PopCount(detail::BitsFromMask(mask));
6319 }
6320 
6321 template <typename T, size_t N>
6322 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
6323  const Mask128<T, N> mask) {
6324  const uint64_t mask_bits = detail::BitsFromMask(mask);
6325  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
6326 }
6327 
6328 // ------------------------------ Compress, CompressBits
6329 
6330 namespace detail {
6331 
6332 // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
6333 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6334 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6335  HWY_DASSERT(mask_bits < 256);
6336  const Rebind<uint8_t, decltype(d)> d8;
6337  const Simd<uint16_t, N, 0> du;
6338 
6339  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
6340  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
6341  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
6342  // store lane indices and convert to byte indices (2*lane + 0..1), with the
6343  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
6344  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
6345  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
6346  // is likely more costly than the higher cache footprint from storing bytes.
6347  alignas(16) constexpr uint8_t table[2048] = {
6348  // PrintCompress16x8Tables
6349  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6350  2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6351  4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
6352  2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6353  6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
6354  2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
6355  4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
6356  2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6357  8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
6358  2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
6359  4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
6360  2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
6361  6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
6362  2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
6363  4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
6364  2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6365  10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
6366  2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
6367  4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
6368  2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
6369  6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
6370  2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
6371  4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
6372  2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
6373  8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
6374  2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
6375  4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
6376  2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
6377  6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
6378  2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
6379  4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
6380  2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6381  12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
6382  2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
6383  4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
6384  2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
6385  6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
6386  2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
6387  4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
6388  2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
6389  8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
6390  2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
6391  4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
6392  2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
6393  6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
6394  2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
6395  4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
6396  2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
6397  10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
6398  2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
6399  4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
6400  2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
6401  6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
6402  2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
6403  4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
6404  2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
6405  8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
6406  2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
6407  4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
6408  2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
6409  6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
6410  2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
6411  4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
6412  2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6413  14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
6414  2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
6415  4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
6416  2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
6417  6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
6418  2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
6419  4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
6420  2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
6421  8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
6422  2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
6423  4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
6424  2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
6425  6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
6426  2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
6427  4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
6428  2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
6429  10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
6430  2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
6431  4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
6432  2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
6433  6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
6434  2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
6435  4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
6436  2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
6437  8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
6438  2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
6439  4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
6440  2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
6441  6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
6442  2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
6443  4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
6444  2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
6445  12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
6446  2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
6447  4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
6448  2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
6449  6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
6450  2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
6451  4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
6452  2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
6453  8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
6454  2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
6455  4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
6456  2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
6457  6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
6458  2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
6459  4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
6460  2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
6461  10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
6462  2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
6463  4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
6464  2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
6465  6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
6466  2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
6467  4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
6468  2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
6469  8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
6470  2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
6471  4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
6472  2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
6473  6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
6474  2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
6475  4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
6476  2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
6477 
6478  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6479  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6480  return BitCast(d, pairs + Set(du, 0x0100));
6481 }
6482 
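// Illustrative scalar sketch (hypothetical helper, not a Highway API) of how one table
// entry (a doubled lane index) becomes a pair of byte shuffle indices: ZipLower
// duplicates each byte into both halves of a 16-bit lane, and adding 0x0100 turns the
// pair (2*lane, 2*lane) into (2*lane, 2*lane + 1), i.e. the low and high byte of the
// selected 16-bit source lane (little-endian, as on x86).
#include <cstdint>
inline uint16_t BytePairFromDoubledLane(uint8_t doubled_lane) {
  const uint16_t pair = static_cast<uint16_t>(doubled_lane * 0x0101u);  // ZipLower(b, b)
  return static_cast<uint16_t>(pair + 0x0100u);  // high byte now selects 2*lane + 1
}
// Example: table entry 6 (source lane 3) -> 0x0706, i.e. bytes 6 and 7 of the vector.
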
6483 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6484 HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6485  uint64_t mask_bits) {
6486  HWY_DASSERT(mask_bits < 256);
6487  const Rebind<uint8_t, decltype(d)> d8;
6488  const Simd<uint16_t, N, 0> du;
6489 
6490  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
6491  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
6492  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
6493  // store lane indices and convert to byte indices (2*lane + 0..1), with the
6494  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
6495  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
6496  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
6497  // is likely more costly than the higher cache footprint from storing bytes.
6498  alignas(16) constexpr uint8_t table[2048] = {
6499  // PrintCompressNot16x8Tables
6500  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
6501  0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
6502  0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
6503  0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
6504  0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
6505  0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
6506  0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
6507  0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
6508  0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
6509  0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
6510  0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
6511  0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
6512  0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
6513  0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
6514  0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
6515  0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
6516  0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
6517  0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
6518  0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
6519  0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
6520  0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
6521  0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
6522  0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
6523  0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
6524  0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
6525  0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
6526  0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
6527  0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
6528  0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
6529  0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
6530  0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
6531  0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
6532  0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
6533  0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
6534  0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
6535  0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
6536  0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
6537  0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
6538  0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
6539  0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
6540  0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
6541  0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
6542  0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
6543  0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
6544  0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
6545  0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
6546  0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
6547  0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
6548  0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
6549  0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
6550  0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
6551  0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
6552  0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
6553  0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
6554  0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
6555  0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
6556  0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
6557  0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
6558  0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
6559  0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
6560  0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
6561  0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
6562  0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
6563  0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
6564  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
6565  0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
6566  0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
6567  0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
6568  0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
6569  0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
6570  0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
6571  0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
6572  0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
6573  0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
6574  0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
6575  0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
6576  0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
6577  0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
6578  0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
6579  0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
6580  0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
6581  0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
6582  0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
6583  0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
6584  0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
6585  0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
6586  0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
6587  0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
6588  0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
6589  0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
6590  0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
6591  0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
6592  0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
6593  0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
6594  0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
6595  0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
6596  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
6597  0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
6598  0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
6599  0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
6600  0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
6601  0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
6602  0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
6603  0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
6604  0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
6605  0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
6606  0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
6607  0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
6608  0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
6609  0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
6610  0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
6611  0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
6612  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
6613  0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
6614  0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
6615  0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
6616  0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
6617  0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
6618  0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
6619  0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
6620  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
6621  0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
6622  0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
6623  0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
6624  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
6625  0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
6626  0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
6627  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6628 
6629  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6630  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6631  return BitCast(d, pairs + Set(du, 0x0100));
6632 }
6633 
6634 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6635 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6636  HWY_DASSERT(mask_bits < 16);
6637 
6638  // There are only 4 lanes, so we can afford to load the index vector directly.
6639  alignas(16) constexpr uint8_t u8_indices[256] = {
6640  // PrintCompress32x4Tables
6641  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6642  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6643  4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
6644  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6645  8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
6646  0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
6647  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
6648  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6649  12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
6650  0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
6651  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
6652  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
6653  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
6654  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
6655  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
6656  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6657 
6658  const Repartition<uint8_t, decltype(d)> d8;
6659  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6660 }
6661 
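// Illustrative sketch (hypothetical helper, not part of Highway): each table row above
// simply lists the byte offsets of the mask-selected 32-bit lanes first, then the
// remaining lanes. For example, mask_bits = 0b0110 (lanes 1 and 2 active) selects the
// row 4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15, which moves lanes 1 and 2 to the front.
#include <cstdint>
inline void RowForMask32x4(uint64_t mask_bits, uint8_t row[16]) {
  int pos = 0;
  for (int pass = 0; pass < 2; ++pass) {        // pass 0: active lanes, pass 1: the rest
    for (int lane = 0; lane < 4; ++lane) {
      const bool active = (mask_bits >> lane) & 1;
      if (active == (pass == 0)) {
        for (int b = 0; b < 4; ++b) row[pos++] = static_cast<uint8_t>(4 * lane + b);
      }
    }
  }
}
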
6662 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6663 HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6664  uint64_t mask_bits) {
6665  HWY_DASSERT(mask_bits < 16);
6666 
6667  // There are only 4 lanes, so we can afford to load the index vector directly.
6668  alignas(16) constexpr uint8_t u8_indices[256] = {
6669  // PrintCompressNot32x4Tables
6670  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6671  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6672  8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6673  14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6674  12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6675  2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6676  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6677  10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6678  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6679  2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6680  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6681  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6682  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6683  10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6684  12, 13, 14, 15};
6685 
6686  const Repartition<uint8_t, decltype(d)> d8;
6687  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6688 }
6689 
6690 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6691 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6692  HWY_DASSERT(mask_bits < 4);
6693 
6694  // There are only 2 lanes, so we can afford to load the index vector directly.
6695  alignas(16) constexpr uint8_t u8_indices[64] = {
6696  // PrintCompress64x2Tables
6697  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6698  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6699  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6700  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6701 
6702  const Repartition<uint8_t, decltype(d)> d8;
6703  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6704 }
6705 
6706 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6707 HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6708  uint64_t mask_bits) {
6709  HWY_DASSERT(mask_bits < 4);
6710 
6711  // There are only 2 lanes, so we can afford to load the index vector directly.
6712  alignas(16) constexpr uint8_t u8_indices[64] = {
6713  // PrintCompressNot64x2Tables
6714  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6715  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6716  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6717  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6718 
6719  const Repartition<uint8_t, decltype(d)> d8;
6720  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6721 }
6722 
6723 template <typename T, size_t N>
6724 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
6725  const Simd<T, N, 0> d;
6726  const RebindToUnsigned<decltype(d)> du;
6727 
6728  HWY_DASSERT(mask_bits < (1ull << N));
6729  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6730  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6731 }
6732 
6733 template <typename T, size_t N>
6734 HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
6735  const Simd<T, N, 0> d;
6736  const RebindToUnsigned<decltype(d)> du;
6737 
6738  HWY_DASSERT(mask_bits < (1ull << N));
6739  const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
6740  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6741 }
6742 
6743 } // namespace detail
6744 
6745 // Single lane: no-op
6746 template <typename T>
6747 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6748  return v;
6749 }
6750 
6751 // Two lanes: conditional swap
6752 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6753 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
6754  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
6755  const Full128<T> d;
6756  const Vec128<T> m = VecFromMask(d, mask);
6757  const Vec128<T> maskL = DupEven(m);
6758  const Vec128<T> maskH = DupOdd(m);
6759  const Vec128<T> swap = AndNot(maskL, maskH);
6760  return IfVecThenElse(swap, Shuffle01(v), v);
6761 }
6762 
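// Illustrative scalar sketch (hypothetical helper, not a Highway API) of the two-lane
// Compress above: only mask = (lane1 active, lane0 inactive) requires moving lane 1
// down; every other mask already has its active lanes in the lowest positions.
#include <cstdint>
inline void Compress2x64Scalar(const uint64_t v[2], const bool mask[2], uint64_t out[2]) {
  const bool swap = mask[1] && !mask[0];  // AndNot(maskL, maskH)
  out[0] = swap ? v[1] : v[0];            // IfVecThenElse(swap, Shuffle01(v), v)
  out[1] = swap ? v[0] : v[1];
}
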
6763 // General case
6764 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
6765 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6766  return detail::CompressBits(v, detail::BitsFromMask(mask));
6767 }
6768 
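// Illustrative scalar sketch (hypothetical helper, not a Highway API) of the Compress
// contract implemented by the tables above: active lanes are packed toward lane 0 in
// their original order; the SSE4 tables place the remaining (inactive) lanes after
// them, but callers should only rely on the first CountTrue() lanes.
#include <cstddef>
template <typename T, size_t N>
void CompressScalar(const T (&v)[N], const bool (&mask)[N], T (&out)[N]) {
  size_t pos = 0;
  for (size_t i = 0; i < N; ++i) {
    if (mask[i]) out[pos++] = v[i];   // active lanes first, in order
  }
  for (size_t i = 0; i < N; ++i) {
    if (!mask[i]) out[pos++] = v[i];  // then the inactive lanes, matching the tables
  }
}
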
6769 // Single lane: no-op
6770 template <typename T>
6771 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6772  return v;
6773 }
6774 
6775 // Two lanes: conditional swap
6776 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6777 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
6778  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
6779  const Full128<T> d;
6780  const Vec128<T> m = VecFromMask(d, mask);
6781  const Vec128<T> maskL = DupEven(m);
6782  const Vec128<T> maskH = DupOdd(m);
6783  const Vec128<T> swap = AndNot(maskH, maskL);
6784  return IfVecThenElse(swap, Shuffle01(v), v);
6785 }
6786 
6787 // General case
6788 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
6789 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6790  // For partial vectors, we cannot pull the Not() into the table because
6791  // BitsFromMask clears the upper bits.
6792  if (N < 16 / sizeof(T)) {
6793  return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
6794  }
6795  return detail::CompressNotBits(v, detail::BitsFromMask(mask));
6796 }
6797 
6798 // ------------------------------ CompressBlocksNot
6799 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6800  Mask128<uint64_t> /* m */) {
6801  return v;
6802 }
6803 
6804 template <typename T, size_t N>
6805 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6806  const uint8_t* HWY_RESTRICT bits) {
6807  uint64_t mask_bits = 0;
6808  constexpr size_t kNumBytes = (N + 7) / 8;
6809  CopyBytes<kNumBytes>(bits, &mask_bits);
6810  if (N < 8) {
6811  mask_bits &= (1ull << N) - 1;
6812  }
6813 
6814  return detail::CompressBits(v, mask_bits);
6815 }
6816 
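// Illustrative scalar sketch (hypothetical helper, not a Highway API) of how the packed
// `bits` array is read above: bit i of the first byte corresponds to lane i (LSB first,
// little-endian as on x86), (N + 7) / 8 bytes are copied, and for N < 8 the unused
// upper bits are cleared.
#include <cstddef>
#include <cstdint>
#include <cstring>
inline uint64_t MaskBitsFromBytes(const uint8_t* bits, size_t num_lanes) {
  uint64_t mask_bits = 0;
  const size_t num_bytes = (num_lanes + 7) / 8;             // kNumBytes
  memcpy(&mask_bits, bits, num_bytes);                      // CopyBytes
  if (num_lanes < 8) mask_bits &= (1ull << num_lanes) - 1;  // clear bits >= N
  return mask_bits;
}
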
6817 // ------------------------------ CompressStore, CompressBitsStore
6818 
6819 template <typename T, size_t N>
6820 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
6821  T* HWY_RESTRICT unaligned) {
6822  const RebindToUnsigned<decltype(d)> du;
6823 
6824  const uint64_t mask_bits = detail::BitsFromMask(m);
6825  HWY_DASSERT(mask_bits < (1ull << N));
6826  const size_t count = PopCount(mask_bits);
6827 
6828  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6829  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6830  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6831  StoreU(compressed, d, unaligned);
6832  // Workaround for MSAN not marking output as initialized (b/233326619)
6833 #if HWY_IS_MSAN
6834  __msan_unpoison(unaligned, count * sizeof(T));
6835 #endif
6836 
6837  return count;
6838 }
6839 
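// Illustrative scalar sketch (hypothetical helper, not a Highway API) of the
// CompressStore contract: the active lanes are written contiguously to the output and
// their count is returned so callers can advance the output pointer. Note that the
// vector version above issues a full StoreU, so up to N lanes are written even though
// only the first `count` are meaningful; CompressBlendedStore below instead blends and
// writes only the first `count` lanes.
#include <cstddef>
template <typename T, size_t N>
size_t CompressStoreScalar(const T (&v)[N], const bool (&mask)[N], T* out) {
  size_t count = 0;
  for (size_t i = 0; i < N; ++i) {
    if (mask[i]) out[count++] = v[i];  // the scalar model writes only the prefix
  }
  return count;  // callers typically do out += count
}
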
6840 template <typename T, size_t N>
6841 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
6842  Simd<T, N, 0> d,
6843  T* HWY_RESTRICT unaligned) {
6844  const RebindToUnsigned<decltype(d)> du;
6845 
6846  const uint64_t mask_bits = detail::BitsFromMask(m);
6847  HWY_DASSERT(mask_bits < (1ull << N));
6848  const size_t count = PopCount(mask_bits);
6849 
6850  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6851  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6852  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6853  BlendedStore(compressed, FirstN(d, count), d, unaligned);
6854  // Workaround for MSAN not marking output as initialized (b/233326619)
6855 #if HWY_IS_MSAN
6856  __msan_unpoison(unaligned, count * sizeof(T));
6857 #endif
6858  return count;
6859 }
6860 
6861 template <typename T, size_t N>
6862 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
6863  const uint8_t* HWY_RESTRICT bits,
6864  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6865  const RebindToUnsigned<decltype(d)> du;
6866 
6867  uint64_t mask_bits = 0;
6868  constexpr size_t kNumBytes = (N + 7) / 8;
6869  CopyBytes<kNumBytes>(bits, &mask_bits);
6870  if (N < 8) {
6871  mask_bits &= (1ull << N) - 1;
6872  }
6873  const size_t count = PopCount(mask_bits);
6874 
6875  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6876  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6877  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6878  StoreU(compressed, d, unaligned);
6879 
6880  // Workaround for MSAN not marking output as initialized (b/233326619)
6881 #if HWY_IS_MSAN
6882  __msan_unpoison(unaligned, count * sizeof(T));
6883 #endif
6884  return count;
6885 }
6886 
6887 #endif // HWY_TARGET <= HWY_AVX3
6888 
6889 // ------------------------------ StoreInterleaved2/3/4
6890 
6891 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
6892 // generic_ops-inl.h.
6893 
6894 // ------------------------------ Reductions
6895 
6896 namespace detail {
6897 
6898 // N=1 for any T: no-op
6899 template <typename T>
6900 HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6901  const Vec128<T, 1> v) {
6902  return v;
6903 }
6904 template <typename T>
6905 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6906  const Vec128<T, 1> v) {
6907  return v;
6908 }
6909 template <typename T>
6910 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6911  const Vec128<T, 1> v) {
6912  return v;
6913 }
6914 
6915 // u32/i32/f32:
6916 
6917 // N=2
6918 template <typename T>
6919 HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
6920  const Vec128<T, 2> v10) {
6921  return v10 + Shuffle2301(v10);
6922 }
6923 template <typename T>
6924 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
6925  const Vec128<T, 2> v10) {
6926  return Min(v10, Shuffle2301(v10));
6927 }
6928 template <typename T>
6929 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
6930  const Vec128<T, 2> v10) {
6931  return Max(v10, Shuffle2301(v10));
6932 }
6933 
6934 // N=4 (full)
6935 template <typename T>
6936 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
6937  const Vec128<T> v3210) {
6938  const Vec128<T> v1032 = Shuffle1032(v3210);
6939  const Vec128<T> v31_20_31_20 = v3210 + v1032;
6940  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6941  return v20_31_20_31 + v31_20_31_20;
6942 }
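// Worked example (illustrative, not from the source): for input lanes (a3, a2, a1, a0),
// Shuffle1032 yields (a1, a0, a3, a2), so the first add produces
// (a3+a1, a2+a0, a3+a1, a2+a0). Shuffle0321 rotates that by one lane and the second
// add leaves a0+a1+a2+a3 in every lane. A scalar model:
inline void SumOfLanes4Scalar(const float in[4], float out[4]) {
  const float s20 = in[0] + in[2];                 // lanes 2+0 after the first add
  const float s31 = in[1] + in[3];                 // lanes 3+1 after the first add
  for (int i = 0; i < 4; ++i) out[i] = s20 + s31;  // broadcast of the total
}
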
6943 template <typename T>
6944 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
6945  const Vec128<T> v3210) {
6946  const Vec128<T> v1032 = Shuffle1032(v3210);
6947  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
6948  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6949  return Min(v20_31_20_31, v31_20_31_20);
6950 }
6951 template <typename T>
6952 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
6953  const Vec128<T> v3210) {
6954  const Vec128<T> v1032 = Shuffle1032(v3210);
6955  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
6956  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6957  return Max(v20_31_20_31, v31_20_31_20);
6958 }
6959 
6960 // u64/i64/f64:
6961 
6962 // N=2 (full)
6963 template <typename T>
6964 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
6965  const Vec128<T> v10) {
6966  const Vec128<T> v01 = Shuffle01(v10);
6967  return v10 + v01;
6968 }
6969 template <typename T>
6970 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
6971  const Vec128<T> v10) {
6972  const Vec128<T> v01 = Shuffle01(v10);
6973  return Min(v10, v01);
6974 }
6975 template <typename T>
6976 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
6977  const Vec128<T> v10) {
6978  const Vec128<T> v01 = Shuffle01(v10);
6979  return Max(v10, v01);
6980 }
6981 
6982 // u16/i16
6983 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6984 HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
6985  const Repartition<int32_t, Simd<T, N, 0>> d32;
6986  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
6987  const auto odd = ShiftRight<16>(BitCast(d32, v));
6988  const auto min = MinOfLanes(d32, Min(even, odd));
6989  // Also broadcast into odd lanes.
6990  return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
6991 }
6992 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6993 HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
6994  const Repartition<int32_t, Simd<T, N, 0>> d32;
6995  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
6996  const auto odd = ShiftRight<16>(BitCast(d32, v));
6997  const auto max = MaxOfLanes(d32, Max(even, odd));
6998  // Also broadcast into odd lanes.
6999  return BitCast(Simd<T, N, 0>(), Or(max, ShiftLeft<16>(max)));
7000 }
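// Illustrative scalar sketch (hypothetical helper, not a Highway API) of the 16-bit
// reduction above, shown for the unsigned full-vector case: even and odd 16-bit lanes
// are reduced within 32-bit lanes and the result is broadcast back into all lanes.
#include <cstdint>
inline void MinOfLanesU16x8Scalar(const uint16_t in[8], uint16_t out[8]) {
  uint16_t m = in[0];
  for (int i = 1; i < 8; ++i) m = (in[i] < m) ? in[i] : m;  // covers even and odd halves
  for (int i = 0; i < 8; ++i) out[i] = m;                   // Or(min, ShiftLeft<16>(min))
}
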
7001 
7002 } // namespace detail
7003 
7004 // Supported for u/i/f 32/64 (and u16/i16 for Min/MaxOfLanes). Returns the same
7004 // value in each lane.
7005 template <typename T, size_t N>
7006 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7007  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7008 }
7009 template <typename T, size_t N>
7010 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7011  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7012 }
7013 template <typename T, size_t N>
7014 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7015  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7016 }
7017 
7018 // ------------------------------ Lt128
7019 
7020 namespace detail {
7021 
7022 // Returns vector-mask for Lt128. Also used by x86_256/x86_512.
7023 template <class D, class V = VFromD<D>>
7024 HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
7025  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
7026  // Truth table of Eq and Lt for Hi and Lo u64.
7027  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
7028  // =H =L cH cL | out = cH | (=H & cL)
7029  // 0 0 0 0 | 0
7030  // 0 0 0 1 | 0
7031  // 0 0 1 0 | 1
7032  // 0 0 1 1 | 1
7033  // 0 1 0 0 | 0
7034  // 0 1 0 1 | 0
7035  // 0 1 1 0 | 1
7036  // 1 0 0 0 | 0
7037  // 1 0 0 1 | 1
7038  // 1 1 0 0 | 0
7039  const auto eqHL = Eq(a, b);
7040  const V ltHL = VecFromMask(d, Lt(a, b));
7041  const V ltLX = ShiftLeftLanes<1>(ltHL);
7042  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
7043  return InterleaveUpper(d, vecHx, vecHx);
7044 }
7045 
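// Illustrative scalar sketch (hypothetical helper, not a Highway API): each pair of
// u64 lanes (hi in the upper lane, lo in the lower lane) forms one 128-bit number, and
// the truth table above reduces to cH | (=H & cL):
#include <cstdint>
inline bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi, uint64_t b_lo) {
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}
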
7046 template <class D, class V = VFromD<D>>
7047 HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
7048  // No specialization required for AVX-512: Mask <-> Vec is fast, and
7049  // copying mask bits to their neighbor seems infeasible.
7050  const V ltHL = VecFromMask(d, Lt(a, b));
7051  return InterleaveUpper(d, ltHL, ltHL);
7052 }
7053 
7054 } // namespace detail
7055 
7056 template <class D, class V = VFromD<D>>
7057 HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
7058  return MaskFromVec(detail::Lt128Vec(d, a, b));
7059 }
7060 
7061 template <class D, class V = VFromD<D>>
7062 HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
7063  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
7064 }
7065 
7066 // ------------------------------ Min128, Max128 (Lt128)
7067 
7068 // Avoids the extra MaskFromVec in Lt128.
7069 template <class D, class V = VFromD<D>>
7070 HWY_API V Min128(D d, const V a, const V b) {
7071  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
7072 }
7073 
7074 template <class D, class V = VFromD<D>>
7075 HWY_API V Max128(D d, const V a, const V b) {
7076  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
7077 }
7078 
7079 template <class D, class V = VFromD<D>>
7080 HWY_API V Min128Upper(D d, const V a, const V b) {
7081  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
7082 }
7083 
7084 template <class D, class V = VFromD<D>>
7085 HWY_API V Max128Upper(D d, const V a, const V b) {
7086  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
7087 }
7088 
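// Illustrative scalar sketch (hypothetical helper, not a Highway API) of Min128 above:
// whole (hi, lo) pairs are selected, which is useful e.g. for 64-bit keys paired with
// 64-bit payloads.
#include <cstdint>
inline void Min128Scalar(const uint64_t a[2], const uint64_t b[2], uint64_t out[2]) {
  const bool a_lt_b = (a[1] < b[1]) || (a[1] == b[1] && a[0] < b[0]);  // Lt128Vec
  out[0] = a_lt_b ? a[0] : b[0];
  out[1] = a_lt_b ? a[1] : b[1];
}
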
7089 // ================================================== Operator wrapper
7090 
7091 // These apply to all x86_*-inl.h because there are no restrictions on V.
7092 
7093 template <class V>
7094 HWY_API V Add(V a, V b) {
7095  return a + b;
7096 }
7097 template <class V>
7098 HWY_API V Sub(V a, V b) {
7099  return a - b;
7100 }
7101 
7102 template <class V>
7103 HWY_API V Mul(V a, V b) {
7104  return a * b;
7105 }
7106 template <class V>
7107 HWY_API V Div(V a, V b) {
7108  return a / b;
7109 }
7110 
7111 template <class V>
7112 V Shl(V a, V b) {
7113  return a << b;
7114 }
7115 template <class V>
7116 V Shr(V a, V b) {
7117  return a >> b;
7118 }
7119 
7120 template <class V>
7121 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
7122  return a == b;
7123 }
7124 template <class V>
7125 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
7126  return a != b;
7127 }
7128 template <class V>
7129 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
7130  return a < b;
7131 }
7132 
7133 template <class V>
7134 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
7135  return a > b;
7136 }
7137 template <class V>
7138 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
7139  return a >= b;
7140 }
7141 
7142 template <class V>
7143 HWY_API auto Le(V a, V b) -> decltype(a == b) {
7144  return a <= b;
7145 }
7146 
7147 // NOLINTNEXTLINE(google-readability-namespace-comments)
7148 } // namespace HWY_NAMESPACE
7149 } // namespace hwy
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_IF_LANE_SIZE(T, bytes)
Definition: base.h:346
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_IF_LE128(T, N)
Definition: base.h:332
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_MAYBE_UNUSED
Definition: base.h:73
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
static Mask128< T, N > FromBits(uint64_t mask_bits)
Definition: x86_128-inl.h:141
Raw raw
Definition: arm_neon-inl.h:814
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:806
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: x86_128-inl.h:88
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: x86_128-inl.h:91
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: x86_128-inl.h:79
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: x86_128-inl.h:94
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: x86_128-inl.h:76
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: x86_128-inl.h:82
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: x86_128-inl.h:85
Definition: wasm_256-inl.h:39
Definition: x86_512-inl.h:112
#define HWY_AVX3_DL
Definition: detect_targets.h:62
#define HWY_TARGET
Definition: detect_targets.h:341
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:2039
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3345
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_INLINE auto FixConversionOverflow(DI di, VFromD< DF > original, decltype(Zero(di).raw) converted_raw) -> VFromD< DI >
Definition: x86_128-inl.h:5417
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2463
HWY_INLINE auto ClampF64ToI32Max(Simd< double, N, 0 > d, decltype(Zero(d)) v) -> decltype(Zero(d))
Definition: x86_128-inl.h:5406
HWY_API void ScalarMaskedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: x86_128-inl.h:2139
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:3219
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition: wasm_128-inl.h:1700
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:3208
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:131
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:929
HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b)
Definition: x86_128-inl.h:7047
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:721
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2444
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MaxU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: x86_128-inl.h:3080
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MinU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: x86_128-inl.h:2984
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2510
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
trn2 HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition: arm_sve-inl.h:2793
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition: x86_128-inl.h:763
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2502
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:855
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
decltype(MaskFromVec(Zero(D()))) MFromD
Definition: rvv-inl.h:1155
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4189
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
Vec128< T, 4/sizeof(T)> Vec32
Definition: arm_neon-inl.h:800
HWY_INLINE constexpr HWY_MAYBE_UNUSED int Pow2(D)
Definition: ops/shared-inl.h:252
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4164
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: ops/shared-inl.h:202
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
Simd< T, 32/sizeof(T), 0 > Full256
Definition: wasm_256-inl.h:32
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:211
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
Vec128< T, 8/sizeof(T)> Vec64
Definition: arm_neon-inl.h:797
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
long long int GatherIndex64
Definition: x86_128-inl.h:3201
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:674
constexpr HWY_API bool IsSigned()
Definition: base.h:534
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: arm_neon-inl.h:3883
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3884
__m128i raw
Definition: x86_128-inl.h:4064
Definition: ops/shared-inl.h:40
HWY_INLINE __m128d operator()(__m128i v)
Definition: x86_128-inl.h:226
HWY_INLINE __m128 operator()(__m128i v)
Definition: x86_128-inl.h:222
HWY_INLINE __m128i operator()(__m128i v)
Definition: x86_128-inl.h:218
Definition: arm_neon-inl.h:823
Full512< T > operator()(const hwy::HWY_NAMESPACE::Vec512< T > *) const
Definition: x86_128-inl.h:182
Simd< T, N, 0 > operator()(const Vec128< T, N > *) const
Definition: x86_128-inl.h:171
Full256< T > operator()(const hwy::HWY_NAMESPACE::Vec256< T > *) const
Definition: x86_128-inl.h:176
Definition: x86_128-inl.h:190
decltype(DeduceD()(static_cast< V * >(nullptr))) type
Definition: x86_128-inl.h:191
__m128d type
Definition: x86_128-inl.h:64
__f32x4 type
Definition: wasm_128-inl.h:60
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:56
Definition: x86_128-inl.h:119
__mmask16 type
Definition: x86_128-inl.h:120
Definition: x86_128-inl.h:123
__mmask8 type
Definition: x86_128-inl.h:124
Definition: x86_128-inl.h:127
__mmask8 type
Definition: x86_128-inl.h:128
Definition: x86_128-inl.h:131
__mmask8 type
Definition: x86_128-inl.h:132
Definition: x86_128-inl.h:117
Definition: base.h:358
HWY_AFTER_NAMESPACE()
#define HWY_INLINE_F16
Definition: x86_128-inl.h:5231
HWY_BEFORE_NAMESPACE()
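The symbols indexed above are the portable Highway ops that this header implements for 128-bit vectors. As a rough, non-authoritative sketch of how a few of them compose under static dispatch, the snippet below clamps negative lanes to zero and reduces to a minimum. The function name MinOfNonNegative, the unaligned LoadU of caller-supplied memory, and the standalone translation unit are assumptions made for illustration, not part of this header.

// Illustrative sketch only: composes Zero, IfThenZeroElse, MinOfLanes and
// GetLane from the index above. Function name and setup are assumptions.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns the minimum of four floats after clamping negative lanes to zero.
HWY_ATTR float MinOfNonNegative(const float* HWY_RESTRICT in) {
  const hn::Full128<float> d;                          // descriptor for 4 float lanes
  const auto v = hn::LoadU(d, in);                     // unaligned load of 4 floats
  const auto is_neg = v < hn::Zero(d);                 // mask of negative lanes
  const auto clamped = hn::IfThenZeroElse(is_neg, v);  // negative lanes become 0
  const auto min = hn::MinOfLanes(d, clamped);         // minimum broadcast to all lanes
  return hn::GetLane(min);                             // extract lane 0
}

When building with foreach_target.h for runtime dispatch, the same body would typically sit between HWY_BEFORE_NAMESPACE() and HWY_AFTER_NAMESPACE(), which apply the target attributes to a whole block of code instead of the per-function HWY_ATTR used here.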