wasm_128-inl.h
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.
#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

namespace detail {

template <typename T>
struct Raw128 {
  using type = __v128_u;
};
template <>
struct Raw128<float> {
  using type = __f32x4;
};

}  // namespace detail
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};
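
// Usage sketch: the compound operators forward to the non-member overloads
// defined later in this file, e.g. for f32:
//   Vec128<float> v = Set(Full128<float>(), 2.0f);
//   v *= v;  // every lane is now 4.0f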

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

namespace detail {

// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
  template <typename T, size_t N>
  Simd<T, N, 0> operator()(Vec128<T, N>) const {
    return Simd<T, N, 0>();
  }
};

}  // namespace detail

template <class V>
using DFromV = decltype(detail::DeduceD()(V()));

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ------------------------------ BitCast

namespace detail {

HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
  return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
  return static_cast<__v128_u>(v);
}

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
                                        Vec128<uint8_t, N * sizeof(T)> v) {
  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}

}  // namespace detail

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
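
// Example: reinterpreting f32 lanes as i32 (lane sizes must match):
//   const Simd<int32_t, 4, 0> di;
//   const auto bits = BitCast(di, Set(Simd<float, 4, 0>(), 1.0f));
//   // Each i32 lane now holds 0x3F800000, the bit pattern of 1.0f.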

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{wasm_i32x4_splat(0)};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
  return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
                                const uint16_t t) {
  return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
                                const uint32_t t) {
  return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
                                const uint64_t t) {
  return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
  return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
  return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
  return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
}

template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
  return Vec128<float, N>{wasm_f32x4_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
  HWY_ALIGN T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
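
// Example: Iota(Simd<int32_t, 4, 0>(), 5) yields lanes {5, 6, 7, 8}.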

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
}

// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
}
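
// Note: Abs(Set(d, int8_t{-128})) returns -128 because +128 is not
// representable in int8_t; all other inputs yield the usual absolute value.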

// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
}

// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
}

// 8-bit
template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
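
// The xor/subtract pair above sign-extends the logically-shifted value:
// e.g. for kBits = 2, int8_t -4 (0xFC) logically shifted is 0x3F; XOR with
// 0x80 >> 2 = 0x20 gives 0x1F, and subtracting 0x20 yields -1 (0xFF), the
// correct arithmetic shift result.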

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
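
// Example: RotateRight<8>(Set(d, uint32_t{0x12345678})) yields 0x78123456
// in every lane.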

// ------------------------------ Shift lanes by same variable #bits

// After https://reviews.llvm.org/D108415 shift argument became unsigned.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")

// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
                                          const int bits) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
                                           const int bits) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
                                          const int bits) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
                                           const int bits) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
                                          const int bits) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
                                           const int bits) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
                                         const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
                                          const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
                                         const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
                                          const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
                                         const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
                                          const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
}

// 8-bit
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{
      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}

template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
                                          const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ignore Wsign-conversion
HWY_DIAGNOSTICS(pop)

// ------------------------------ Minimum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(min)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  // Only two lanes are needed (was over-allocated as [4] in the original).
  alignas(16) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(min)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_min(a.raw, b.raw)};
}

// ------------------------------ Maximum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(max)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(max)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_max(a.raw, b.raw)};
}

// ------------------------------ Integer multiplication

// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;

  const Vec128<uint16_t, N> lo = BitCast(du, Mul(a, b));
  const Vec128<int16_t, N> hi = MulHigh(a, b);
  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
  // carry that into the result. Instead isolate the top two bits because only
  // they can influence the result.
  const Vec128<uint16_t, N> lo_top2 = ShiftRight<14>(lo);
  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
  const Vec128<uint16_t, N> rounding = ShiftRight<1>(Add(lo_top2, Set(du, 1)));
  return Add(Add(hi, hi), BitCast(d, rounding));
}
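
// Worked example: for a = b = 16384 (0.5 in Q15), a * b = 0x10000000, so
// hi = 0x1000 and lo = 0; the rounding term is 0 and the result is
// 2 * hi = 0x2000, i.e. 0.25 in Q15.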

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}

// ------------------------------ Negate

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
}

// ------------------------------ Floating-point mul / div

template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}

// Approximate reciprocal
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
  return one / v;
}

// Absolute value of difference.
template <size_t N>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfma?
  return mul * x + add;
}

// Returns add - mul * x
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  // TODO(eustas): replace, when implemented in WASM.
  return add - mul * x;
}

// Returns mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfms?
  return mul * x - sub;
}

// Returns -mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Full precision square root
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}

// Approximate reciprocal square root
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
  return one / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}

// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}

// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}

// ------------------------------ Floating-point classification
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
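
// For f32, the biased exponent occupies 8 bits and MaxExponentField() is 255;
// after clearing the sign bit and shifting right by MantissaBits() + 1 = 24,
// 'exp' holds that field, so lanes are finite exactly when exp < 255.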

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
                                   Mask128<TFrom, N> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask128<TTo, N>{m.raw};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
                                     const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
                                      const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
                                      const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
                                      const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
                                       const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
                                       const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> d;
  const Repartition<uint32_t, decltype(d)> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves are not equal, this is the answer.
  const auto m_gt = a32 > b32;

  // Otherwise, the lower half decides.
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
  const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));

  const auto gt = Or(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
}

template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
                                    const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>(b, a);
}

// ------------------------------ Weak inequality

// Float <= >=
template <size_t N>
HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
}

// ------------------------------ FirstN (Iota, Lt)

template <typename T, size_t N>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}
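
// Example: with 4 x i32 lanes, FirstN(d, 2) is true for lanes 0 and 1 and
// false for lanes 2 and 3.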

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  return Vec128<T, N>{wasm_v128_not(v.raw)};
}

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}
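
// Note the argument order: AndNot(a, b) computes ~a & b, so AndNot(mask, v)
// clears exactly the bits of v that are set in mask.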

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(DFromV<decltype(magn)>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
}

// ------------------------------ BroadcastSignBit (compare)

template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const auto zero = Zero(d);
  return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)

// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}
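
// For 16-bit lanes the valid shift counts occupy 4 bits; ShiftLeft<12> moves
// bit 3 of the count into the sign bit, and each step tests the next lower
// bit, conditionally shifting the whole vector by 8, 4, 2, then 1.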

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  alignas(16) T bits_lanes[2];
  Store(v, d, lanes);
  Store(bits, d, bits_lanes);
  lanes[0] <<= bits_lanes[0];
  lanes[1] <<= bits_lanes[1];
  return Load(d, lanes);
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// ================================================== MEMORY

// ------------------------------ Load

template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{wasm_v128_load(aligned)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// Partial load.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(p, &v);
  return v;
}

// LoadU == Load.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ Store

template <typename T>
HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// Partial store.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  CopyBytes<sizeof(T) * N>(&v, p);
}

HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
                   float* HWY_RESTRICT p) {
  *p = wasm_f32x4_extract_lane(v.raw, 0);
}

// StoreU == Store.
template <typename T, size_t N>
HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T, size_t N>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
                          T* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                    T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// ------------------------------ Scatter (Store)

template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
                           T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
                          const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

// ------------------------------ Gather (Load/Store)

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
                                  const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  alignas(16) T lanes[N];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  alignas(16) T lanes[N];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}
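
// Usage sketch (illustrative; 'base' is any int32_t array with enough
// elements):
//   const Simd<int32_t, 4, 0> d;
//   const auto idx = Iota(d, 0) + Iota(d, 0);   // indices {0, 2, 4, 6}
//   const auto v = GatherIndex(d, base, idx);   // {base[0], base[2], ...}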

// ================================================== SWIZZLE

// ------------------------------ ExtractLane

namespace detail {

template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i16x8_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
}

template <size_t kLane, size_t N>
HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
  return wasm_f32x4_extract_lane(v.raw, kLane);
}

}  // namespace detail

// One overload per vector length just in case *_extract_lane raise compile
// errors if their argument is out of bounds (even if that would never be
// reached at runtime).
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return GetLane(v);
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
      case 8:
        return detail::ExtractLane<8>(v);
      case 9:
        return detail::ExtractLane<9>(v);
      case 10:
        return detail::ExtractLane<10>(v);
      case 11:
        return detail::ExtractLane<11>(v);
      case 12:
        return detail::ExtractLane<12>(v);
      case 13:
        return detail::ExtractLane<13>(v);
      case 14:
        return detail::ExtractLane<14>(v);
      case 15:
        return detail::ExtractLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}
1844 
1845 // ------------------------------ GetLane
1846 template <typename T, size_t N>
1847 HWY_API T GetLane(const Vec128<T, N> v) {
1848  return detail::ExtractLane<0>(v);
1849 }
1850 
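A minimal usage sketch, not part of this header: GetLane reads lane 0, while
ExtractLane takes a runtime index (and compiles down to a single
extract_lane when the index is provably constant). The vector values are
illustrative assumptions.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    int32_t FirstPlusThird() {
      const hn::Full128<int32_t> d;                 // 4 lanes of i32
      const auto v = hn::Iota(d, 1);                // lanes = 1,2,3,4
      const int32_t first = hn::GetLane(v);         // lane 0 -> 1
      const int32_t third = hn::ExtractLane(v, 2);  // runtime index -> 3
      return first + third;                         // 4
    }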
1851 // ------------------------------ InsertLane
1852 
1853 namespace detail {
1854 
1855 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1856 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
1857  static_assert(kLane < N, "Lane index out of bounds");
1858  return Vec128<T, N>{
1859  wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
1860 }
1861 
1862 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1863 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
1864  static_assert(kLane < N, "Lane index out of bounds");
1865  return Vec128<T, N>{
1866  wasm_i16x8_replace_lane(v.raw, kLane, static_cast<int16_t>(t))};
1867 }
1868 
1869 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1870 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
1871  static_assert(kLane < N, "Lane index out of bounds");
1872  return Vec128<T, N>{
1873  wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
1874 }
1875 
1876 template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1877 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
1878  static_assert(kLane < N, "Lane index out of bounds");
1879  return Vec128<T, N>{
1880  wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
1881 }
1882 
1883 template <size_t kLane, size_t N>
1884 HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
1885  static_assert(kLane < N, "Lane index out of bounds");
1886  return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)};
1887 }
1888 
1889 template <size_t kLane, size_t N>
1890 HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) {
1891  static_assert(kLane < 2, "Lane index out of bounds");
1892  return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)};
1893 }
1894 
1895 } // namespace detail
1896 
1897 // Requires one overload per vector length because InsertLane<3> may be a
1898 // compile error if it calls wasm_f64x2_replace_lane.
1899 
1900 template <typename T>
1901 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
1902  HWY_DASSERT(i == 0);
1903  (void)i;
1904  return Set(DFromV<decltype(v)>(), t);
1905 }
1906 
1907 template <typename T>
1908 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
1909 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1910  if (__builtin_constant_p(i)) {
1911  switch (i) {
1912  case 0:
1913  return detail::InsertLane<0>(v, t);
1914  case 1:
1915  return detail::InsertLane<1>(v, t);
1916  }
1917  }
1918 #endif
1919  const DFromV<decltype(v)> d;
1920  alignas(16) T lanes[2];
1921  Store(v, d, lanes);
1922  lanes[i] = t;
1923  return Load(d, lanes);
1924 }
1925 
1926 template <typename T>
1927 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
1928 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1929  if (__builtin_constant_p(i)) {
1930  switch (i) {
1931  case 0:
1932  return detail::InsertLane<0>(v, t);
1933  case 1:
1934  return detail::InsertLane<1>(v, t);
1935  case 2:
1936  return detail::InsertLane<2>(v, t);
1937  case 3:
1938  return detail::InsertLane<3>(v, t);
1939  }
1940  }
1941 #endif
1942  const DFromV<decltype(v)> d;
1943  alignas(16) T lanes[4];
1944  Store(v, d, lanes);
1945  lanes[i] = t;
1946  return Load(d, lanes);
1947 }
1948 
1949 template <typename T>
1950 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
1951 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1952  if (__builtin_constant_p(i)) {
1953  switch (i) {
1954  case 0:
1955  return detail::InsertLane<0>(v, t);
1956  case 1:
1957  return detail::InsertLane<1>(v, t);
1958  case 2:
1959  return detail::InsertLane<2>(v, t);
1960  case 3:
1961  return detail::InsertLane<3>(v, t);
1962  case 4:
1963  return detail::InsertLane<4>(v, t);
1964  case 5:
1965  return detail::InsertLane<5>(v, t);
1966  case 6:
1967  return detail::InsertLane<6>(v, t);
1968  case 7:
1969  return detail::InsertLane<7>(v, t);
1970  }
1971  }
1972 #endif
1973  const DFromV<decltype(v)> d;
1974  alignas(16) T lanes[8];
1975  Store(v, d, lanes);
1976  lanes[i] = t;
1977  return Load(d, lanes);
1978 }
1979 
1980 template <typename T>
1981 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
1982 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1983  if (__builtin_constant_p(i)) {
1984  switch (i) {
1985  case 0:
1986  return detail::InsertLane<0>(v, t);
1987  case 1:
1988  return detail::InsertLane<1>(v, t);
1989  case 2:
1990  return detail::InsertLane<2>(v, t);
1991  case 3:
1992  return detail::InsertLane<3>(v, t);
1993  case 4:
1994  return detail::InsertLane<4>(v, t);
1995  case 5:
1996  return detail::InsertLane<5>(v, t);
1997  case 6:
1998  return detail::InsertLane<6>(v, t);
1999  case 7:
2000  return detail::InsertLane<7>(v, t);
2001  case 8:
2002  return detail::InsertLane<8>(v, t);
2003  case 9:
2004  return detail::InsertLane<9>(v, t);
2005  case 10:
2006  return detail::InsertLane<10>(v, t);
2007  case 11:
2008  return detail::InsertLane<11>(v, t);
2009  case 12:
2010  return detail::InsertLane<12>(v, t);
2011  case 13:
2012  return detail::InsertLane<13>(v, t);
2013  case 14:
2014  return detail::InsertLane<14>(v, t);
2015  case 15:
2016  return detail::InsertLane<15>(v, t);
2017  }
2018  }
2019 #endif
2020  const DFromV<decltype(v)> d;
2021  alignas(16) T lanes[16];
2022  Store(v, d, lanes);
2023  lanes[i] = t;
2024  return Load(d, lanes);
2025 }
2026 
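A minimal usage sketch, not part of this header: with a runtime index,
InsertLane falls back to the store/modify/load path above; a constant index
becomes a single replace_lane. The function name is an illustrative
assumption.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec128<float, 4> WithLane(hn::Vec128<float, 4> v, size_t i, float t) {
      return hn::InsertLane(v, i, t);  // replace_lane when i is constant
    }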
2027 // ------------------------------ LowerHalf
2028 
2029 template <typename T, size_t N>
2030 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
2031  Vec128<T, N> v) {
2032  return Vec128<T, N / 2>{v.raw};
2033 }
2034 
2035 template <typename T, size_t N>
2036 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
2037  return LowerHalf(Simd<T, N / 2, 0>(), v);
2038 }
2039 
2040 // ------------------------------ ShiftLeftBytes
2041 
2042 // 0x01..0F, kBytes = 1 => 0x02..0F00
2043 template <int kBytes, typename T, size_t N>
2044 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
2045  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2046  const __i8x16 zero = wasm_i8x16_splat(0);
2047  switch (kBytes) {
2048  case 0:
2049  return v;
2050 
2051  case 1:
2052  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
2053  6, 7, 8, 9, 10, 11, 12, 13, 14)};
2054 
2055  case 2:
2056  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
2057  5, 6, 7, 8, 9, 10, 11, 12, 13)};
2058 
2059  case 3:
2060  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
2061  3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
2062 
2063  case 4:
2064  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
2065  2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
2066 
2067  case 5:
2068  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
2069  1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
2070 
2071  case 6:
2072  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2073  16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
2074 
2075  case 7:
2076  return Vec128<T, N>{wasm_i8x16_shuffle(
2077  v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
2078 
2079  case 8:
2080  return Vec128<T, N>{wasm_i8x16_shuffle(
2081  v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
2082 
2083  case 9:
2084  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2085  16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
2086  6)};
2087 
2088  case 10:
2089  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2090  16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
2091  5)};
2092 
2093  case 11:
2094  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2095  16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
2096  4)};
2097 
2098  case 12:
2099  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2100  16, 16, 16, 16, 16, 16, 16, 0, 1,
2101  2, 3)};
2102 
2103  case 13:
2104  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2105  16, 16, 16, 16, 16, 16, 16, 16, 0,
2106  1, 2)};
2107 
2108  case 14:
2109  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2110  16, 16, 16, 16, 16, 16, 16, 16, 16,
2111  0, 1)};
2112 
2113  case 15:
2114  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2115  16, 16, 16, 16, 16, 16, 16, 16, 16,
2116  16, 0)};
2117  }
2118  return Vec128<T, N>{zero};
2119 }
2120 
2121 template <int kBytes, typename T, size_t N>
2122 HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2123  return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
2124 }
2125 
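A minimal sketch of the byte-shift semantics noted above ("0x01..0F,
kBytes = 1 => 0x02..0F00"); the lane values are illustrative assumptions.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void ShiftLeftBytesExample() {
      const hn::Full128<uint8_t> d;
      const auto v = hn::Iota(d, 0);               // bytes 0,1,...,15
      const auto s = hn::ShiftLeftBytes<1>(d, v);  // bytes 0,0,1,...,14
      (void)s;  // lane 0 is zeroed; every other byte moves up by one
    }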
2126 // ------------------------------ ShiftLeftLanes
2127 
2128 template <int kLanes, typename T, size_t N>
2129 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2130  const Repartition<uint8_t, decltype(d)> d8;
2131  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2132 }
2133 
2134 template <int kLanes, typename T, size_t N>
2135 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
2136  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2137 }
2138 
2139 // ------------------------------ ShiftRightBytes
2140 namespace detail {
2141 
2142 // Helper function allows zeroing invalid lanes in caller.
2143 template <int kBytes, typename T, size_t N>
2144 HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
2145  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2146  const __i8x16 zero = wasm_i8x16_splat(0);
2147 
2148  switch (kBytes) {
2149  case 0:
2150  return v.raw;
2151 
2152  case 1:
2153  return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2154  12, 13, 14, 15, 16);
2155 
2156  case 2:
2157  return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2158  13, 14, 15, 16, 16);
2159 
2160  case 3:
2161  return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2162  13, 14, 15, 16, 16, 16);
2163 
2164  case 4:
2165  return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2166  14, 15, 16, 16, 16, 16);
2167 
2168  case 5:
2169  return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2170  15, 16, 16, 16, 16, 16);
2171 
2172  case 6:
2173  return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2174  16, 16, 16, 16, 16, 16);
2175 
2176  case 7:
2177  return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2178  16, 16, 16, 16, 16, 16, 16);
2179 
2180  case 8:
2181  return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2182  16, 16, 16, 16, 16, 16, 16);
2183 
2184  case 9:
2185  return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
2186  16, 16, 16, 16, 16, 16, 16);
2187 
2188  case 10:
2189  return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
2190  16, 16, 16, 16, 16, 16, 16);
2191 
2192  case 11:
2193  return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
2194  16, 16, 16, 16, 16, 16, 16);
2195 
2196  case 12:
2197  return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
2198  16, 16, 16, 16, 16, 16, 16);
2199 
2200  case 13:
2201  return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
2202  16, 16, 16, 16, 16, 16, 16);
2203 
2204  case 14:
2205  return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
2206  16, 16, 16, 16, 16, 16, 16);
2207 
2208  case 15:
2209  return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
2210  16, 16, 16, 16, 16, 16, 16);
2211  case 16:
2212  return zero;
2213  }
2214 }
2215 
2216 } // namespace detail
2217 
2218 // 0x01..0F, kBytes = 1 => 0x0001..0E
2219 template <int kBytes, typename T, size_t N>
2220 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
2221  // For partial vectors, clear upper lanes so we shift in zeros.
2222  if (N != 16 / sizeof(T)) {
2223  const Vec128<T> vfull{v.raw};
2224  v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
2225  }
2226  return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
2227 }
2228 
2229 // ------------------------------ ShiftRightLanes
2230 template <int kLanes, typename T, size_t N>
2231 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2232  const Repartition<uint8_t, decltype(d)> d8;
2233  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2234 }
2235 
2236 // ------------------------------ UpperHalf (ShiftRightBytes)
2237 
2238 // Full input: copy hi into lo (smaller instruction encoding than shifts).
2239 template <typename T>
2240 HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, const Vec128<T> v) {
2241  return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
2242 }
2243 HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
2244  return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
2245 }
2246 
2247 // Partial
2248 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2249 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
2250  Vec128<T, N> v) {
2251  const DFromV<decltype(v)> d;
2252  const RebindToUnsigned<decltype(d)> du;
2253  const auto vu = BitCast(du, v);
2254  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
2255  return Vec128<T, (N + 1) / 2>{upper.raw};
2256 }
2257 
2258 // ------------------------------ CombineShiftRightBytes
2259 
2260 template <int kBytes, typename T, class V = Vec128<T>>
2261 HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
2262  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2263  switch (kBytes) {
2264  case 0:
2265  return lo;
2266 
2267  case 1:
2268  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2269  11, 12, 13, 14, 15, 16)};
2270 
2271  case 2:
2272  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2273  11, 12, 13, 14, 15, 16, 17)};
2274 
2275  case 3:
2276  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2277  12, 13, 14, 15, 16, 17, 18)};
2278 
2279  case 4:
2280  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2281  13, 14, 15, 16, 17, 18, 19)};
2282 
2283  case 5:
2284  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2285  14, 15, 16, 17, 18, 19, 20)};
2286 
2287  case 6:
2288  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
2289  14, 15, 16, 17, 18, 19, 20, 21)};
2290 
2291  case 7:
2292  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
2293  15, 16, 17, 18, 19, 20, 21, 22)};
2294 
2295  case 8:
2296  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
2297  16, 17, 18, 19, 20, 21, 22, 23)};
2298 
2299  case 9:
2300  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
2301  17, 18, 19, 20, 21, 22, 23, 24)};
2302 
2303  case 10:
2304  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
2305  17, 18, 19, 20, 21, 22, 23, 24, 25)};
2306 
2307  case 11:
2308  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
2309  18, 19, 20, 21, 22, 23, 24, 25, 26)};
2310 
2311  case 12:
2312  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
2313  19, 20, 21, 22, 23, 24, 25, 26, 27)};
2314 
2315  case 13:
2316  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
2317  20, 21, 22, 23, 24, 25, 26, 27, 28)};
2318 
2319  case 14:
2320  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
2321  21, 22, 23, 24, 25, 26, 27, 28, 29)};
2322 
2323  case 15:
2324  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2325  22, 23, 24, 25, 26, 27, 28, 29, 30)};
2326  }
2327  return hi;
2328 }
2329 
2330 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
2331  class V = Vec128<T, N>>
2332 HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
2333  constexpr size_t kSize = N * sizeof(T);
2334  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
2335  const Repartition<uint8_t, decltype(d)> d8;
2336  const Full128<uint8_t> d_full8;
2337  using V8 = VFromD<decltype(d_full8)>;
2338  const V8 hi8{BitCast(d8, hi).raw};
2339  // Move into most-significant bytes
2340  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
2341  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
2342  return V{BitCast(Full128<T>(), r).raw};
2343 }
2344 
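A minimal sketch, assuming full vectors: CombineShiftRightBytes<4> yields
lo bytes 4..15 followed by hi bytes 0..3, i.e. a byte-granularity shift
across the concatenation of the two inputs. The function name is an
illustrative assumption.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec128<uint8_t> Upper12Lower4(hn::Vec128<uint8_t> hi,
                                      hn::Vec128<uint8_t> lo) {
      return hn::CombineShiftRightBytes<4>(hn::Full128<uint8_t>(), hi, lo);
    }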
2345 // ------------------------------ Broadcast/splat any lane
2346 
2347 template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2348 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2349  static_assert(0 <= kLane && kLane < N, "Invalid lane");
2350  return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
2351  kLane, kLane, kLane, kLane, kLane)};
2352 }
2353 
2354 template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2355 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2356  static_assert(0 <= kLane && kLane < N, "Invalid lane");
2357  return Vec128<T, N>{
2358  wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
2359 }
2360 
2361 template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2362 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2363  static_assert(0 <= kLane && kLane < N, "Invalid lane");
2364  return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
2365 }
2366 
2367 // ------------------------------ TableLookupBytes
2368 
2369 // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
2370 // lane indices in [0, 16).
2371 template <typename T, size_t N, typename TI, size_t NI>
2372 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
2373  const Vec128<TI, NI> from) {
2374 // Not yet available in all engines, see
2375 // https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
2376 // V8 implementation of this had a bug, fixed on 2021-04-03:
2377 // https://chromium-review.googlesource.com/c/v8/v8/+/2822951
2378 #if 0
2379  return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
2380 #else
2381  alignas(16) uint8_t control[16];
2382  alignas(16) uint8_t input[16];
2383  alignas(16) uint8_t output[16];
2384  wasm_v128_store(control, from.raw);
2385  wasm_v128_store(input, bytes.raw);
2386  for (size_t i = 0; i < 16; ++i) {
2387  output[i] = control[i] < 16 ? input[control[i]] : 0;
2388  }
2389  return Vec128<TI, NI>{wasm_v128_load(output)};
2390 #endif
2391 }
2392 
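A minimal sketch of TableLookupBytes as a byte permutation; the reversal
table and function name are illustrative assumptions.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec128<uint8_t> ReverseBytes(hn::Vec128<uint8_t> bytes) {
      const hn::Full128<uint8_t> d;
      alignas(16) static constexpr uint8_t kRev[16] = {
          15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
      // output[i] = bytes[kRev[i]] = bytes[15 - i]
      return hn::TableLookupBytes(bytes, hn::Load(d, kRev));
    }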
2393 template <typename T, size_t N, typename TI, size_t NI>
2394 HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
2395  const Vec128<TI, NI> from) {
2396  const Simd<TI, NI, 0> d;
2397  // Mask size must match vector type, so cast everything to this type.
2398  const Repartition<int8_t, decltype(d)> di8;
2399  const Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
2400  const auto msb = BitCast(di8, from) < Zero(di8);
2401  const auto lookup =
2402  TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
2403  return BitCast(d, IfThenZeroElse(msb, lookup));
2404 }
2405 
2406 // ------------------------------ Hard-coded shuffles
2407 
2408 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
2409 // Shuffle0321 rotates one lane to the right (the previous least-significant
2410 // lane is now most-significant). These could also be implemented via
2411 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
2412 
2413 // Swap 32-bit halves in 64-bit halves.
2414 template <typename T, size_t N>
2415 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
2416  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2417  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2418  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
2419 }
2420 
2421 // These are used by generic_ops-inl to implement LoadInterleaved3.
2422 namespace detail {
2423 
2424 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2425 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
2426  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2427  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16,
2428  0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2429  0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2430 }
2431 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2432 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
2433  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2434  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8,
2435  0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2436 }
2437 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2438 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
2439  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2440  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
2441 }
2442 
2443 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2444 HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
2445  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2446  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16,
2447  0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2448  0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2449 }
2450 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2451 HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
2452  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2453  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8,
2454  0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2455 }
2456 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2457 HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
2458  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2459  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
2460 }
2461 
2462 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2463 HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
2464  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2465  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16,
2466  0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2467  0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2468 }
2469 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2470 HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
2471  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2472  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8,
2473  0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2474 }
2475 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2476 HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
2477  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2478  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
2479 }
2480 
2481 } // namespace detail
2482 
2483 // Swap 64-bit halves
2484 template <typename T>
2485 HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
2486  static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
2487  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2488 }
2489 template <typename T>
2490 HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
2491  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2492  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2493 }
2494 
2495 // Rotate right 32 bits
2496 template <typename T>
2497 HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
2498  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2499  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
2500 }
2501 
2502 // Rotate left 32 bits
2503 template <typename T>
2504 HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
2505  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2506  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
2507 }
2508 
2509 // Reverse
2510 template <typename T>
2511 HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
2512  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2513  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
2514 }
2515 
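A minimal sketch of the hard-coded shuffles in the shuffle_abcd notation
above (lane 0 least-significant, lanes listed most- to least-significant);
the function name is an illustrative assumption.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void ShuffleExamples(hn::Vec128<int32_t> v) {  // lanes: 3,2,1,0
      (void)hn::Shuffle1032(v);  // 1,0,3,2: swap 64-bit halves
      (void)hn::Shuffle0321(v);  // 0,3,2,1: rotate right by one lane
      (void)hn::Shuffle2103(v);  // 2,1,0,3: rotate left by one lane
      (void)hn::Shuffle0123(v);  // 0,1,2,3: reverse
    }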
2516 // ------------------------------ TableLookupLanes
2517 
2518 // Returned by SetTableIndices for use by TableLookupLanes.
2519 template <typename T, size_t N>
2520 struct Indices128 {
2521  __v128_u raw;
2522 };
2523 
2524 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2525 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
2526  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2527 #if HWY_IS_DEBUG_BUILD
2528  const Rebind<TI, decltype(d)> di;
2529  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2530  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
2531 #endif
2532 
2533  const Repartition<uint8_t, decltype(d)> d8;
2534  using V8 = VFromD<decltype(d8)>;
2535  const Repartition<uint16_t, decltype(d)> d16;
2536 
2537  // Broadcast each lane index to all bytes of T and shift to bytes
2538  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
2539  if (sizeof(T) == 4) {
2540  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2541  0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2542  const V8 lane_indices =
2543  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2544  const V8 byte_indices =
2545  BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
2546  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2547  0, 1, 2, 3, 0, 1, 2, 3};
2548  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2549  } else {
2550  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2551  0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2552  const V8 lane_indices =
2553  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2554  const V8 byte_indices =
2555  BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
2556  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2557  0, 1, 2, 3, 4, 5, 6, 7};
2558  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2559  }
2560 }
2561 
2562 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2563 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
2564  const Rebind<TI, decltype(d)> di;
2565  return IndicesFromVec(d, LoadU(di, idx));
2566 }
2567 
2568 template <typename T, size_t N>
2569 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2570  using TI = MakeSigned<T>;
2571  const DFromV<decltype(v)> d;
2572  const Rebind<TI, decltype(d)> di;
2573  return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
2574 }
2575 
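A minimal usage sketch of SetTableIndices + TableLookupLanes: permuting i32
lanes by a runtime index table. The table values and function name are
illustrative assumptions.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec128<int32_t> RotateLanesDown(hn::Vec128<int32_t> v) {
      const hn::Full128<int32_t> d;
      const int32_t kIdx[4] = {1, 2, 3, 0};  // out[i] = v[kIdx[i]]
      return hn::TableLookupLanes(v, hn::SetTableIndices(d, kIdx));
    }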
2576 // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
2577 
2578 // Single lane: no change
2579 template <typename T>
2580 HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
2581  return v;
2582 }
2583 
2584 // Two lanes: shuffle
2585 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2586 HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
2587  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
2588 }
2589 
2590 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2591 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2592  return Shuffle01(v);
2593 }
2594 
2595 // Four lanes: shuffle
2596 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2597 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2598  return Shuffle0123(v);
2599 }
2600 
2601 // 16-bit
2602 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2603 HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
2604  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2605  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
2606 }
2607 
2608 // ------------------------------ Reverse2
2609 
2610 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2611 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
2612  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2613  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2614 }
2615 
2616 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2617 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2618  return Shuffle2301(v);
2619 }
2620 
2621 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2622 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2623  return Shuffle01(v);
2624 }
2625 
2626 // ------------------------------ Reverse4
2627 
2628 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2629 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
2630  return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
2631  1, 0, 7, 6, 5, 4)});
2632 }
2633 
2634 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2635 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2636  return Shuffle0123(v);
2637 }
2638 
2639 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2640 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
2641  HWY_ASSERT(0); // don't have 8 u64 lanes
2642 }
2643 
2644 // ------------------------------ Reverse8
2645 
2646 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2647 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
2648  return Reverse(d, v);
2649 }
2650 
2651 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2652 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
2653  HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
2654 }
2655 
2656 // ------------------------------ InterleaveLower
2657 
2658 template <size_t N>
2659 HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
2660  Vec128<uint8_t, N> b) {
2661  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
2662  a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2663 }
2664 template <size_t N>
2665 HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
2666  Vec128<uint16_t, N> b) {
2667  return Vec128<uint16_t, N>{
2668  wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2669 }
2670 template <size_t N>
2671 HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
2672  Vec128<uint32_t, N> b) {
2673  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2674 }
2675 template <size_t N>
2676 HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
2677  Vec128<uint64_t, N> b) {
2678  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2679 }
2680 
2681 template <size_t N>
2682 HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
2683  Vec128<int8_t, N> b) {
2684  return Vec128<int8_t, N>{wasm_i8x16_shuffle(
2685  a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2686 }
2687 template <size_t N>
2688 HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
2689  Vec128<int16_t, N> b) {
2690  return Vec128<int16_t, N>{
2691  wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2692 }
2693 template <size_t N>
2694 HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
2695  Vec128<int32_t, N> b) {
2696  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2697 }
2698 template <size_t N>
2699 HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
2700  Vec128<int64_t, N> b) {
2701  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2702 }
2703 
2704 template <size_t N>
2705 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
2706  Vec128<float, N> b) {
2707  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2708 }
2709 
2710 template <size_t N>
2711 HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
2712  Vec128<double, N> b) {
2713  return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2714 }
2715 
2716 // Additional overload for the optional tag.
2717 template <class V>
2718 HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2719  return InterleaveLower(a, b);
2720 }
2721 
2722 // ------------------------------ InterleaveUpper (UpperHalf)
2723 
2724 // All functions inside detail lack the required D parameter.
2725 namespace detail {
2726 
2727 template <size_t N>
2728 HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
2729  Vec128<uint8_t, N> b) {
2730  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2731  26, 11, 27, 12, 28, 13, 29, 14,
2732  30, 15, 31)};
2733 }
2734 template <size_t N>
2735 HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
2736  Vec128<uint16_t, N> b) {
2737  return Vec128<uint16_t, N>{
2738  wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2739 }
2740 template <size_t N>
2741 HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
2742  Vec128<uint32_t, N> b) {
2743  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2744 }
2745 template <size_t N>
2746 HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
2747  Vec128<uint64_t, N> b) {
2748  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2749 }
2750 
2751 template <size_t N>
2752 HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
2753  Vec128<int8_t, N> b) {
2754  return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2755  26, 11, 27, 12, 28, 13, 29, 14,
2756  30, 15, 31)};
2757 }
2758 template <size_t N>
2759 HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
2760  Vec128<int16_t, N> b) {
2761  return Vec128<int16_t, N>{
2762  wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2763 }
2764 template <size_t N>
2765 HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
2766  Vec128<int32_t, N> b) {
2767  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2768 }
2769 template <size_t N>
2770 HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
2771  Vec128<int64_t, N> b) {
2772  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2773 }
2774 
2775 template <size_t N>
2776 HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
2777  Vec128<float, N> b) {
2778  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2779 }
2780 
2781 template <size_t N>
2782 HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a,
2783  Vec128<double, N> b) {
2784  return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2785 }
2786 
2787 } // namespace detail
2788 
2789 // Full
2790 template <typename T, class V = Vec128<T>>
2791 HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
2792  return detail::InterleaveUpper(a, b);
2793 }
2794 
2795 // Partial
2796 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
2797 HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
2798  const Half<decltype(d)> d2;
2799  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
2800 }
2801 
2802 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2803 
2804 // Same as Interleave*, except that the return lanes are double-width integers;
2805 // this is necessary because the single-lane scalar cannot return two values.
2806 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2807 HWY_API VFromD<DW> ZipLower(V a, V b) {
2808  return BitCast(DW(), InterleaveLower(a, b));
2809 }
2810 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2811 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2812  return BitCast(dw, InterleaveLower(D(), a, b));
2813 }
2814 
2815 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2816 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2817  return BitCast(dw, InterleaveUpper(D(), a, b));
2818 }
2819 
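A minimal sketch of the double-width return described above: zipping two u8
vectors yields u16 lanes whose low byte comes from `a` and high byte from
`b` (WASM is little-endian). The function name is an illustrative
assumption.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec128<uint16_t> ZipBytes(hn::Vec128<uint8_t> a, hn::Vec128<uint8_t> b) {
      return hn::ZipLower(a, b);  // lane i = a[i] | (b[i] << 8), for i < 8
    }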
2820 // ================================================== COMBINE
2821 
2822 // ------------------------------ Combine (InterleaveLower)
2823 
2824 // N = N/2 + N/2 (upper half undefined)
2825 template <typename T, size_t N>
2826 HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
2827  Vec128<T, N / 2> lo_half) {
2828  const Half<decltype(d)> d2;
2829  const RebindToUnsigned<decltype(d2)> du2;
2830  // Treat half-width input as one lane, and expand to two lanes.
2831  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
2832  const VU lo{BitCast(du2, lo_half).raw};
2833  const VU hi{BitCast(du2, hi_half).raw};
2834  return BitCast(d, InterleaveLower(lo, hi));
2835 }
2836 
2837 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
2838 
2839 template <typename T, size_t N>
2840 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
2841  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
2842 }
2843 
2844 // ------------------------------ ConcatLowerLower
2845 
2846 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2847 template <typename T>
2848 HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
2849  const Vec128<T> lo) {
2850  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
2851 }
2852 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2853 HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2854  const Vec128<T, N> lo) {
2855  const Half<decltype(d)> d2;
2856  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
2857 }
2858 
2859 // ------------------------------ ConcatUpperUpper
2860 
2861 template <typename T>
2862 HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
2863  const Vec128<T> lo) {
2864  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
2865 }
2866 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2867 HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2868  const Vec128<T, N> lo) {
2869  const Half<decltype(d)> d2;
2870  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
2871 }
2872 
2873 // ------------------------------ ConcatLowerUpper
2874 
2875 template <typename T>
2876 HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
2877  const Vec128<T> lo) {
2878  return CombineShiftRightBytes<8>(d, hi, lo);
2879 }
2880 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2881 HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2882  const Vec128<T, N> lo) {
2883  const Half<decltype(d)> d2;
2884  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
2885 }
2886 
2887 // ------------------------------ ConcatUpperLower
2888 template <typename T, size_t N>
2889 HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2890  const Vec128<T, N> lo) {
2891  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
2892 }
2893 
2894 // ------------------------------ ConcatOdd
2895 
2896 // 8-bit full
2897 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2898 HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2899  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
2900  17, 19, 21, 23, 25, 27, 29, 31)};
2901 }
2902 
2903 // 8-bit x8
2904 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2905 HWY_API Vec128<T, 8> ConcatOdd(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
2906  Vec128<T, 8> lo) {
2907  // Don't care about upper half.
2908  return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
2909  23, 1, 3, 5, 7, 17, 19, 21, 23)};
2910 }
2911 
2912 // 8-bit x4
2913 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2914 HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2915  Vec128<T, 4> lo) {
2916  // Don't care about upper 3/4.
2917  return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
2918  19, 1, 3, 17, 19, 1, 3, 17, 19)};
2919 }
2920 
2921 // 16-bit full
2922 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2923 HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2924  return Vec128<T>{
2925  wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
2926 }
2927 
2928 // 16-bit x4
2929 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2930 HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2931  Vec128<T, 4> lo) {
2932  // Don't care about upper half.
2933  return Vec128<T, 4>{
2934  wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
2935 }
2936 
2937 // 32-bit full
2938 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2939 HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2940  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2941 }
2942 
2943 // Any T x2
2944 template <typename T>
2945 HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
2946  Vec128<T, 2> lo) {
2947  return InterleaveUpper(d, lo, hi);
2948 }
2949 
2950 // ------------------------------ ConcatEven (InterleaveLower)
2951 
2952 // 8-bit full
2953 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2954 HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2955  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
2956  16, 18, 20, 22, 24, 26, 28, 30)};
2957 }
2958 
2959 // 8-bit x8
2960 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2961 HWY_API Vec128<T, 8> ConcatEven(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
2962  Vec128<T, 8> lo) {
2963  // Don't care about upper half.
2964  return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20,
2965  22, 0, 2, 4, 6, 16, 18, 20, 22)};
2966 }
2967 
2968 // 8-bit x4
2969 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2970 HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2971  Vec128<T, 4> lo) {
2972  // Don't care about upper 3/4.
2973  return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
2974  18, 0, 2, 16, 18, 0, 2, 16, 18)};
2975 }
2976 
2977 // 16-bit full
2978 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2979 HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2980  return Vec128<T>{
2981  wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
2982 }
2983 
2984 // 16-bit x4
2985 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2986 HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2987  Vec128<T, 4> lo) {
2988  // Don't care about upper half.
2989  return Vec128<T, 4>{
2990  wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
2991 }
2992 
2993 // 32-bit full
2994 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2995 HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2996  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2997 }
2998 
2999 // Any T x2
3000 template <typename T>
3001 HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
3002  Vec128<T, 2> lo) {
3003  return InterleaveLower(d, lo, hi);
3004 }
3005 
3006 // ------------------------------ DupEven (InterleaveLower)
3007 
3008 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3009 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
3010  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
3011 }
3012 
3013 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3014 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
3015  return InterleaveLower(DFromV<decltype(v)>(), v, v);
3016 }
3017 
3018 // ------------------------------ DupOdd (InterleaveUpper)
3019 
3020 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3021 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3022  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
3023 }
3024 
3025 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3026 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
3027  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
3028 }
3029 
3030 // ------------------------------ OddEven
3031 
3032 namespace detail {
3033 
3034 template <typename T, size_t N>
3035 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
3036  const Vec128<T, N> b) {
3037  const DFromV<decltype(a)> d;
3038  const Repartition<uint8_t, decltype(d)> d8;
3039  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3040  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3041  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3042 }
3043 template <typename T, size_t N>
3044 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
3045  const Vec128<T, N> b) {
3046  return Vec128<T, N>{
3047  wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
3048 }
3049 template <typename T, size_t N>
3050 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
3051  const Vec128<T, N> b) {
3052  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3053 }
3054 template <typename T, size_t N>
3055 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
3056  const Vec128<T, N> b) {
3057  return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
3058 }
3059 
3060 } // namespace detail
3061 
3062 template <typename T, size_t N>
3063 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
3064  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3065 }
3066 template <size_t N>
3067 HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
3068  const Vec128<float, N> b) {
3069  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3070 }
3071 
3072 // ------------------------------ OddEvenBlocks
3073 template <typename T, size_t N>
3074 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
3075  return even;
3076 }
3077 
3078 // ------------------------------ SwapAdjacentBlocks
3079 
3080 template <typename T, size_t N>
3081 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
3082  return v;
3083 }
3084 
3085 // ------------------------------ ReverseBlocks
3086 
3087 // Single block: no change
3088 template <typename T>
3089 HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
3090  return v;
3091 }
3092 
3093 // ================================================== CONVERT
3094 
3095 // ------------------------------ Promotions (part w/ narrow lanes -> full)
3096 
3097 // Unsigned: zero-extend.
3098 template <size_t N>
3099 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
3100  const Vec128<uint8_t, N> v) {
3101  return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
3102 }
3103 template <size_t N>
3104 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
3105  const Vec128<uint8_t, N> v) {
3106  return Vec128<uint32_t, N>{
3107  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3108 }
3109 template <size_t N>
3110 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
3111  const Vec128<uint8_t, N> v) {
3112  return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
3113 }
3114 template <size_t N>
3115 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3116  const Vec128<uint8_t, N> v) {
3117  return Vec128<int32_t, N>{
3118  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3119 }
3120 template <size_t N>
3121 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
3122  const Vec128<uint16_t, N> v) {
3123  return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
3124 }
3125 template <size_t N>
3126 HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
3127  const Vec128<uint32_t, N> v) {
3128  return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
3129 }
3130 
3131 template <size_t N>
3132 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3133  const Vec128<uint16_t, N> v) {
3134  return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
3135 }
3136 
3137 // Signed: replicate sign bit.
3138 template <size_t N>
3139 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
3140  const Vec128<int8_t, N> v) {
3141  return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
3142 }
3143 template <size_t N>
3144 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3145  const Vec128<int8_t, N> v) {
3146  return Vec128<int32_t, N>{
3147  wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
3148 }
3149 template <size_t N>
3150 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3151  const Vec128<int16_t, N> v) {
3152  return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
3153 }
3154 template <size_t N>
3155 HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
3156  const Vec128<int32_t, N> v) {
3157  return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
3158 }
3159 
3160 template <size_t N>
3161 HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
3162  const Vec128<int32_t, N> v) {
3163  return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
3164 }
3165 
3166 template <size_t N>
3167 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
3168  const Vec128<float16_t, N> v) {
3169  const RebindToSigned<decltype(df32)> di32;
3170  const RebindToUnsigned<decltype(df32)> du32;
3171  // Expand to u32 so we can shift.
3172  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
3173  const auto sign = ShiftRight<15>(bits16);
3174  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
3175  const auto mantissa = bits16 & Set(du32, 0x3FF);
3176  const auto subnormal =
3177  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
3178  Set(df32, 1.0f / 16384 / 1024));
3179 
3180  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
3181  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
3182  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3183  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
3184  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3185 }
3186 
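A scalar sketch of the f16 -> f32 bit expansion above, valid for normal
(finite, non-subnormal) inputs only; subnormals take the scaled-multiply
path in the vector code. The function name is an illustrative assumption.

    #include <cstdint>
    #include <cstring>

    float F16ToF32Normal(uint16_t bits16) {
      const uint32_t sign = bits16 >> 15;
      const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
      const uint32_t mantissa = bits16 & 0x3FF;
      // Rebias the exponent from 15 to 127 and widen the mantissa to 23 bits.
      const uint32_t bits32 = (sign << 31) | ((biased_exp + 127 - 15) << 23) |
                              (mantissa << (23 - 10));
      float f;
      std::memcpy(&f, &bits32, sizeof(f));
      return f;
    }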
3187 template <size_t N>
3188 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
3189  const Vec128<bfloat16_t, N> v) {
3190  const Rebind<uint16_t, decltype(df32)> du16;
3191  const RebindToSigned<decltype(df32)> di32;
3192  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3193 }
3194 
3195 // ------------------------------ Demotions (full -> part w/ narrow lanes)
3196 
3197 template <size_t N>
3198 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
3199  const Vec128<int32_t, N> v) {
3200  return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
3201 }
3202 
3203 template <size_t N>
3204 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
3205  const Vec128<int32_t, N> v) {
3206  return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
3207 }
3208 
3209 template <size_t N>
3210 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3211  const Vec128<int32_t, N> v) {
3212  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3213  return Vec128<uint8_t, N>{
3214  wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3215 }
3216 
3217 template <size_t N>
3218 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3219  const Vec128<int16_t, N> v) {
3220  return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
3221 }
3222 
3223 template <size_t N>
3224 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
3225  const Vec128<int32_t, N> v) {
3226  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3227  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
3228 }
3229 
3230 template <size_t N>
3231 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
3232  const Vec128<int16_t, N> v) {
3233  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
3234 }
3235 
3236 template <size_t N>
3237 HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* di */,
3238  const Vec128<double, N> v) {
3239  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
3240 }
3241 
3242 template <size_t N>
3243 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
3244  const Vec128<float, N> v) {
3245  const RebindToUnsigned<decltype(df16)> du16;
3246  const Rebind<uint32_t, decltype(du16)> du;
3247  const RebindToSigned<decltype(du)> di;
3248  const auto bits32 = BitCast(du, v);
3249  const auto sign = ShiftRight<31>(bits32);
3250  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3251  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3252 
3253  const auto k15 = Set(di, 15);
3254  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3255  const auto is_tiny = exp < Set(di, -24);
3256 
3257  const auto is_subnormal = exp < Set(di, -14);
3258  const auto biased_exp16 =
3259  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3260  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3261  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3262  (mantissa32 >> (Set(du, 13) + sub_exp));
3263  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3264  ShiftRight<13>(mantissa32)); // <1024
3265 
3266  const auto sign16 = ShiftLeft<15>(sign);
3267  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3268  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3269  return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
3270 }
3271 
3272 template <size_t N>
3273 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
3274  const Vec128<float, N> v) {
3275  const Rebind<int32_t, decltype(dbf16)> di32;
3276  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3277  const Rebind<uint16_t, decltype(dbf16)> du16;
3278  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3279  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3280 }
3281 
3282 template <size_t N>
3283 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
3284  Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
3285  const RebindToUnsigned<decltype(dbf16)> du16;
3286  const Repartition<uint32_t, decltype(dbf16)> du32;
3287  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
3288  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3289 }
3290 
3291 // For already range-limited input [0, 255].
3292 template <size_t N>
3293 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
3294  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3295  return Vec128<uint8_t, N>{
3296  wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3297 }
3298 
3299 // ------------------------------ Convert i32 <=> f32 (Round)
3300 
3301 template <size_t N>
3302 HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
3303  const Vec128<int32_t, N> v) {
3304  return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
3305 }
3306 // Truncates (rounds toward zero).
3307 template <size_t N>
3308 HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
3309  const Vec128<float, N> v) {
3310  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
3311 }
3312 
3313 template <size_t N>
3314 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
3315  return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
3316 }
3317 
3318 // ================================================== MISC
3319 
3320 // ------------------------------ SumsOf8 (ShiftRight, Add)
3321 template <size_t N>
3322 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
3323  const DFromV<decltype(v)> du8;
3324  const RepartitionToWide<decltype(du8)> du16;
3325  const RepartitionToWide<decltype(du16)> du32;
3326  const RepartitionToWide<decltype(du32)> du64;
3327  using VU16 = VFromD<decltype(du16)>;
3328 
3329  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
3330  const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
3331  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
3332 
3333  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
3334  BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
3335  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
3336  Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
3337  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
3338  BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
3339  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
3340  Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
3341  return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
3342 }
3343 
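A scalar reference for the shift/add reduction above, handy for testing:
one output lane per group of eight input bytes. The function name is an
illustrative assumption.

    #include <cstddef>
    #include <cstdint>

    uint64_t SumsOf8Scalar(const uint8_t* bytes, size_t group) {  // group < 2
      uint64_t sum = 0;
      for (size_t i = 0; i < 8; ++i) sum += bytes[group * 8 + i];
      return sum;  // equals lane `group` of SumsOf8
    }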
3344 // ------------------------------ LoadMaskBits (TestBit)
3345 
3346 namespace detail {
3347 
3348 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3349 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3350  const RebindToUnsigned<decltype(d)> du;
3351  // Easier than Set(), which would require an >8-bit type, which would not
3352  // compile for T=uint8_t, N=1.
3353  const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
3354 
3355  // Replicate bytes 8x such that each byte contains the bit that governs it.
3356  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
3357  1, 1, 1, 1, 1, 1, 1, 1};
3358  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
3359 
3360  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
3361  1, 2, 4, 8, 16, 32, 64, 128};
3362  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
3363 }
3364 
3365 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3366 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3367  const RebindToUnsigned<decltype(d)> du;
3368  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3369  return RebindMask(
3370  d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
3371 }
3372 
3373 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3374 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3375  const RebindToUnsigned<decltype(d)> du;
3376  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
3377  return RebindMask(
3378  d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
3379 }
3380 
3381 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3382 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3383  const RebindToUnsigned<decltype(d)> du;
3384  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
3385  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
3386 }
3387 
3388 } // namespace detail
3389 
3390 // `p` points to at least 8 readable bytes, not all of which need be valid.
3391 template <typename T, size_t N, HWY_IF_LE128(T, N)>
3392 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
3393  const uint8_t* HWY_RESTRICT bits) {
3394  uint64_t mask_bits = 0;
3395  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
3396  return detail::LoadMaskBits(d, mask_bits);
3397 }
3398 
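A minimal usage sketch with illustrative bit values: each input bit governs
one lane, least-significant bit first. The function name is an illustrative
assumption.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Mask128<int32_t, 4> FirstAndLastLane() {
      const hn::Full128<int32_t> d;
      const uint8_t bits[8] = {0x09, 0, 0, 0, 0, 0, 0, 0};  // lanes 0 and 3
      return hn::LoadMaskBits(d, bits);
    }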
3399 // ------------------------------ Mask
3400 
3401 namespace detail {
3402 
3403 // Full
3404 template <typename T>
3405 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3406  const Mask128<T> mask) {
3407  alignas(16) uint64_t lanes[2];
3408  wasm_v128_store(lanes, mask.raw);
3409 
3410  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3411  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
3412  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
3413  return (hi + lo);
3414 }
3415 
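The kMagic multiply gathers one bit per 0x00/0xFF byte into the top byte of
the 64-bit product. A scalar check of the same computation (assumes every
input byte is 0x00 or 0xFF, as comparisons produce); the function name is
an illustrative assumption.

    #include <cstdint>

    uint8_t PackMaskBytes(uint64_t bytes) {  // 8 lanes, each 0x00 or 0xFF
      constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
      return static_cast<uint8_t>((bytes * kMagic) >> 56);
    }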
3416 // 64-bit
3417 template <typename T>
3418 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3419  const Mask128<T, 8> mask) {
3420  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3421  return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
3422  kMagic) >>
3423  56;
3424 }
3425 
3426 // 32-bit or less: need masking
3427 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3428 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3429  const Mask128<T, N> mask) {
3430  uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
3431  // Clear potentially undefined bytes.
3432  bytes &= (1ULL << (N * 8)) - 1;
3433  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3434  return (bytes * kMagic) >> 56;
3435 }
3436 
3437 template <typename T, size_t N>
3438 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
3439  const Mask128<T, N> mask) {
3440  // Remove useless lower half of each u16 while preserving the sign bit.
3441  const __i16x8 zero = wasm_i16x8_splat(0);
3442  const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
3443  return BitsFromMask(hwy::SizeTag<1>(), mask8);
3444 }
3445 
3446 template <typename T, size_t N>
3447 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
3448  const Mask128<T, N> mask) {
3449  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
3450  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
3451  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
3452  alignas(16) uint32_t lanes[4];
3453  wasm_v128_store(lanes, sliced_mask);
3454  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
3455 }
3456 
3457 template <typename T, size_t N>
3458 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
3459  const Mask128<T, N> mask) {
3460  const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
3461  const __i64x2 slice = wasm_i64x2_make(1, 2);
3462  const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
3463  alignas(16) uint64_t lanes[2];
3464  wasm_v128_store(lanes, sliced_mask);
3465  return lanes[0] | lanes[1];
3466 }
3467 
3468 // Returns only the lowest N bits of the BitsFromMask result.
3469 template <typename T, size_t N>
3470 constexpr uint64_t OnlyActive(uint64_t bits) {
3471  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
3472 }
3473 
3474 // Returns 0xFF for bytes with index >= N, otherwise 0.
3475 template <size_t N>
3476 constexpr __i8x16 BytesAbove() {
3477  return
3478  (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
3479  : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
3480  : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
3481  : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
3482  : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
3483  : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
3484  : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
3485  : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
3486  : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
3487  : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3488  -1, -1, -1, -1, -1)
3489  : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3490  -1, -1, -1, -1)
3491  : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
3492  -1, -1, -1, -1)
3493  : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
3494  -1, -1, -1)
3495  : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
3496  -1, -1, -1)
3497  : (N == 11)
3498  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
3499  : (N == 13)
3500  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
3501  : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
3502 }
3503 
3504 template <typename T, size_t N>
3505 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
3506  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
3507 }
3508 
3509 template <typename T>
3510 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
3511  return PopCount(BitsFromMask(tag, m));
3512 }
3513 
3514 template <typename T>
3515 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
3516  return PopCount(BitsFromMask(tag, m));
3517 }
3518 
3519 template <typename T>
3520 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
3521  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
3522  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
3523  alignas(16) uint64_t lanes[2];
3524  wasm_v128_store(lanes, shifted_bits);
3525  return PopCount(lanes[0] | lanes[1]);
3526 }
3527 
3528 template <typename T>
3529 HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
3530  alignas(16) int64_t lanes[2];
3531  wasm_v128_store(lanes, m.raw);
3532  return static_cast<size_t>(-(lanes[0] + lanes[1]));
3533 }
3534 
3535 } // namespace detail
3536 
3537 // `p` points to at least 8 writable bytes.
3538 template <typename T, size_t N>
3539 HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
3540  const Mask128<T, N> mask, uint8_t* bits) {
3541  const uint64_t mask_bits = detail::BitsFromMask(mask);
3542  const size_t kNumBytes = (N + 7) / 8;
3543  CopyBytes<kNumBytes>(&mask_bits, bits);
3544  return kNumBytes;
3545 }
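// Round-trip sketch (editorial addition, guarded out): StoreMaskBits inverts
// LoadMaskBits for the low N bits.
#if 0
inline void MaskBitsRoundTrip() {
  const Full128<uint16_t> d;  // 8 lanes => exactly one packed byte
  const uint8_t in = 0xA5;
  uint8_t out = 0;
  const size_t written = StoreMaskBits(d, LoadMaskBits(d, &in), &out);
  HWY_ASSERT(written == 1 && out == 0xA5);
}
#endif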
3546 
3547 template <typename T, size_t N>
3548 HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
3549  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
3550 }
3551 
3552 // Partial vector
3553 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3554 HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
3555  // Ensure all undefined bytes are 0.
3556  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3557  return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
3558 }
3559 
3560 // Full vector
3561 template <typename T>
3562 HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
3563 #if 0
3564  // Casting followed by wasm_i8x16_any_true results in wasm error:
3565  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
3566  const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
3567  return !wasm_i8x16_any_true(v8.raw);
3568 #else
3569  (void)d;
3570  return (wasm_i64x2_extract_lane(m.raw, 0) |
3571  wasm_i64x2_extract_lane(m.raw, 1)) == 0;
3572 #endif
3573 }
3574 
3575 // Full vector
3576 namespace detail {
3577 template <typename T>
3578 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
3579  return wasm_i8x16_all_true(m.raw);
3580 }
3581 template <typename T>
3582 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
3583  return wasm_i16x8_all_true(m.raw);
3584 }
3585 template <typename T>
3586 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
3587  return wasm_i32x4_all_true(m.raw);
3588 }
3589 template <typename T>
3590 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
3591  return wasm_i64x2_all_true(m.raw);
3592 }
3593 
3594 } // namespace detail
3595 
3596 template <typename T, size_t N>
3597 HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
3598  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
3599 }
3600 
3601 // Partial vectors
3602 
3603 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3604 HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
3605  // Ensure all undefined bytes are 0.
3606  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3607  return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
3608 }
3609 
3610 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3611 HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
3612  // Ensure all undefined bytes are FF.
3613  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3614  return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
3615 }
3616 
3617 template <typename T, size_t N>
3618 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
3619  const Mask128<T, N> mask) {
3620  const uint64_t bits = detail::BitsFromMask(mask);
3621  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
3622 }
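// Illustrative usage (editorial addition, guarded out): FindFirstTrue returns
// the index of the first active lane, or -1 if none is active.
#if 0
inline void FindFirstTrueExample() {
  const Full128<int32_t> d;
  const auto v = Iota(d, 0);                          // {0, 1, 2, 3}
  HWY_ASSERT(FindFirstTrue(d, v > Set(d, 1)) == 2);   // mask {0, 0, 1, 1}
  HWY_ASSERT(FindFirstTrue(d, v > Set(d, 9)) == -1);  // no active lanes
}
#endif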
3623 
3624 // ------------------------------ Compress
3625 
3626 namespace detail {
3627 
3628 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3629 HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
3630  HWY_DASSERT(mask_bits < 256);
3631  const Simd<T, N, 0> d;
3632  const Rebind<uint8_t, decltype(d)> d8;
3633  const Simd<uint16_t, N, 0> du;
3634 
3635  // We need byte indices for TableLookupBytes (one vector's worth for each of
3636  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
3637  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
3638  // with the doubling baked into the table. Unpacking nibbles is likely more
3639  // costly than the higher cache footprint from storing bytes.
3640  alignas(16) constexpr uint8_t table[256 * 8] = {
3641  // PrintCompress16x8Tables
3642  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3643  2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3644  4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
3645  2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3646  6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
3647  2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
3648  4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
3649  2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3650  8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
3651  2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
3652  4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
3653  2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
3654  6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
3655  2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
3656  4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
3657  2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3658  10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
3659  2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
3660  4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
3661  2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
3662  6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
3663  2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
3664  4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
3665  2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
3666  8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
3667  2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
3668  4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
3669  2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
3670  6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
3671  2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
3672  4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
3673  2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3674  12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
3675  2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
3676  4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
3677  2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
3678  6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
3679  2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
3680  4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
3681  2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
3682  8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
3683  2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
3684  4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
3685  2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
3686  6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
3687  2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
3688  4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
3689  2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
3690  10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
3691  2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
3692  4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
3693  2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
3694  6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
3695  2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
3696  4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
3697  2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
3698  8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
3699  2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
3700  4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
3701  2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
3702  6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
3703  2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
3704  4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
3705  2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3706  14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
3707  2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
3708  4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
3709  2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
3710  6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
3711  2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
3712  4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
3713  2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
3714  8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
3715  2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
3716  4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
3717  2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
3718  6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
3719  2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
3720  4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
3721  2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
3722  10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
3723  2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
3724  4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
3725  2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
3726  6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
3727  2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
3728  4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
3729  2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
3730  8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
3731  2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
3732  4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
3733  2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
3734  6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
3735  2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
3736  4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
3737  2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
3738  12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
3739  2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
3740  4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
3741  2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
3742  6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
3743  2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
3744  4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
3745  2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
3746  8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
3747  2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
3748  4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
3749  2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
3750  6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
3751  2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
3752  4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
3753  2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
3754  10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
3755  2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
3756  4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
3757  2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
3758  6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
3759  2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
3760  4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
3761  2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
3762  8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
3763  2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
3764  4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
3765  2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
3766  6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
3767  2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
3768  4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
3769  2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
3770 
3771  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
3772  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
3773  return BitCast(d, pairs + Set(du, 0x0100));
3774 }
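// How the final expansion works (editorial sketch, guarded out): each table
// byte k is an even byte offset (2 * lane). ZipLower duplicates it into both
// bytes of a u16, and adding 0x0100 increments only the upper byte, yielding
// the byte pair {k, k+1} that TableLookupBytes needs for one u16 lane.
#if 0
inline void IdxPairExpansionCheck() {
  const uint16_t k = 6;  // byte offset of lane 3 (= 2 * 3)
  const uint16_t pair = static_cast<uint16_t>((k << 8) | k) + 0x0100;
  HWY_ASSERT((pair & 0xFF) == 6 && (pair >> 8) == 7);  // bytes {6, 7}
}
#endif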
3775 
3776 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3777 HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
3778  HWY_DASSERT(mask_bits < 256);
3779  const Simd<T, N, 0> d;
3780  const Rebind<uint8_t, decltype(d)> d8;
3781  const Simd<uint16_t, N, 0> du;
3782 
3783  // We need byte indices for TableLookupBytes (one vector's worth for each of
3784  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
3785  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
3786  // with the doubling baked into the table. Unpacking nibbles is likely more
3787  // costly than the higher cache footprint from storing bytes.
3788  alignas(16) constexpr uint8_t table[256 * 8] = {
3789  // PrintCompressNot16x8Tables
3790  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
3791  0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
3792  0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
3793  0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
3794  0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
3795  0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
3796  0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
3797  0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
3798  0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
3799  0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
3800  0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
3801  0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
3802  0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
3803  0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
3804  0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
3805  0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
3806  0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
3807  0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
3808  0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
3809  0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
3810  0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
3811  0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
3812  0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
3813  0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
3814  0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
3815  0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
3816  0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
3817  0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
3818  0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
3819  0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
3820  0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
3821  0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
3822  0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
3823  0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
3824  0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
3825  0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
3826  0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
3827  0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
3828  0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
3829  0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
3830  0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
3831  0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
3832  0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
3833  0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
3834  0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
3835  0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
3836  0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
3837  0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
3838  0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
3839  0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
3840  0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
3841  0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
3842  0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
3843  0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
3844  0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
3845  0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
3846  0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
3847  0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
3848  0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
3849  0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
3850  0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
3851  0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
3852  0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
3853  0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
3854  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
3855  0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
3856  0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
3857  0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
3858  0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
3859  0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
3860  0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
3861  0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
3862  0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
3863  0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
3864  0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
3865  0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
3866  0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
3867  0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
3868  0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
3869  0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
3870  0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
3871  0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
3872  0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
3873  0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
3874  0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
3875  0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
3876  0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
3877  0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
3878  0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
3879  0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
3880  0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
3881  0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
3882  0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
3883  0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
3884  0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
3885  0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
3886  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
3887  0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
3888  0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
3889  0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
3890  0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
3891  0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
3892  0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
3893  0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
3894  0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
3895  0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
3896  0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
3897  0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
3898  0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
3899  0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
3900  0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
3901  0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
3902  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
3903  0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
3904  0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
3905  0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
3906  0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
3907  0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
3908  0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
3909  0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
3910  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
3911  0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
3912  0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
3913  0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
3914  0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
3915  0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
3916  0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
3917  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
3918 
3919  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
3920  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
3921  return BitCast(d, pairs + Set(du, 0x0100));
3922 }
3923 
3924 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3925 HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
3926  HWY_DASSERT(mask_bits < 16);
3927 
3928  // There are only 4 lanes, so we can afford to load the index vector directly.
3929  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
3930  // PrintCompress32x4Tables
3931  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3932  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3933  4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
3934  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3935  8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
3936  0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
3937  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
3938  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3939  12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
3940  0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
3941  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
3942  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
3943  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
3944  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
3945  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
3946  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3947  const Simd<T, N, 0> d;
3948  const Repartition<uint8_t, decltype(d)> d8;
3949  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
3950 }
3951 
3952 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3953 HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
3954  HWY_DASSERT(mask_bits < 16);
3955 
3956  // There are only 4 lanes, so we can afford to load the index vector directly.
3957  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
3958  // PrintCompressNot32x4Tables
3959  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
3960  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
3961  8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
3962  14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
3963  12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
3964  2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
3965  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
3966  10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3967  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
3968  2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
3969  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
3970  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
3971  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
3972  10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
3973  12, 13, 14, 15};
3974  const Simd<T, N, 0> d;
3975  const Repartition<uint8_t, decltype(d)> d8;
3976  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
3977 }
3978 
3979 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3980 HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
3981  HWY_DASSERT(mask_bits < 4);
3982 
3983  // There are only 2 lanes, so we can afford to load the index vector directly.
3984  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
3985  // PrintCompress64x2Tables
3986  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3987  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3988  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3989  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3990 
3991  const Simd<T, N, 0> d;
3992  const Repartition<uint8_t, decltype(d)> d8;
3993  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
3994 }
3995 
3996 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3997 HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
3998  HWY_DASSERT(mask_bits < 4);
3999 
4000  // There are only 2 lanes, so we can afford to load the index vector directly.
4001  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
4002  // PrintCompressNot64x2Tables
4003  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4004  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4005  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4006  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4007 
4008  const Simd<T, N, 0> d;
4009  const Repartition<uint8_t, decltype(d)> d8;
4010  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
4011 }
4012 
4013 // Helper functions called by both Compress and CompressStore - avoids a
4014 // redundant BitsFromMask in the latter.
4015 
4016 template <typename T, size_t N>
4017 HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
4018  const auto idx = detail::IdxFromBits<T, N>(mask_bits);
4019  const DFromV<decltype(v)> d;
4020  const RebindToSigned<decltype(d)> di;
4021  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4022 }
4023 
4024 template <typename T, size_t N>
4025 HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
4026  const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
4027  const DFromV<decltype(v)> d;
4028  const RebindToSigned<decltype(d)> di;
4029  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4030 }
4031 
4032 } // namespace detail
4033 
4034 template <typename T>
4035 struct CompressIsPartition {
4036  enum { value = 1 };
4037 };
4038 
4039 // Single lane: no-op
4040 template <typename T>
4041 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
4042  return v;
4043 }
4044 
4045 // Two lanes: conditional swap
4046 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4047 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
4048  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
4049  const Full128<T> d;
4050  const Vec128<T> m = VecFromMask(d, mask);
4051  const Vec128<T> maskL = DupEven(m);
4052  const Vec128<T> maskH = DupOdd(m);
4053  const Vec128<T> swap = AndNot(maskL, maskH);
4054  return IfVecThenElse(swap, Shuffle01(v), v);
4055 }
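// Scalar model of the swap predicate (editorial sketch, guarded out): of the
// four mask combinations, only {mask[0]=0, mask[1]=1} requires the swap that
// moves the single active lane down into lane 0.
#if 0
inline void TwoLaneCompressModel() {
  const int v[2] = {10, 20};
  for (int m0 = 0; m0 < 2; ++m0) {
    for (int m1 = 0; m1 < 2; ++m1) {
      const bool swap = (m1 != 0 && m0 == 0);   // AndNot(maskL, maskH)
      const int out0 = swap ? v[1] : v[0];
      if (m0) HWY_ASSERT(out0 == v[0]);         // lane 0 active: keep
      if (!m0 && m1) HWY_ASSERT(out0 == v[1]);  // only lane 1: moved down
    }
  }
}
#endif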
4056 
4057 // General case
4058 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4059 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4060  return detail::Compress(v, detail::BitsFromMask(mask));
4061 }
4062 
4063 // Single lane: no-op
4064 template <typename T>
4065 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
4066  return v;
4067 }
4068 
4069 // Two lanes: conditional swap
4070 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4071 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
4072  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
4073  const Full128<T> d;
4074  const Vec128<T> m = VecFromMask(d, mask);
4075  const Vec128<T> maskL = DupEven(m);
4076  const Vec128<T> maskH = DupOdd(m);
4077  const Vec128<T> swap = AndNot(maskH, maskL);
4078  return IfVecThenElse(swap, Shuffle01(v), v);
4079 }
4080 
4081 // General case
4082 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4083 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
4084  // For partial vectors, we cannot pull the Not() into the table because
4085  // BitsFromMask clears the upper bits.
4086  if (N < 16 / sizeof(T)) {
4087  return detail::Compress(v, detail::BitsFromMask(Not(mask)));
4088  }
4089  return detail::CompressNot(v, detail::BitsFromMask(mask));
4090 }
4091 // ------------------------------ CompressBlocksNot
4092 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
4093  Mask128<uint64_t> /* m */) {
4094  return v;
4095 }
4096 
4097 // ------------------------------ CompressBits
4098 
4099 template <typename T, size_t N>
4100 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
4101  const uint8_t* HWY_RESTRICT bits) {
4102  uint64_t mask_bits = 0;
4103  constexpr size_t kNumBytes = (N + 7) / 8;
4104  CopyBytes<kNumBytes>(bits, &mask_bits);
4105  if (N < 8) {
4106  mask_bits &= (1ull << N) - 1;
4107  }
4108 
4109  return detail::Compress(v, mask_bits);
4110 }
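// Illustrative usage (editorial addition, guarded out): keep lanes 1 and 3 of
// {1, 2, 3, 4}. Because CompressIsPartition is set, the rejected lanes follow
// the kept ones, so the result is the permutation {2, 4, 1, 3}.
#if 0
inline void CompressBitsExample() {
  const Full128<uint32_t> d;
  const uint8_t bits = 0b1010;
  const auto compressed = CompressBits(Iota(d, 1), &bits);
  (void)compressed;  // == {2, 4, 1, 3}
}
#endif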
4111 
4112 // ------------------------------ CompressStore
4113 template <typename T, size_t N>
4114 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
4115  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
4116  const uint64_t mask_bits = detail::BitsFromMask(mask);
4117  const auto c = detail::Compress(v, mask_bits);
4118  StoreU(c, d, unaligned);
4119  return PopCount(mask_bits);
4120 }
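// Usage sketch (editorial addition, guarded out): a full vector is stored
// (the trailing lanes are the rejected ones), but only the returned count of
// leading lanes is meaningful. `out4` must have room for all 4 lanes.
#if 0
inline size_t CompressStoreExample(float* HWY_RESTRICT out4) {
  const Full128<float> d;
  const auto v = Iota(d, 0.0f);                        // {0, 1, 2, 3}
  return CompressStore(v, v < Set(d, 2.0f), d, out4);  // returns 2
}
#endif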
4121 
4122 // ------------------------------ CompressBlendedStore
4123 template <typename T, size_t N>
4124 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
4125  Simd<T, N, 0> d,
4126  T* HWY_RESTRICT unaligned) {
4127  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
4128  using TU = TFromD<decltype(du)>;
4129  const uint64_t mask_bits = detail::BitsFromMask(m);
4130  const size_t count = PopCount(mask_bits);
4131  const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
4132  const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
4133  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
4134  return count;
4135 }
4136 
4137 // ------------------------------ CompressBitsStore
4138 
4139 template <typename T, size_t N>
4140 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
4141  const uint8_t* HWY_RESTRICT bits,
4142  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
4143  uint64_t mask_bits = 0;
4144  constexpr size_t kNumBytes = (N + 7) / 8;
4145  CopyBytes<kNumBytes>(bits, &mask_bits);
4146  if (N < 8) {
4147  mask_bits &= (1ull << N) - 1;
4148  }
4149 
4150  const auto c = detail::Compress(v, mask_bits);
4151  StoreU(c, d, unaligned);
4152  return PopCount(mask_bits);
4153 }
4154 
4155 // ------------------------------ StoreInterleaved2/3/4
4156 
4157 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
4158 // generic_ops-inl.h.
4159 
4160 // ------------------------------ MulEven/Odd (Load)
4161 
4162 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
4163  const Vec128<uint64_t> b) {
4164  alignas(16) uint64_t mul[2];
4165  mul[0] =
4166  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
4167  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
4168  return Load(Full128<uint64_t>(), mul);
4169 }
4170 
4171 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
4172  const Vec128<uint64_t> b) {
4173  alignas(16) uint64_t mul[2];
4174  mul[0] =
4175  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
4176  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
4177  return Load(Full128<uint64_t>(), mul);
4178 }
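// Illustrative check (editorial addition, guarded out): MulEven on u64 forms
// the full 128-bit product of lane 0, returned as {low, high} u64 halves.
#if 0
inline void MulEven64Example() {
  const Full128<uint64_t> d;
  const auto p = MulEven(Set(d, ~0ull), Set(d, 2ull));
  // (2^64 - 1) * 2 = 2^65 - 2: lane 0 (low) = 0xFF...FE, lane 1 (high) = 1.
  HWY_ASSERT(GetLane(p) == ~0ull - 1);
}
#endif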
4179 
4180 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4181 
4182 template <size_t N>
4183 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
4184  Vec128<bfloat16_t, 2 * N> a,
4185  Vec128<bfloat16_t, 2 * N> b,
4186  const Vec128<float, N> sum0,
4187  Vec128<float, N>& sum1) {
4188  const Repartition<uint16_t, decltype(df32)> du16;
4189  const RebindToUnsigned<decltype(df32)> du32;
4190  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
4191  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
4192  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4193  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
4194  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4195  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
4196  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
4197 }
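// Why zipping with zero widens bfloat16 (editorial sketch, guarded out): a
// bf16 holds the upper 16 bits of the corresponding binary32, so placing it
// above 16 zero bits reconstitutes the float exactly.
#if 0
inline void Bf16WidenModel() {
  const float f = 1.5f;  // 0x3FC00000, exactly representable in bf16
  uint32_t f_bits;
  CopyBytes<4>(&f, &f_bits);
  const uint16_t bf = static_cast<uint16_t>(f_bits >> 16);   // 0x3FC0
  const uint32_t widened = static_cast<uint32_t>(bf) << 16;  // "zip" with 0
  float back;
  CopyBytes<4>(&widened, &back);
  HWY_ASSERT(back == 1.5f);
}
#endif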
4198 
4199 // ------------------------------ Reductions
4200 
4201 namespace detail {
4202 
4203 // N=1 for any T: no-op
4204 template <typename T>
4205 HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4206  const Vec128<T, 1> v) {
4207  return v;
4208 }
4209 template <typename T>
4210 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4211  const Vec128<T, 1> v) {
4212  return v;
4213 }
4214 template <typename T>
4215 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4216  const Vec128<T, 1> v) {
4217  return v;
4218 }
4219 
4220 // u32/i32/f32:
4221 
4222 // N=2
4223 template <typename T>
4224 HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
4225  const Vec128<T, 2> v10) {
4226  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
4227 }
4228 template <typename T>
4229 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
4230  const Vec128<T, 2> v10) {
4231  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
4232 }
4233 template <typename T>
4234 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4235  const Vec128<T, 2> v10) {
4236  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
4237 }
4238 
4239 // N=4 (full)
4240 template <typename T>
4241 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
4242  const Vec128<T> v3210) {
4243  const Vec128<T> v1032 = Shuffle1032(v3210);
4244  const Vec128<T> v31_20_31_20 = v3210 + v1032;
4245  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4246  return v20_31_20_31 + v31_20_31_20;
4247 }
4248 template <typename T>
4249 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
4250  const Vec128<T> v3210) {
4251  const Vec128<T> v1032 = Shuffle1032(v3210);
4252  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
4253  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4254  return Min(v20_31_20_31, v31_20_31_20);
4255 }
4256 template <typename T>
4257 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4258  const Vec128<T> v3210) {
4259  const Vec128<T> v1032 = Shuffle1032(v3210);
4260  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
4261  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4262  return Max(v20_31_20_31, v31_20_31_20);
4263 }
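// Scalar model of the two-step reduction above (editorial sketch, guarded
// out): step 1 pairs lane i with lane i^2 (Shuffle1032), step 2 with the
// rotation by one lane (Shuffle0321); afterwards every lane holds the total.
#if 0
inline void Reduce4Model() {
  const int v[4] = {1, 2, 3, 4};
  int s1[4], s2[4];
  for (int i = 0; i < 4; ++i) s1[i] = v[i] + v[i ^ 2];
  for (int i = 0; i < 4; ++i) s2[i] = s1[i] + s1[(i + 1) & 3];
  for (int i = 0; i < 4; ++i) HWY_ASSERT(s2[i] == 1 + 2 + 3 + 4);
}
#endif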
4264 
4265 // u64/i64/f64:
4266 
4267 // N=2 (full)
4268 template <typename T>
4269 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
4270  const Vec128<T> v10) {
4271  const Vec128<T> v01 = Shuffle01(v10);
4272  return v10 + v01;
4273 }
4274 template <typename T>
4275 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
4276  const Vec128<T> v10) {
4277  const Vec128<T> v01 = Shuffle01(v10);
4278  return Min(v10, v01);
4279 }
4280 template <typename T>
4281 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
4282  const Vec128<T> v10) {
4283  const Vec128<T> v01 = Shuffle01(v10);
4284  return Max(v10, v01);
4285 }
4286 
4287 // u16/i16
4288 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4289 HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4290  const DFromV<decltype(v)> d;
4291  const Repartition<int32_t, decltype(d)> d32;
4292  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4293  const auto odd = ShiftRight<16>(BitCast(d32, v));
4294  const auto min = MinOfLanes(d32, Min(even, odd));
4295  // Also broadcast into odd lanes.
4296  return BitCast(d, Or(min, ShiftLeft<16>(min)));
4297 }
4298 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4299 HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4300  const DFromV<decltype(v)> d;
4301  const Repartition<int32_t, decltype(d)> d32;
4302  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4303  const auto odd = ShiftRight<16>(BitCast(d32, v));
4304  const auto min = MaxOfLanes(d32, Max(even, odd));
4305  // Also broadcast into odd lanes.
4306  return BitCast(d, Or(min, ShiftLeft<16>(min)));
4307 }
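// Scalar model of the even/odd split above (editorial sketch, guarded out):
// each 32-bit lane holds two u16 values; masking keeps the even (lower) one
// and shifting extracts the odd (upper) one, so the 32-bit reduction can be
// reused. The final Or broadcasts the result back into both halves.
#if 0
inline void SplitU16Model() {
  const uint32_t lane = 0x00070003u;  // u16 values: even = 3, odd = 7
  HWY_ASSERT((lane & 0xFFFF) == 3 && (lane >> 16) == 7);
}
#endif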
4308 
4309 } // namespace detail
4310 
4311 // Supported for u/i/f 32/64 (plus 16-bit Min/MaxOfLanes). Returns the same value in each lane.
4312 template <typename T, size_t N>
4313 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4314  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4315 }
4316 template <typename T, size_t N>
4317 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4318  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4319 }
4320 template <typename T, size_t N>
4321 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4322  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4323 }
4324 
4325 // ------------------------------ Lt128
4326 
4327 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4328 HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
4329  Vec128<T, N> b) {
4330  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
4331  // Truth table of Eq and Lt for Hi and Lo u64.
4332  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
4333  // =H =L cH cL | out = cH | (=H & cL)
4334  // 0 0 0 0 | 0
4335  // 0 0 0 1 | 0
4336  // 0 0 1 0 | 1
4337  // 0 0 1 1 | 1
4338  // 0 1 0 0 | 0
4339  // 0 1 0 1 | 0
4340  // 0 1 1 0 | 1
4341  // 1 0 0 0 | 0
4342  // 1 0 0 1 | 1
4343  // 1 1 0 0 | 0
4344  const Mask128<T, N> eqHL = Eq(a, b);
4345  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
4346  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
4347  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
4348  // comparison result leftwards requires only 4. IfThenElse compiles to the
4349  // same code as OrAnd().
4350  const Vec128<T, N> ltLx = DupEven(ltHL);
4351  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
4352  return MaskFromVec(DupOdd(outHx));
4353 }
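// Worked example (editorial sketch, guarded out) with a and b as {lo, hi} u64
// lane pairs: a = 2^64 (lo=0, hi=1) vs. b = 1 (lo=1, hi=0). The high lanes
// differ, so eqHL is false there and the high-lane Lt result (1 < 0: false)
// decides: a is not less than b.
#if 0
inline void Lt128Model() {
  const uint64_t aL = 0, aH = 1;  // a = 2^64
  const uint64_t bL = 1, bH = 0;  // b = 1
  const bool lt = (aH == bH) ? (aL < bL) : (aH < bH);  // cH | (=H & cL)
  HWY_ASSERT(!lt);
}
#endif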
4354 
4355 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4356 HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
4357  Vec128<T, N> b) {
4358  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
4359  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
4360 }
4361 
4362 // ------------------------------ Min128, Max128 (Lt128)
4363 
4364 // Without a native OddEven, it seems infeasible to go faster than Lt128.
4365 template <class D>
4366 HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
4367  return IfThenElse(Lt128(d, a, b), a, b);
4368 }
4369 
4370 template <class D>
4371 HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
4372  return IfThenElse(Lt128(d, b, a), a, b);
4373 }
4374 
4375 template <class D>
4376 HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
4377  return IfThenElse(Lt128Upper(d, a, b), a, b);
4378 }
4379 
4380 template <class D>
4381 HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
4382  return IfThenElse(Lt128Upper(d, b, a), a, b);
4383 }
4384 
4385 // ================================================== Operator wrapper
4386 
4387 template <class V>
4388 HWY_API V Add(V a, V b) {
4389  return a + b;
4390 }
4391 template <class V>
4392 HWY_API V Sub(V a, V b) {
4393  return a - b;
4394 }
4395 
4396 template <class V>
4397 HWY_API V Mul(V a, V b) {
4398  return a * b;
4399 }
4400 template <class V>
4401 HWY_API V Div(V a, V b) {
4402  return a / b;
4403 }
4404 
4405 template <class V>
4406 V Shl(V a, V b) {
4407  return a << b;
4408 }
4409 template <class V>
4410 V Shr(V a, V b) {
4411  return a >> b;
4412 }
4413 
4414 template <class V>
4415 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
4416  return a == b;
4417 }
4418 template <class V>
4419 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
4420  return a != b;
4421 }
4422 template <class V>
4423 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
4424  return a < b;
4425 }
4426 
4427 template <class V>
4428 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
4429  return a > b;
4430 }
4431 template <class V>
4432 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
4433  return a >= b;
4434 }
4435 
4436 template <class V>
4437 HWY_API auto Le(V a, V b) -> decltype(a == b) {
4438  return a <= b;
4439 }
4440 
4441 // NOLINTNEXTLINE(google-readability-namespace-comments)
4442 } // namespace HWY_NAMESPACE
4443 } // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
detail::Raw128< T >::type raw
Definition: wasm_128-inl.h:106
Raw raw
Definition: arm_neon-inl.h:814
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: wasm_128-inl.h:84
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: wasm_128-inl.h:87
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: wasm_128-inl.h:75
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: wasm_128-inl.h:90
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: wasm_128-inl.h:72
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: wasm_128-inl.h:78
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: wasm_128-inl.h:81
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition: wasm_128-inl.h:2144
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2463
constexpr __i8x16 BytesAbove()
Definition: wasm_128-inl.h:3476
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition: wasm_128-inl.h:1700
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:131
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2444
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
constexpr HWY_API size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
Vec128< T, 4/sizeof(T)> Vec32
Definition: arm_neon-inl.h:800
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
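IfThenElse is the lane-wise select that replaces branches: each lane takes yes where the mask is set, else no. Together with the comparison operators it expresses, for example, capping at a limit (CapLanes is an illustrative name):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Lane-wise min against a scalar limit, written as compare + select.
void CapLanes(const float* HWY_RESTRICT in, float limit,
              float* HWY_RESTRICT out) {
  const hn::Full128<float> d;
  const auto v = hn::LoadU(d, in);
  const auto cap = hn::Set(d, limit);
  hn::StoreU(hn::IfThenElse(v < cap, v, cap), d, out);
}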
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
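Iota fills lanes with first, first + 1, ..., which is convenient for building index vectors and test inputs. Sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Lanes {10, 11, 12, 13}.
hn::Vec128<int32_t> IotaDemo() {
  return hn::Iota(hn::Full128<int32_t>(), 10);
}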
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
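SetTableIndices packs lane indices into an Indices128, and TableLookupLanes then applies that permutation across the whole vector. Reversal sketch (ReverseLanes is an illustrative name):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Runtime permutation that reverses the four float lanes.
hn::Vec128<float> ReverseLanes(hn::Vec128<float> v) {
  const hn::Full128<float> d;
  const int32_t idx[4] = {3, 2, 1, 0};
  return hn::TableLookupLanes(v, hn::SetTableIndices(d, idx));
}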
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
HWY_API V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
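CompressStore packs the lanes selected by the mask into contiguous memory and returns how many were written; it may store up to a full vector, so the destination needs Lanes(d) writable elements. Stream-compaction sketch (KeepPositive is an illustrative name):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Writes only the positive lanes of in[0..3] to out; returns their count.
size_t KeepPositive(const float* HWY_RESTRICT in, float* HWY_RESTRICT out) {
  const hn::Full128<float> d;
  const auto v = hn::LoadU(d, in);
  return hn::CompressStore(v, hn::Zero(d) < v, d, out);
}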
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
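SumsOf8 adds each run of eight consecutive u8 lanes into one u64 lane, a building block for byte checksums and sum-of-absolute-differences reductions. Sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Sums 16 bytes via the two partial u64 sums.
uint64_t SumBytes16(const uint8_t* HWY_RESTRICT p) {
  const hn::Full128<uint8_t> d8;
  const auto sums = hn::SumsOf8(hn::LoadU(d8, p));  // two u64 lanes
  return hn::GetLane(sums) + hn::ExtractLane(sums, 1);
}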
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
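PromoteTo widens from a half-width source vector: the destination tag names the wider type, and the source tag has the same lane count at half the bit width. Sketch (u8 -> u16; WidenBytes is an illustrative name):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Widens 8 bytes into eight u16 lanes.
hn::Vec128<uint16_t> WidenBytes(const uint8_t* HWY_RESTRICT p) {
  const hn::Full128<uint16_t> d16;
  const hn::Full64<uint8_t> d8;  // 8 lanes, matching d16
  return hn::PromoteTo(d16, hn::LoadU(d8, p));
}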
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
Vec128< T, 8/sizeof(T)> Vec64
Definition: arm_neon-inl.h:797
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
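Mul128 is a scalar helper in namespace hwy (base.h): it returns the low 64 bits of the full 128-bit product and stores the high 64 bits through the pointer. Sketch (MulWide is an illustrative wrapper):

#include "hwy/base.h"

// Full 64x64 -> 128-bit product; e.g. a = 2^64 - 1, b = 2 yields
// *hi == 1 and a returned low half of 0xFFFFFFFFFFFFFFFE.
inline uint64_t MulWide(uint64_t a, uint64_t b, uint64_t* hi) {
  return hwy::Mul128(a, b, hi);  // returns the low half
}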
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
__v128_u raw
Definition: wasm_128-inl.h:2521
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition: wasm_128-inl.h:151
HWY_INLINE __v128_u operator()(__v128_u v)
Definition: wasm_128-inl.h:147
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: wasm_128-inl.h:114
__f32x4 type
Definition: wasm_128-inl.h:60
__v128_u type
Definition: wasm_128-inl.h:56
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()