// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 256-bit WASM vectors and operations. Experimental.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#include "hwy/ops/wasm_128-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// TODO(richardwinterton): add this to DeduceD in wasm_128 similar to x86_128.
template <typename T>
class Vec256 {
 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  Vec128<T> v0;
  Vec128<T> v1;
};

template <typename T>
struct Mask256 {
  Mask128<T> m0;
  Mask128<T> m1;
};

// ------------------------------ BitCast

template <typename T, typename FromT>
HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = BitCast(dh, v.v0);
  ret.v1 = BitCast(dh, v.v1);
  return ret;

  // TODO(richardwinterton): implement other ops like this
}
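
// Illustrative sketch (not part of the API): per the TODO in BitCast, other
// ops would follow the same two-half pattern, forwarding each Vec128 half to
// the wasm_128 implementation. Hypothetical example:
//
//   template <typename T>
//   Vec256<T> TwoHalvesAdd(Vec256<T> a, Vec256<T> b) {
//     Vec256<T> ret;
//     ret.v0 = a.v0 + b.v0;  // lower 128 bits, wasm_128 operator+
//     ret.v1 = a.v1 + b.v1;  // upper 128 bits
//     return ret;
//   }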

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T>
HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
  return Vec256<T>{wasm_i32x4_splat(0)};
}
HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
  return Vec256<float>{wasm_f32x4_splat(0.0f)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
  return Vec256<uint8_t>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
  return Vec256<uint16_t>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
  return Vec256<uint32_t>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
  return Vec256<uint64_t>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
  return Vec256<int8_t>{wasm_i8x16_splat(t)};
}
HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
  return Vec256<int16_t>{wasm_i16x8_splat(t)};
}
HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
  return Vec256<int32_t>{wasm_i32x4_splat(t)};
}
HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
  return Vec256<int64_t>{wasm_i64x2_splat(t)};
}

HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
  return Vec256<float>{wasm_f32x4_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T>
HWY_API Vec256<T> Undefined(Full256<T> d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, typename T2>
HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
  HWY_ALIGN T lanes[32 / sizeof(T)];  // Full256 holds 32 bytes of lanes.
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
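
// Example (illustrative): Iota(Full256<int32_t>(), 5) fills lane i with
// 5 + i, i.e. lanes 5, 6, 7, ... in ascending lane order.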

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_add(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_add(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_add(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_add(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_add(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_add(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_sub(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_sub(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_sub(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_sub(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_sub(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_sub(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_sub(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
  HWY_ABORT("not implemented");
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
                                    const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
                                     const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_add_sat(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
                                    const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
                                     const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Unsigned
HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_avgr(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_avgr(a.raw, b.raw)};
}

// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
  return Vec256<int8_t>{wasm_i8x16_abs(v.raw)};
}
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_abs(v.raw)};
}
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_abs(v.raw)};
}
HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
  return Vec256<int64_t>{wasm_i64x2_abs(v.raw)};
}

HWY_API Vec256<float> Abs(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_abs(v.raw)};
}

// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits>
HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, kBits)};
}

// Signed
template <int kBits>
HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, kBits)};
}

// 8-bit
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
  const Full256<T> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<T> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits>
HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
  const Full256<uint8_t> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits>
HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
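
// Worked example of the sign-extension trick above (illustrative): for
// kBits = 2, shifted_sign = 0x20. Logically shifting int8 0x84 (-124) right
// by 2 gives 0x21; (0x21 ^ 0x20) - 0x20 = 0x01 - 0x20 = 0xE1 (-31), which
// matches the arithmetic shift -124 >> 2.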

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
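
// Worked example (illustrative): for uint32_t lanes holding 0x12345678,
// RotateRight<8> returns 0x78123456: the low byte wraps around to the top
// via Or(ShiftRight<8>, ShiftLeft<24>).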

// ------------------------------ Shift lanes by same variable #bits

// Unsigned
HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
                                       const int bits) {
  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, bits)};
}
HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
                                        const int bits) {
  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, bits)};
}
HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
                                       const int bits) {
  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, bits)};
}
HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
                                        const int bits) {
  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, bits)};
}

// Signed
HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, bits)};
}
HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
                                       const int bits) {
  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, bits)};
}
HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, bits)};
}
HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
                                       const int bits) {
  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, bits)};
}

// 8-bit
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
  const Full256<T> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<T> shifted{ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}

HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
  const Full256<uint8_t> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<uint8_t> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ------------------------------ Minimum

// Unsigned
HWY_API Vec256<uint8_t> Min(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_min(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_min(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_u32x4_min(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
  // No 64-bit min instruction; go through memory (two lanes per 128-bit raw).
  alignas(32) uint64_t min[2];
  min[0] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 0),
                   wasm_u64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 1),
                   wasm_u64x2_extract_lane(b.raw, 1));
  return Vec256<uint64_t>{wasm_v128_load(min)};
}

// Signed
HWY_API Vec256<int8_t> Min(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_min(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Min(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_min(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Min(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_min(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Min(Vec256<int64_t> a, Vec256<int64_t> b) {
  alignas(32) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec256<int64_t>{wasm_v128_load(min)};
}

// Float
HWY_API Vec256<float> Min(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_min(a.raw, b.raw)};
}

// ------------------------------ Maximum

// Unsigned
HWY_API Vec256<uint8_t> Max(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_max(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_max(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_u32x4_max(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
  // No 64-bit max instruction; go through memory (two lanes per 128-bit raw).
  alignas(32) uint64_t max[2];
  max[0] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 0),
                   wasm_u64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 1),
                   wasm_u64x2_extract_lane(b.raw, 1));
  return Vec256<uint64_t>{wasm_v128_load(max)};
}

// Signed
HWY_API Vec256<int8_t> Max(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_max(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Max(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_max(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Max(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_max(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Max(Vec256<int64_t> a, Vec256<int64_t> b) {
  alignas(32) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec256<int64_t>{wasm_v128_load(max)};
}

// Float
HWY_API Vec256<float> Max(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_max(a.raw, b.raw)};
}

// ------------------------------ Integer multiplication

// Unsigned
HWY_API Vec256<uint16_t> operator*(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_mul(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator*(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int16_t> operator*(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_mul(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator*(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec256<uint16_t> MulHigh(const Vec256<uint16_t> a,
                                 const Vec256<uint16_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec256<uint16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
                                const Vec256<int16_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
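
// How the shuffle above works (illustrative): after widening to 32 bits and
// multiplying, each 32-bit product holds the full a*b. The odd-indexed 16-bit
// elements (1, 3, 5, ...) are the high halves of those products, so gathering
// them from the low (l) and high (h) product vectors yields MulHigh for all
// eight lanes.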

HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
  HWY_ASSERT(0);
}

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
HWY_API Vec256<int64_t> MulEven(const Vec256<int32_t> a,
                                const Vec256<int32_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec256<int64_t>{wasm_i64x2_mul(ae, be)};
}
HWY_API Vec256<uint64_t> MulEven(const Vec256<uint32_t> a,
                                 const Vec256<uint32_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec256<uint64_t>{wasm_i64x2_mul(ae, be)};
}
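
// Note (illustrative): masking lanes 1 and 3 to zero leaves each 64-bit half
// holding its even 32-bit lane, zero-extended, so one wasm_i64x2_mul yields
// both double-width products at once. The zero-extension is exact for the
// unsigned overload; negative even lanes in the signed overload would
// additionally require sign extension.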

// ------------------------------ Negate

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> Neg(const Vec256<T> v) {
  return Xor(v, SignBit(Full256<T>()));
}

HWY_API Vec256<int8_t> Neg(const Vec256<int8_t> v) {
  return Vec256<int8_t>{wasm_i8x16_neg(v.raw)};
}
HWY_API Vec256<int16_t> Neg(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_neg(v.raw)};
}
HWY_API Vec256<int32_t> Neg(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_neg(v.raw)};
}
HWY_API Vec256<int64_t> Neg(const Vec256<int64_t> v) {
  return Vec256<int64_t>{wasm_i64x2_neg(v.raw)};
}

// ------------------------------ Floating-point mul / div

HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_mul(a.raw, b.raw)};
}

HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_div(a.raw, b.raw)};
}

// Approximate reciprocal
HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
  return one / v;
}

// Absolute value of difference.
HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add
HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> add) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfma?
  return mul * x + add;
}

// Returns add - mul * x
HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> add) {
  // TODO(eustas): replace, when implemented in WASM.
  return add - mul * x;
}

// Returns mul * x - sub
HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfms?
  return mul * x - sub;
}

// Returns -mul * x - sub
HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Full precision square root
HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_sqrt(v.raw)};
}

// Approximate reciprocal square root
HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
  return one / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
HWY_API Vec256<float> Round(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_nearest(v.raw)};
}

// Toward zero, aka truncate
HWY_API Vec256<float> Trunc(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
HWY_API Vec256<float> Ceil(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_ceil(v.raw)};
}

// Toward -infinity, aka floor
HWY_API Vec256<float> Floor(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_floor(v.raw)};
}

// ------------------------------ Floating-point classification

template <typename T>
HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
  return v != v;
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsInf(const Vec256<T> v) {
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
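
// Worked example for float (illustrative): +inf has bits 0x7F800000. Adding
// the value to itself shifts out the sign bit, giving 0xFF000000, which is
// exactly MaxExponentTimes2 for f32, so IsInf is true; any finite exponent
// compares below MaxExponentField after the shifts in IsFinite.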

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <typename TFrom, typename TTo>
HWY_API Mask256<TTo> RebindMask(Full256<TTo> /* tag */, Mask256<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask256<TTo>{m.raw};
}

template <typename T>
HWY_API Mask256<T> TestBit(Vec256<T> v, Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
HWY_API Mask256<uint8_t> operator==(const Vec256<uint8_t> a,
                                    const Vec256<uint8_t> b) {
  return Mask256<uint8_t>{wasm_i8x16_eq(a.raw, b.raw)};
}
HWY_API Mask256<uint16_t> operator==(const Vec256<uint16_t> a,
                                     const Vec256<uint16_t> b) {
  return Mask256<uint16_t>{wasm_i16x8_eq(a.raw, b.raw)};
}
HWY_API Mask256<uint32_t> operator==(const Vec256<uint32_t> a,
                                     const Vec256<uint32_t> b) {
  return Mask256<uint32_t>{wasm_i32x4_eq(a.raw, b.raw)};
}

// Signed
HWY_API Mask256<int8_t> operator==(const Vec256<int8_t> a,
                                   const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_eq(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator==(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_eq(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator==(const Vec256<int32_t> a,
                                    const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_eq(a.raw, b.raw)};
}

// Float
HWY_API Mask256<float> operator==(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_eq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// Unsigned
HWY_API Mask256<uint8_t> operator!=(const Vec256<uint8_t> a,
                                    const Vec256<uint8_t> b) {
  return Mask256<uint8_t>{wasm_i8x16_ne(a.raw, b.raw)};
}
HWY_API Mask256<uint16_t> operator!=(const Vec256<uint16_t> a,
                                     const Vec256<uint16_t> b) {
  return Mask256<uint16_t>{wasm_i16x8_ne(a.raw, b.raw)};
}
HWY_API Mask256<uint32_t> operator!=(const Vec256<uint32_t> a,
                                     const Vec256<uint32_t> b) {
  return Mask256<uint32_t>{wasm_i32x4_ne(a.raw, b.raw)};
}

// Signed
HWY_API Mask256<int8_t> operator!=(const Vec256<int8_t> a,
                                   const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_ne(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator!=(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_ne(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator!=(const Vec256<int32_t> a,
                                    const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_ne(a.raw, b.raw)};
}

// Float
HWY_API Mask256<float> operator!=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_ne(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

HWY_API Mask256<int8_t> operator>(const Vec256<int8_t> a,
                                  const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_gt(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
                                   const Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_gt(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
                                   const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_gt(a.raw, b.raw)};
}
HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
                                   const Vec256<int64_t> b) {
  const Repartition<int32_t, DFromV<decltype(a)>> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves are not equal, this is the answer.
  const auto m_gt = a32 > b32;

  // Otherwise, the lower half decides.
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
  const auto lo_gt = And(m_eq, lo_in_hi);

  const auto gt = Or(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask256<int64_t>{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)};
}

template <typename T, HWY_IF_UNSIGNED(T)>
HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
  const Full256<T> du;
  const RebindToSigned<decltype(du)> di;
  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}

HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_gt(a.raw, b.raw)};
}

template <typename T>
HWY_API Mask256<T> operator<(Vec256<T> a, Vec256<T> b) {
  return operator>(b, a);
}

// ------------------------------ Weak inequality

// Float <= >=
HWY_API Mask256<float> operator<=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_le(a.raw, b.raw)};
}
HWY_API Mask256<float> operator>=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_ge(a.raw, b.raw)};
}

// ------------------------------ FirstN (Iota, Lt)

template <typename T>
HWY_API Mask256<T> FirstN(const Full256<T> d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}
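
// Example (illustrative): FirstN(Full256<int32_t>(), 3) is true exactly for
// lanes 0..2, because Iota lanes 0, 1, 2 compare less than Set(di, 3).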

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T>
HWY_API Vec256<T> Not(Vec256<T> v) {
  return Vec256<T>{wasm_v128_not(v.raw)};
}

// ------------------------------ And

template <typename T>
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T>
HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  return Vec256<T>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or

template <typename T>
HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T>
HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Or3

template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>
HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
  return And(a, b);
}

template <typename T>
HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
  return Or(a, b);
}

template <typename T>
HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign

template <typename T>
HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Full256<T>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T>
HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Full256<T>()), sign));
}

// ------------------------------ BroadcastSignBit (compare)

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec256<T> BroadcastSignBit(const Vec256<T> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
  const Full256<int8_t> d;
  return VecFromMask(d, v < Zero(d));
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T>
HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
  return Mask256<T>{v.raw};
}

template <typename T>
HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, Mask256<T> v) {
  return Vec256<T>{v.raw};
}

// mask ? yes : no
template <typename T>
HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return Vec256<T>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T>
HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
  return yes & VecFromMask(Full256<T>(), mask);
}

// mask ? 0 : no
template <typename T>
HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
  return AndNot(VecFromMask(Full256<T>(), mask), no);
}

template <typename T>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes,
                                     Vec256<T> no) {
  HWY_ASSERT(0);
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
  const Full256<T> d;
  const auto zero = Zero(d);
  return IfThenElse(Mask256<T>{(v > zero).raw}, v, zero);
}

// ------------------------------ Mask logical

template <typename T>
HWY_API Mask256<T> Not(const Mask256<T> m) {
  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
}

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
1087 // ------------------------------ Shl (BroadcastSignBit, IfThenElse)
1088 
1089 // The x86 multiply-by-Pow2() trick will not work because WASM saturates
1090 // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
1091 // scalar count operand, per-lane shift instructions would require extract_lane
1092 // for each lane, and hoping that shuffle is correctly mapped to a native
1093 // instruction. Using non-vector shifts would incur a store-load forwarding
1094 // stall when loading the result vector. We instead test bits of the shift
1095 // count to "predicate" a shift of the entire vector by a constant.
1096 
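// Example (illustrative): a per-lane shift by 5 (binary 101) decomposes into
// a predicated shift by 4 (bit 2 of the count is set) followed by a
// predicated shift by 1 (bit 0 is set); lanes whose count lacks a given bit
// simply keep their previous value at that step.
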
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// ================================================== MEMORY

// ------------------------------ Load

template <typename T>
HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec256<T>{wasm_v128_load(aligned)};
}

template <typename T>
HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
                             const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// LoadU == Load.
template <typename T>
HWY_API Vec256<T> LoadU(Full256<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T>
HWY_API Vec256<T> LoadDup128(Full256<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ Store

template <typename T>
HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// StoreU == Store.
template <typename T>
HWY_API void StoreU(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                          T* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <typename T>
HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
                    T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// ------------------------------ Scatter (Store)

template <typename T, typename Offset>
HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);

  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  Store(offset, Full256<Offset>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, typename Index>
HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);

  alignas(32) Index index_lanes[32 / sizeof(T)];
  Store(index, Full256<Index>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}
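
// Example (illustrative): with int32_t lanes, ScatterOffset treats each
// offset lane as a byte offset (a lane value of 8 writes to base + 8 bytes),
// whereas ScatterIndex treats it as an element index (a lane value of 8
// writes to base[8], i.e. byte offset 32).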

// ------------------------------ Gather (Load/Store)

template <typename T, typename Offset>
HWY_API Vec256<T> GatherOffset(const Full256<T> d, const T* HWY_RESTRICT base,
                               const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);

  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  Store(offset, Full256<Offset>(), offset_lanes);

  alignas(32) T lanes[32 / sizeof(T)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, typename Index>
HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
                              const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);

  alignas(32) Index index_lanes[32 / sizeof(T)];
  Store(index, Full256<Index>(), index_lanes);

  alignas(32) T lanes[32 / sizeof(T)];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}

// ================================================== SWIZZLE

// ------------------------------ ExtractLane
template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
  HWY_ASSERT(0);
}

// ------------------------------ InsertLane
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(const Vec128<T, N> v, size_t i, T t) {
  HWY_ASSERT(0);
}

// ------------------------------ GetLane
// Gets the single value stored in a vector/part.
HWY_API uint8_t GetLane(const Vec256<uint8_t> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
HWY_API int8_t GetLane(const Vec256<int8_t> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
HWY_API uint16_t GetLane(const Vec256<uint16_t> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
HWY_API int16_t GetLane(const Vec256<int16_t> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
HWY_API uint32_t GetLane(const Vec256<uint32_t> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
HWY_API int32_t GetLane(const Vec256<int32_t> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
HWY_API uint64_t GetLane(const Vec256<uint64_t> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}
HWY_API int64_t GetLane(const Vec256<int64_t> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}

HWY_API float GetLane(const Vec256<float> v) {
  return wasm_f32x4_extract_lane(v.raw, 0);
}

// ------------------------------ LowerHalf

template <typename T>
HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, const Vec256<T> v) {
  return Vec128<T>{v.raw};
}

template <typename T>
HWY_API Vec128<T> LowerHalf(const Vec256<T> v) {
  return LowerHalf(Full128<T>(), v);
}

// ------------------------------ ShiftLeftBytes

// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, Vec256<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);
  switch (kBytes) {
    case 0:
      return v;

    case 1:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
                                          7, 8, 9, 10, 11, 12, 13, 14)};

    case 2:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
                                          6, 7, 8, 9, 10, 11, 12, 13)};

    case 3:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
                                          4, 5, 6, 7, 8, 9, 10, 11, 12)};

    case 4:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
                                          3, 4, 5, 6, 7, 8, 9, 10, 11)};

    case 5:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
                                          2, 3, 4, 5, 6, 7, 8, 9, 10)};

    case 6:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};

    case 7:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};

    case 8:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};

    case 9:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};

    case 10:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};

    case 11:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};

    case 12:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};

    case 13:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};

    case 14:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 0,
                                          1)};

    case 15:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 16,
                                          0)};
  }
  return Vec256<T>{zero};
}

template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) {
  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
}

// ------------------------------ ShiftRightBytes
namespace detail {

// Helper function allows zeroing invalid lanes in caller.
template <int kBytes, typename T>
HWY_API __i8x16 ShrBytes(const Vec256<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);

  switch (kBytes) {
    case 0:
      return v.raw;

    case 1:
      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                12, 13, 14, 15, 16);

    case 2:
      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16);

    case 3:
      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16, 16);

    case 4:
      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                14, 15, 16, 16, 16, 16);

    case 5:
      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                15, 16, 16, 16, 16, 16);

    case 6:
      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16);

    case 7:
      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16, 16);

    case 8:
      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 9:
      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 10:
      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 11:
      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 12:
      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 13:
      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 14:
      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 15:
      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 16:
      return zero;
  }
}

}  // namespace detail

// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
  return Vec256<T>{detail::ShrBytes<kBytes>(v)};
}

// ------------------------------ ShiftRightLanes
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Full input: copy hi into lo (smaller instruction encoding than shifts).
template <typename T>
HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Full128<T> /* tag */,
                                           const Vec256<T> v) {
  return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}
HWY_API Vec128<float, 2> UpperHalf(Simd<float, 2, 0> /* tag */,
                                   const Vec128<float> v) {
  return Vec128<float, 2>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}

// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, class V = Vec256<T>>
HWY_API V CombineShiftRightBytes(Full256<T> /* tag */, V hi, V lo) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  switch (kBytes) {
    case 0:
      return lo;

    case 1:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16)};

    case 2:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16, 17)};

    case 3:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                  12, 13, 14, 15, 16, 17, 18)};

    case 4:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                  13, 14, 15, 16, 17, 18, 19)};

    case 5:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20)};

    case 6:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20, 21)};

    case 7:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
                                  15, 16, 17, 18, 19, 20, 21, 22)};

    case 8:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
                                  16, 17, 18, 19, 20, 21, 22, 23)};

    case 9:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24)};

    case 10:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24, 25)};

    case 11:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
                                  18, 19, 20, 21, 22, 23, 24, 25, 26)};

    case 12:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
                                  19, 20, 21, 22, 23, 24, 25, 26, 27)};

    case 13:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
                                  20, 21, 22, 23, 24, 25, 26, 27, 28)};

    case 14:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
                                  21, 22, 23, 24, 25, 26, 27, 28, 29)};

    case 15:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
                                  22, 23, 24, 25, 26, 27, 28, 29, 30)};
  }
  return hi;
}

// ------------------------------ Broadcast/splat any lane

// Unsigned
template <int kLane>
HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec256<uint16_t>{wasm_i16x8_shuffle(
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane>
HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<uint32_t>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// Signed
template <int kLane>
HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec256<int16_t>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
                                            kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane>
HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<int32_t>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// Float
template <int kLane>
HWY_API Vec256<float> Broadcast(const Vec256<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<float>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// ------------------------------ TableLookupBytes

// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
// lane indices in [0, 16).
template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
                                    const Vec256<TI> from) {
// Not yet available in all engines, see
// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
// V8 implementation of this had a bug, fixed on 2021-04-03:
// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
#if 0
  return Vec256<TI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
#else
  alignas(32) uint8_t control[16];
  alignas(32) uint8_t input[16];
  alignas(32) uint8_t output[16];
  wasm_v128_store(control, from.raw);
  wasm_v128_store(input, bytes.raw);
  for (size_t i = 0; i < 16; ++i) {
    output[i] = control[i] < 16 ? input[control[i]] : 0;
  }
  return Vec256<TI>{wasm_v128_load(output)};
#endif
}

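// Example (illustrative): if "from" holds bytes {15, 14, ..., 0}, the result
// is "bytes" with its 16 bytes reversed; any index >= 16 yields 0 in the
// scalar fallback above.
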
template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytesOr0(const Vec256<T> bytes,
                                       const Vec256<TI> from) {
  const Full256<TI> d;
  // Mask size must match vector type, so cast everything to this type.
  const Repartition<int8_t, decltype(d)> di8;
  const Repartition<int8_t, Full256<T>> d_bytes8;
  const auto msb = BitCast(di8, from) < Zero(di8);
  const auto lookup =
      TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
  return BitCast(d, IfThenZeroElse(msb, lookup));
}

// ------------------------------ Hard-coded shuffles

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}

// Swap 64-bit halves
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
  return Vec128<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}

// Rotate right 32 bits
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
// Rotate left 32 bits
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}

// Reverse
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices256 {
  __v128_u raw;
};

template <typename T, typename TI>
HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
  return Indices256<T>{};
}

template <typename T, typename TI>
HWY_API Indices256<T> SetTableIndices(Full256<T> d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  using TI = MakeSigned<T>;
  const Full256<T> d;
  const Full256<TI> di;
  return BitCast(d, TableLookupBytes(BitCast(di, v), Vec256<TI>{idx.raw}));
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle01(v);
}

// Four lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle0123(v);
}

// 16-bit
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}
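
// How the 16-bit Reverse works (illustrative): reversing the 32-bit lanes
// puts each 16-bit pair into mirrored position but leaves the two halves of
// the pair in their original order; rotating each 32-bit lane by 16 then
// swaps the pair, completing the lane-wise reversal.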

// ------------------------------ Reverse2

template <typename T>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ Reverse4

template <typename T>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ Reverse8

template <typename T>
HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ InterleaveLower

HWY_API Vec256<uint8_t> InterleaveLower(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18,
                                            3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec256<uint16_t> InterleaveLower(Vec256<uint16_t> a,
                                         Vec256<uint16_t> b) {
  return Vec256<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
HWY_API Vec256<uint32_t> InterleaveLower(Vec256<uint32_t> a,
                                         Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
HWY_API Vec256<uint64_t> InterleaveLower(Vec256<uint64_t> a,
                                         Vec256<uint64_t> b) {
  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

HWY_API Vec256<int8_t> InterleaveLower(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3,
                                           19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec256<int16_t> InterleaveLower(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
HWY_API Vec256<int32_t> InterleaveLower(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
HWY_API Vec256<int64_t> InterleaveLower(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}

// Additional overload for the optional tag.
template <typename T, class V = Vec256<T>>
HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}
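
// Example (illustrative): for uint32_t lanes a = {a0,a1,a2,a3} and
// b = {b0,b1,b2,b3} (lane 0 first), InterleaveLower yields {a0,b0,a1,b1}.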

// ------------------------------ InterleaveUpper (UpperHalf)

// All functions inside detail lack the required D parameter.
namespace detail {

HWY_API Vec256<uint8_t> InterleaveUpper(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                            11, 27, 12, 28, 13, 29, 14, 30, 15,
                                            31)};
}
HWY_API Vec256<uint16_t> InterleaveUpper(Vec256<uint16_t> a,
                                         Vec256<uint16_t> b) {
  return Vec256<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
HWY_API Vec256<uint32_t> InterleaveUpper(Vec256<uint32_t> a,
                                         Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
HWY_API Vec256<uint64_t> InterleaveUpper(Vec256<uint64_t> a,
                                         Vec256<uint64_t> b) {
  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

HWY_API Vec256<int8_t> InterleaveUpper(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                           11, 27, 12, 28, 13, 29, 14, 30, 15,
                                           31)};
}
HWY_API Vec256<int16_t> InterleaveUpper(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
HWY_API Vec256<int32_t> InterleaveUpper(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
HWY_API Vec256<int64_t> InterleaveUpper(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

HWY_API Vec256<float> InterleaveUpper(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}

}  // namespace detail

template <typename T, class V = Vec256<T>>
HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
  return detail::InterleaveUpper(a, b);
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <typename T, class DW = RepartitionToWide<Full256<T>>>
HWY_API VFromD<DW> ZipLower(Vec256<T> a, Vec256<T> b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}
1980 // ================================================== COMBINE
1981 
1982 // ------------------------------ Combine (InterleaveLower)
1983 
1984 // N = N/2 + N/2 (upper half undefined)
1985 template <typename T>
1987  const Half<decltype(d)> d2;
1988  const RebindToUnsigned<decltype(d2)> du2;
1989  // Treat half-width input as one lane, and expand to two lanes.
1990  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
1991  const VU lo{BitCast(du2, lo_half).raw};
1992  const VU hi{BitCast(du2, hi_half).raw};
1993  return BitCast(d, InterleaveLower(lo, hi));
1994 }
1995 
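// Sketch (annotation, not part of the header): assembling a 256-bit vector
// from two separately computed 128-bit halves via Combine.
HWY_MAYBE_UNUSED Vec256<int32_t> CombineExample(Vec128<int32_t> hi,
                                                Vec128<int32_t> lo) {
  return Combine(Full256<int32_t>(), hi, lo);  // lo fills the lower lanes
}
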
1996 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
1997 
1998 template <typename T>
1999 HWY_API Vec256<T> ZeroExtendVector(Full256<T> d, Vec128<T> lo) {
2000  return IfThenElseZero(FirstN(d, 16 / sizeof(T)), Vec256<T>{lo.raw});
2001 }
2002 
2003 // ------------------------------ ConcatLowerLower
2004 
2005 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2006 template <typename T>
2007 HWY_API Vec256<T> ConcatLowerLower(Full256<T> /* tag */, const Vec256<T> hi,
2008  const Vec256<T> lo) {
2009  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
2010 }
2011 
2012 // ------------------------------ ConcatUpperUpper
2013 
2014 template <typename T>
2015 HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
2016  const Vec256<T> lo) {
2017  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
2018 }
2019 
2020 // ------------------------------ ConcatLowerUpper
2021 
2022 template <typename T>
2023 HWY_API Vec256<T> ConcatLowerUpper(const Full256<T> d, const Vec256<T> hi,
2024  const Vec256<T> lo) {
2025  return CombineShiftRightBytes<8>(d, hi, lo);
2026 }
2027 
2028 // ------------------------------ ConcatUpperLower
2029 template <typename T>
2030 HWY_API Vec256<T> ConcatUpperLower(Full256<T> d, const Vec256<T> hi,
2031  const Vec256<T> lo) {
2032  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
2033 }
2034 
2035 // ------------------------------ ConcatOdd
2036 
2037 // 32-bit
2038 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2039 HWY_API Vec256<T> ConcatOdd(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
2040  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2041 }
2042 
2043 // 64-bit full - no partial because we need at least two inputs to have
2044 // even/odd.
2045 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2046 HWY_API Vec256<T> ConcatOdd(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
2047  return InterleaveUpper(Full256<T>(), lo, hi);
2048 }
2049 
2050 // ------------------------------ ConcatEven (InterleaveLower)
2051 
2052 // 32-bit full
2053 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2054 HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
2055  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2056 }
2057 
2058 // 64-bit full - no partial because we need at least two inputs to have
2059 // even/odd.
2060 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2061 HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
2062  return InterleaveLower(Full256<T>(), lo, hi);
2063 }
2064 
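// Worked example (annotation, not in the original file): with 32-bit lanes
// hi = {h0,h1,h2,h3} and lo = {l0,l1,l2,l3}, ConcatOdd returns
// {l1,l3,h1,h3} and ConcatEven returns {l0,l2,h0,h2}, matching the
// shuffle indices above.
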
2065 // ------------------------------ DupEven
2066 template <typename T>
2067 HWY_API Vec256<T> DupEven(Vec256<T> v) {
2068  HWY_ASSERT(0);
2069 }
2070 
2071 // ------------------------------ DupOdd
2072 template <typename T>
2073 HWY_API Vec256<T> DupOdd(Vec256<T> v) {
2074  HWY_ASSERT(0);
2075 }
2076 
2077 // ------------------------------ OddEven
2078 
2079 namespace detail {
2080 
2081 template <typename T>
2082 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
2083  const Vec256<T> b) {
2084  const Full256<T> d;
2085  const Repartition<uint8_t, decltype(d)> d8;
2086  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2087  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2088  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
2089 }
2090 template <typename T>
2091 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
2092  const Vec256<T> b) {
2093  return Vec256<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2094 }
2095 template <typename T>
2096 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
2097  const Vec256<T> b) {
2098  return Vec256<T>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2099 }
2100 template <typename T>
2101 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
2102  const Vec256<T> b) {
2103  return Vec256<T>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
2104 }
2105 
2106 } // namespace detail
2107 
2108 template <typename T>
2109 HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
2110  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
2111 }
2112 HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
2113  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2114 }
2115 
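// Annotation (not in the original file): OddEven keeps odd-indexed lanes
// from `a` and even-indexed lanes from `b`; e.g. for u32 lanes the result
// is {b0, a1, b2, a3}, per the i32x4 shuffle indices {4, 1, 6, 3}.
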
2116 // ------------------------------ OddEvenBlocks
2117 template <typename T>
2118 HWY_API Vec256<T> OddEvenBlocks(Vec256<T> /* odd */, Vec256<T> even) {
2119  return even;
2120 }
2121 
2122 // ------------------------------ SwapAdjacentBlocks
2123 
2124 template <typename T>
2125 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
2126  return v;
2127 }
2128 
2129 // ------------------------------ ReverseBlocks
2130 
2131 template <typename T>
2132 HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, const Vec256<T> v) {
2133  return v;
2134 }
2135 
2136 // ================================================== CONVERT
2137 
2138 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2139 
2140 // Unsigned: zero-extend.
2141 HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
2142  const Vec128<uint8_t> v) {
2143  return Vec256<uint16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
2144 }
2145 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
2146  const Vec128<uint8_t> v) {
2147  return Vec256<uint32_t>{
2148  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2149 }
2150 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
2151  const Vec128<uint8_t> v) {
2152  return Vec256<int16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
2153 }
2154 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2155  const Vec128<uint8_t> v) {
2156  return Vec256<int32_t>{
2157  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2158 }
2159 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
2160  const Vec128<uint16_t> v) {
2161  return Vec256<uint32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
2162 }
2163 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2164  const Vec128<uint16_t> v) {
2165  return Vec256<int32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
2166 }
2167 
2168 // Signed: replicate sign bit.
2169 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
2170  const Vec128<int8_t> v) {
2171  return Vec256<int16_t>{wasm_i16x8_extend_low_i8x16(v.raw)};
2172 }
2173 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2174  const Vec128<int8_t> v) {
2175  return Vec256<int32_t>{
2176  wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
2177 }
2178 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2179  const Vec128<int16_t> v) {
2180  return Vec256<int32_t>{wasm_i32x4_extend_low_i16x8(v.raw)};
2181 }
2182 
2183 HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
2184  const Vec128<int32_t> v) {
2185  return Vec256<double>{wasm_f64x2_convert_low_i32x4(v.raw)};
2186 }
2187 
2188 HWY_API Vec256<float> PromoteTo(Full256<float> /* tag */,
2189  const Vec128<float16_t> v) {
2190  const Full256<int32_t> di32;
2191  const Full256<uint32_t> du32;
2192  const Full256<float> df32;
2193  // Expand to u32 so we can shift.
2194  const auto bits16 = PromoteTo(du32, Vec256<uint16_t>{v.raw});
2195  const auto sign = ShiftRight<15>(bits16);
2196  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2197  const auto mantissa = bits16 & Set(du32, 0x3FF);
2198  const auto subnormal =
2199  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
2200  Set(df32, 1.0f / 16384 / 1024));
2201 
2202  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2203  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2204  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2205  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2206  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2207 }
2208 
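// Worked example (annotation, not part of the header): binary16 is
// 1 sign | 5 exponent (bias 15) | 10 mantissa bits. For normal inputs the
// code computes f32 = (sign << 31) | ((exp + 127 - 15) << 23) | (m << 13);
// e.g. 0x3C00 (f16 1.0) has exp = 15, m = 0 -> biased_exp32 = 127 ->
// 0x3F800000 (f32 1.0). Subnormals are m * 2^-24, which is exactly the
// 1.0f / 16384 / 1024 factor above.
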
2209 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
2210  const Vec128<bfloat16_t> v) {
2211  const Rebind<uint16_t, decltype(df32)> du16;
2212  const RebindToSigned<decltype(df32)> di32;
2213  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
2214 }
2215 
2216 // ------------------------------ Demotions (full -> part w/ narrow lanes)
2217 
2218 HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
2219  const Vec256<int32_t> v) {
2220  return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
2221 }
2222 
2223 HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
2224  const Vec256<int32_t> v) {
2225  return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
2226 }
2227 
2228 HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
2229  const Vec256<int32_t> v) {
2230  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2231  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2232 }
2233 
2234 HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
2235  const Vec256<int16_t> v) {
2236  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
2237 }
2238 
2239 HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
2240  const Vec256<int32_t> v) {
2241  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2242  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
2243 }
2244 
2245 HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
2246  const Vec256<int16_t> v) {
2247  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
2248 }
2249 
2250 HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
2251  const Vec256<double> v) {
2252  return Vec128<int32_t>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
2253 }
2254 
2255 HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> /* tag */,
2256  const Vec256<float> v) {
2257  const Full256<int32_t> di;
2258  const Full256<uint32_t> du;
2259  const Full256<uint16_t> du16;
2260  const auto bits32 = BitCast(du, v);
2261  const auto sign = ShiftRight<31>(bits32);
2262  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2263  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2264 
2265  const auto k15 = Set(di, 15);
2266  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2267  const auto is_tiny = exp < Set(di, -24);
2268 
2269  const auto is_subnormal = exp < Set(di, -14);
2270  const auto biased_exp16 =
2271  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
2272  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
2273  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2274  (mantissa32 >> (Set(du, 13) + sub_exp));
2275  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
2276  ShiftRight<13>(mantissa32)); // <1024
2277 
2278  const auto sign16 = ShiftLeft<15>(sign);
2279  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2280  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
2281  return Vec128<float16_t>{DemoteTo(du16, bits16).raw};
2282 }
2283 
2284 HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
2285  const Vec256<float> v) {
2286  const Rebind<int32_t, decltype(dbf16)> di32;
2287  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2288  const Rebind<uint16_t, decltype(dbf16)> du16;
2289  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2290  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2291 }
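
// Annotation (not in the original header): bfloat16 is the upper half of
// an IEEE binary32, so the demotion is just a logical right-shift by 16
// and a narrowing; the value is truncated rather than rounded here.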
2292 
2293 HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
2294  Vec256<float> a, Vec256<float> b) {
2295  const RebindToUnsigned<decltype(dbf16)> du16;
2296  const Repartition<uint32_t, decltype(dbf16)> du32;
2297  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
2298  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2299 }
2300 
2301 // For already range-limited input [0, 255].
2302 HWY_API Vec256<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
2303  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2304  return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2305 }
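
// Usage sketch (annotation): callers must clamp beforehand, e.g. via
// Min(v, Set(Full256<uint32_t>(), 255)); U8FromU32 itself assumes lanes
// already fit in [0, 255] and relies on the saturating narrows above.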
2306 
2307 // ------------------------------ Convert i32 <=> f32 (Round)
2308 
2309 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
2310  const Vec256<int32_t> v) {
2311  return Vec256<float>{wasm_f32x4_convert_i32x4(v.raw)};
2312 }
2313 // Truncates (rounds toward zero).
2314 HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> /* tag */,
2315  const Vec256<float> v) {
2316  return Vec256<int32_t>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
2317 }
2318 
2319 HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
2320  return ConvertTo(Full256<int32_t>(), Round(v));
2321 }
2322 
2323 // ================================================== MISC
2324 
2325 // ------------------------------ LoadMaskBits (TestBit)
2326 
2327 namespace detail {
2328 
2329 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2330 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2331  const RebindToUnsigned<decltype(d)> du;
2332  // Easier than Set(), which would require an >8-bit type, which would not
2333  // compile for T=uint8_t, N=1.
2334  const Vec256<T> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
2335 
2336  // Replicate bytes 8x such that each byte contains the bit that governs it.
2337  alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2338  1, 1, 1, 1, 1, 1, 1, 1};
2339  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
2340 
2341  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2342  1, 2, 4, 8, 16, 32, 64, 128};
2343  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
2344 }
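
// How this works (annotation, not part of the file): TableLookupBytes with
// kRep8 broadcasts mask byte 0 into lanes 0..7 and byte 1 into lanes
// 8..15; TestBit against kBit = {1, 2, 4, ...} then selects, per byte,
// exactly the one bit of `bits` that governs that lane.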
2345 
2346 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2347 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2348  const RebindToUnsigned<decltype(d)> du;
2349  alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2350  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2351 }
2352 
2353 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2354 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2355  const RebindToUnsigned<decltype(d)> du;
2356  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2357  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2358 }
2359 
2360 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2361 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2362  const RebindToUnsigned<decltype(d)> du;
2363  alignas(32) constexpr uint64_t kBit[8] = {1, 2};
2364  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2365 }
2366 
2367 } // namespace detail
2368 
2369 // `bits` points to at least 8 readable bytes, not all of which need be valid.
2370 template <typename T>
2371 HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
2372  const uint8_t* HWY_RESTRICT bits) {
2373  uint64_t mask_bits = 0;
2374  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
2375  return detail::LoadMaskBits(d, mask_bits);
2376 }
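
// Usage sketch (annotation): bits[] is packed LSB-first, so bits[0] = 0x05
// (0b101) selects lanes 0 and 2, e.g. for subsequent masked loads/stores.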
2377 
2378 // ------------------------------ Mask
2379 
2380 namespace detail {
2381 
2382 // Full
2383 template <typename T>
2384 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2385  const Mask128<T> mask) {
2386  alignas(32) uint64_t lanes[2];
2387  wasm_v128_store(lanes, mask.raw);
2388 
2389  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2390  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2391  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
2392  return (hi + lo);
2393 }
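
// Annotation (not in the original header): each of the 8 bytes in lanes[i]
// is 0x00 or 0xFF, and the kMagic multiply accumulates one bit per byte
// near the top of the product, so the shifts yield 8 packed mask bits per
// 64-bit half; the two halves are then combined into a 16-bit result.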
2394 
2395 template <typename T>
2396 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
2397  const Mask256<T> mask) {
2398  // Remove useless lower half of each u16 while preserving the sign bit.
2399  const __i16x8 zero = wasm_i16x8_splat(0);
2400  const Mask256<uint8_t> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
2401  return BitsFromMask(hwy::SizeTag<1>(), mask8);
2402 }
2403 
2404 template <typename T>
2405 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
2406  const Mask256<T> mask) {
2407  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
2408  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2409  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2410  alignas(32) uint32_t lanes[4];
2411  wasm_v128_store(lanes, sliced_mask);
2412  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2413 }
2414 
2415 // Returns 0xFF for bytes with index >= N, otherwise 0.
2416 constexpr __i8x16 BytesAbove() {
2417  return
2418  (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
2419  : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
2420  : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
2421  : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
2422  : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
2423  : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
2424  : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
2425  : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
2426  : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
2427  : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2428  -1, -1, -1, -1, -1)
2429  : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2430  -1, -1, -1, -1)
2431  : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
2432  -1, -1, -1, -1)
2433  : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
2434  -1, -1, -1)
2435  : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
2436  -1, -1, -1)
2437  : (N == 11)
2438  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
2439  : (N == 13)
2440  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
2441  : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
2442 }
2443 
2444 template <typename T>
2445 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
2446  return BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask);
2447 }
2448 
2449 template <typename T>
2450 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
2451  return PopCount(BitsFromMask(tag, m));
2452 }
2453 
2454 template <typename T>
2455 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
2456  return PopCount(BitsFromMask(tag, m));
2457 }
2458 
2459 template <typename T>
2460 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
2461  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
2462  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
2463  alignas(32) uint64_t lanes[2];
2464  wasm_v128_store(lanes, shifted_bits);
2465  return PopCount(lanes[0] | lanes[1]);
2466 }
2467 
2468 } // namespace detail
2469 
2470 // `bits` points to at least 8 writable bytes.
2471 template <typename T>
2472 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
2473  uint8_t* bits) {
2474  const uint64_t mask_bits = detail::BitsFromMask(mask);
2475  const size_t kNumBytes = (N + 7) / 8;
2476  CopyBytes<kNumBytes>(&mask_bits, bits);
2477  return kNumBytes;
2478 }
2479 
2480 template <typename T>
2481 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask128<T> m) {
2482  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
2483 }
2484 
2485 template <typename T>
2486 HWY_API bool AllFalse(const Full256<T> d, const Mask128<T> m) {
2487 #if 0
2488  // Casting followed by wasm_i8x16_any_true results in wasm error:
2489  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
2490  const auto v8 = BitCast(Full256<int8_t>(), VecFromMask(d, m));
2491  return !wasm_i8x16_any_true(v8.raw);
2492 #else
2493  (void)d;
2494  return (wasm_i64x2_extract_lane(m.raw, 0) |
2495  wasm_i64x2_extract_lane(m.raw, 1)) == 0;
2496 #endif
2497 }
2498 
2499 // Full vector
2500 namespace detail {
2501 template <typename T>
2502 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
2503  return wasm_i8x16_all_true(m.raw);
2504 }
2505 template <typename T>
2506 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
2507  return wasm_i16x8_all_true(m.raw);
2508 }
2509 template <typename T>
2510 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
2511  return wasm_i32x4_all_true(m.raw);
2512 }
2513 
2514 } // namespace detail
2515 
2516 template <typename T>
2517 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask128<T> m) {
2518  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
2519 }
2520 
2521 template <typename T>
2522 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
2523  const Mask256<T> mask) {
2524  const uint64_t bits = detail::BitsFromMask(mask);
2525  return bits ? Num0BitsBelowLS1Bit_Nonzero64(bits) : -1;
2526 }
2527 
2528 // ------------------------------ Compress
2529 
2530 namespace detail {
2531 
2532 template <typename T>
2533 HWY_INLINE Vec256<T> Idx16x8FromBits(const uint64_t mask_bits) {
2534  HWY_DASSERT(mask_bits < 256);
2535  const Full256<T> d;
2536  const Rebind<uint8_t, decltype(d)> d8;
2537  const Full256<uint16_t> du;
2538 
2539  // We need byte indices for TableLookupBytes (one vector's worth for each of
2540  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
2541  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
2542  // with the doubling baked into the table. Unpacking nibbles is likely more
2543  // costly than the higher cache footprint from storing bytes.
2544  alignas(32) constexpr uint8_t table[256 * 8] = {
2545  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
2546  0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
2547  0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
2548  0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
2549  0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
2550  6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
2551  0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
2552  0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2553  2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
2554  0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
2555  0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
2556  0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
2557  0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
2558  6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
2559  8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
2560  0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
2561  4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
2562  10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
2563  0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
2564  0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
2565  0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
2566  4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
2567  0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
2568  0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2569  2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
2570  10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
2571  0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
2572  0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
2573  0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
2574  0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
2575  0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
2576  0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
2577  6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
2578  12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
2579  0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
2580  0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
2581  0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
2582  8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
2583  0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
2584  0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2585  2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
2586  8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
2587  12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
2588  0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
2589  0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
2590  10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
2591  12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
2592  0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
2593  4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
2594  6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
2595  0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
2596  0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
2597  0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
2598  4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
2599  12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
2600  0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2601  2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
2602  0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
2603  0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
2604  0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
2605  0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
2606  14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
2607  0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
2608  0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
2609  8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
2610  14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
2611  0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
2612  0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
2613  0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
2614  6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
2615  14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
2616  0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2617  2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
2618  14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
2619  0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
2620  0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
2621  0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
2622  6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
2623  10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
2624  0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
2625  4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
2626  8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
2627  0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
2628  0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
2629  0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
2630  4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
2631  0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
2632  0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2633  2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
2634  14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
2635  0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
2636  0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
2637  0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
2638  12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
2639  14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
2640  0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
2641  6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
2642  8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
2643  14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
2644  0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
2645  0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
2646  10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
2647  14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
2648  0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2649  2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
2650  10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
2651  12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
2652  0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
2653  0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
2654  8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
2655  10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
2656  0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
2657  4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
2658  6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
2659 
2660  const Vec256<uint8_t> byte_idx{Load(d8, table + mask_bits * 8).raw};
2661  const Vec256<uint16_t> pairs = ZipLower(byte_idx, byte_idx);
2662  return BitCast(d, pairs + Set(du, 0x0100));
2663 }
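
// Annotation (not part of the file): `table` stores per-u16-lane byte
// indices with the doubling (2 * lane) already applied. ZipLower pairs
// each index byte with itself, and adding 0x0100 turns the pair (i, i)
// into (i, i+1), exactly the two byte offsets TableLookupBytes needs.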
2664 
2665 template <typename T>
2666 HWY_INLINE Vec256<T> Idx32x4FromBits(const uint64_t mask_bits) {
2667  HWY_DASSERT(mask_bits < 16);
2668 
2669  // There are only 4 lanes, so we can afford to load the index vector directly.
2670  alignas(32) constexpr uint8_t packed_array[16 * 16] = {
2671  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2672  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2673  4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2674  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
2675  8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2676  0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2677  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2678  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
2679  12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2680  0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2681  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2682  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
2683  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2684  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2685  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2686  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2687 
2688  const Full256<T> d;
2689  const Repartition<uint8_t, decltype(d)> d8;
2690  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2691 }
2692 
2693 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2694 
2695 template <typename T>
2696 HWY_INLINE Vec256<T> Idx64x2FromBits(const uint64_t mask_bits) {
2697  HWY_DASSERT(mask_bits < 4);
2698 
2699  // There are only 2 lanes, so we can afford to load the index vector directly.
2700  alignas(32) constexpr uint8_t packed_array[4 * 16] = {
2701  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
2702  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
2703  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
2704  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2705 
2706  const Full256<T> d;
2707  const Repartition<uint8_t, decltype(d)> d8;
2708  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2709 }
2710 
2711 #endif
2712 
2713 // Helper functions called by both Compress and CompressStore - avoids a
2714 // redundant BitsFromMask in the latter.
2715 
2716 template <typename T>
2717 HWY_INLINE Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
2718  const uint64_t mask_bits) {
2719  const auto idx = detail::Idx16x8FromBits<T>(mask_bits);
2720  using D = Full256<T>;
2721  const RebindToSigned<D> di;
2722  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2723 }
2724 
2725 template <typename T>
2726 HWY_INLINE Vec256<T> Compress(hwy::SizeTag<4> /*tag*/, Vec256<T> v,
2727  const uint64_t mask_bits) {
2728  const auto idx = detail::Idx32x4FromBits<T>(mask_bits);
2729  using D = Full256<T>;
2730  const RebindToSigned<D> di;
2731  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2732 }
2733 
2734 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2735 
2736 template <typename T>
2737 HWY_INLINE Vec256<T> Compress(hwy::SizeTag<8> /*tag*/,
2738  Vec256<T> v,
2739  const uint64_t mask_bits) {
2740  const auto idx = detail::Idx64x2FromBits<uint64_t>(mask_bits);
2741  using D = Full256<T>;
2742  const RebindToSigned<D> di;
2743  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2744 }
2745 
2746 #endif
2747 
2748 } // namespace detail
2749 
2750 template <typename T>
2751 struct CompressIsPartition {
2752  enum { value = 1 };
2753 };
2754 
2755 template <typename T>
2756 HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
2757  const uint64_t mask_bits = detail::BitsFromMask(mask);
2758  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2759 }
2760 
2761 // ------------------------------ CompressNot
2762 template <typename T>
2763 HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
2764  return Compress(v, Not(mask));
2765 }
2766 
2767 // ------------------------------ CompressBlocksNot
2768 HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
2769  Mask256<uint64_t> mask) {
2770  HWY_ASSERT(0);
2771 }
2772 
2773 // ------------------------------ CompressBits
2774 
2775 template <typename T>
2776 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
2777  uint64_t mask_bits = 0;
2778  constexpr size_t kNumBytes = (N + 7) / 8;
2779  CopyBytes<kNumBytes>(bits, &mask_bits);
2780  if (N < 8) {
2781  mask_bits &= (1ull << N) - 1;
2782  }
2783 
2784  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2785 }
2786 
2787 // ------------------------------ CompressStore
2788 template <typename T>
2789 HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
2790  T* HWY_RESTRICT unaligned) {
2791  const uint64_t mask_bits = detail::BitsFromMask(mask);
2792  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2793  StoreU(c, d, unaligned);
2794  return PopCount(mask_bits);
2795 }
2796 
2797 // ------------------------------ CompressBlendedStore
2798 template <typename T>
2799 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2800  T* HWY_RESTRICT unaligned) {
2801  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
2802  using TU = TFromD<decltype(du)>;
2803  const uint64_t mask_bits = detail::BitsFromMask(m);
2804  const size_t count = PopCount(mask_bits);
2805  const Mask256<TU> store_mask = FirstN(du, count);
2806  const Vec256<TU> compressed =
2807  detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
2808  const Vec256<TU> prev = BitCast(du, LoadU(d, unaligned));
2809  StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned);
2810  return count;
2811 }
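
// Usage sketch (illustrative, not from the original header; assumes the
// Vec256 comparison operators and mask Not() defined earlier in this
// file): pack the lanes selected by a mask to the front and store only
// those lanes, returning how many were written.
HWY_MAYBE_UNUSED size_t CompressNonZero(Vec256<int32_t> v,
                                        int32_t* HWY_RESTRICT out) {
  const Full256<int32_t> d;
  const Mask256<int32_t> nonzero = Not(v == Zero(d));
  return CompressStore(v, nonzero, d, out);
}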
2812 
2813 // ------------------------------ CompressBitsStore
2814 
2815 template <typename T>
2816 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
2817  Full256<T> d, T* HWY_RESTRICT unaligned) {
2818  uint64_t mask_bits = 0;
2819  constexpr size_t kNumBytes = (N + 7) / 8;
2820  CopyBytes<kNumBytes>(bits, &mask_bits);
2821  if (N < 8) {
2822  mask_bits &= (1ull << N) - 1;
2823  }
2824 
2825  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2826  StoreU(c, d, unaligned);
2827  return PopCount(mask_bits);
2828 }
2829 
2830 // ------------------------------ StoreInterleaved2/3/4
2831 
2832 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
2833 // generic_ops-inl.h.
2834 
2835 // ------------------------------ MulEven/Odd (Load)
2836 
2837 HWY_API Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
2838  const Vec256<uint64_t> b) {
2839  alignas(32) uint64_t mul[2];
2840  mul[0] =
2841  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
2842  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
2843  return Load(Full256<uint64_t>(), mul);
2844 }
2845 
2846 HWY_API Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
2847  const Vec256<uint64_t> b) {
2848  alignas(32) uint64_t mul[2];
2849  mul[0] =
2850  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
2851  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
2852  return Load(Full256<uint64_t>(), mul);
2853 }
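
// Annotation (not in the original file): the u64 lanes are multiplied via
// the scalar Mul128 helper from base.h, which returns the low 64 bits of
// the 128-bit product and writes the high 64 bits through its pointer
// argument; the {low, high} pair is then reloaded as one vector.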
2854 
2855 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2856 
2857 HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
2858  Vec256<bfloat16_t> a,
2859  Vec256<bfloat16_t> b,
2860  const Vec256<float> sum0,
2861  Vec256<float>& sum1) {
2862  const Repartition<uint16_t, decltype(df32)> du16;
2863  const RebindToUnsigned<decltype(df32)> du32;
2864  const Vec256<uint16_t> zero = Zero(du16);
2865  const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
2866  const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
2867  const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
2868  const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
2869  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2870  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2871 }
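
// Annotation (not part of the header): zipping each bf16 with a zero u16
// places the bf16 bits in the upper half of a u32, which bit-for-bit
// reconstructs the original f32 value. The bf16 dot product thus becomes
// two ordinary f32 MulAdds accumulating into sum0 and sum1.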
2872 
2873 // ------------------------------ Reductions
2874 
2875 namespace detail {
2876 
2877 // u32/i32/f32:
2878 
2879 template <typename T>
2880 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
2881  const Vec256<T> v3210) {
2882  const Vec256<T> v1032 = Shuffle1032(v3210);
2883  const Vec256<T> v31_20_31_20 = v3210 + v1032;
2884  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2885  return v20_31_20_31 + v31_20_31_20;
2886 }
2887 template <typename T>
2888 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
2889  const Vec256<T> v3210) {
2890  const Vec256<T> v1032 = Shuffle1032(v3210);
2891  const Vec256<T> v31_20_31_20 = Min(v3210, v1032);
2892  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2893  return Min(v20_31_20_31, v31_20_31_20);
2894 }
2895 template <typename T>
2896 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
2897  const Vec256<T> v3210) {
2898  const Vec256<T> v1032 = Shuffle1032(v3210);
2899  const Vec256<T> v31_20_31_20 = Max(v3210, v1032);
2900  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2901  return Max(v20_31_20_31, v31_20_31_20);
2902 }
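
// Annotation (illustrative): each 32-bit reduction does log2(4) = 2
// combine steps; the shuffles rotate lanes so that after the second step
// every lane holds the result over all four lanes, i.e. the reduction is
// broadcast, as the public wrappers below document.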
2903 
2904 // u64/i64/f64:
2905 
2906 template <typename T>
2907 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
2908  const Vec256<T> v10) {
2909  const Vec256<T> v01 = Shuffle01(v10);
2910  return v10 + v01;
2911 }
2912 template <typename T>
2913 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
2914  const Vec256<T> v10) {
2915  const Vec256<T> v01 = Shuffle01(v10);
2916  return Min(v10, v01);
2917 }
2918 template <typename T>
2919 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
2920  const Vec256<T> v10) {
2921  const Vec256<T> v01 = Shuffle01(v10);
2922  return Max(v10, v01);
2923 }
2924 
2925 // u16/i16
2926 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2927 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
2928  const Full256<uint32_t> d32;
2929  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
2930  const auto odd = ShiftRight<16>(BitCast(d32, v));
2931  const auto min = MinOfLanes(d32, Min(even, odd));
2932  // Also broadcast into odd lanes.
2933  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
2934 }
2935 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2936 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
2937  const Full256<uint32_t> d32;
2938  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
2939  const auto odd = ShiftRight<16>(BitCast(d32, v));
2940  const auto min = MaxOfLanes(d32, Max(even, odd));
2941  // Also broadcast into odd lanes.
2942  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
2943 }
2944 
2945 } // namespace detail
2946 
2947 // Supported for u/i/f 32/64. Returns the same value in each lane.
2948 template <typename T>
2949 HWY_API Vec256<T> SumOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
2950  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2951 }
2952 template <typename T>
2953 HWY_API Vec256<T> MinOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
2954  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2955 }
2956 template <typename T>
2957 HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
2958  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2959 }
2960 
2961 // ------------------------------ Lt128
2962 
2963 template <typename T>
2964 HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b);
2965 
2966 template <typename T>
2967 HWY_INLINE Mask256<T> Lt128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b);
2968 
2969 template <typename T>
2970 HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b);
2971 
2972 template <typename T>
2973 HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b);
2974 
2975 template <typename T>
2976 HWY_INLINE Vec256<T> Min128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b);
2977 
2978 template <typename T>
2979 HWY_INLINE Vec256<T> Max128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b);
2980 
2981 // NOLINTNEXTLINE(google-readability-namespace-comments)
2982 } // namespace HWY_NAMESPACE
2983 } // namespace hwy