Grok  10.0.3
emu128-inl.h
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors emulated with portable scalar code (one lane at a time).
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <string.h>  // memcpy (used by IsNaN)

#include <cmath>  // std::abs, std::isnan, std::signbit, std::sqrt

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;

  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
  // prevents using range for, especially in SumOfLanes, where it would be
  // incorrect. Moving padding to another field would require handling the case
  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
  T raw[16 / sizeof(T)] = {};
};

// 0 or FF..FF, same size as Vec128.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = hwy::MakeUnsigned<T>;
  static HWY_INLINE Raw FromBool(bool b) {
    return b ? static_cast<Raw>(~Raw{0}) : 0;
  }

  // Must match the size of Vec128.
  Raw bits[16 / sizeof(T)] = {};
};

namespace detail {

// Deduce Simd<T, N, 0> from Vec128<T, N>
struct Deduce128 {
  template <typename T, size_t N>
  Simd<T, N, 0> operator()(Vec128<T, N>) const {
    return Simd<T, N, 0>();
  }
};

}  // namespace detail

template <class V>
using DFromV = decltype(detail::Deduce128()(V()));

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ------------------------------ BitCast

template <typename T, size_t N, typename FromT, size_t FromN>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */, Vec128<FromT, FromN> v) {
  Vec128<T, N> to;
  static_assert(sizeof(T) * N == sizeof(FromT) * FromN,
                "Casting does not change size");
  CopyBytes<sizeof(T) * N>(v.raw, to.raw);
  return to;
}
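
// Usage sketch (illustrative, caller-side; not part of the API here):
// reinterpret float lanes as u32 to inspect their IEEE-754 bit patterns.
//   const Full128<float> df;
//   const Full128<uint32_t> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each lane: 0x3F800000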

// ------------------------------ Set

template <typename T, size_t N>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  Vec128<T, N> v;
  ZeroBytes<sizeof(T) * N>(v.raw);
  return v;
}

template <class D>
using VFromD = decltype(Zero(D()));

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Set(Simd<T, N, 0> /* tag */, const T2 t) {
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(t);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
  return Zero(d);
}

namespace detail {

template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE constexpr T IncrementWithWraparound(T t) {
  return t + T{1};
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE constexpr T IncrementWithWraparound(T t) {
  using TU = MakeUnsigned<T>;
  return static_cast<T>(static_cast<TU>(static_cast<TU>(t) + TU{1}) &
                        hwy::LimitsMax<TU>());
}

}  // namespace detail

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> /* tag */, T2 first) {
  Vec128<T, N> v;
  T counter = static_cast<T>(first);
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = counter;
    counter = detail::IncrementWithWraparound(counter);
  }
  return v;
}
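
// Usage sketch (illustrative only; `d` is a caller-side tag):
//   const Simd<int32_t, 4, 0> d;
//   const auto lane_ids = Iota(d, 0);   // {0, 1, 2, 3}
//   const auto hundreds = Set(d, 100);  // {100, 100, 100, 100}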

// ================================================== LOGICAL

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  VFromD<decltype(du)> vu = BitCast(du, v);
  for (size_t i = 0; i < N; ++i) {
    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
  }
  return BitCast(d, vu);
}

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] &= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

// ------------------------------ AndNot
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(Not(a), b);
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] |= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] ^= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(const Vec128<T, N> o, const Vec128<T, N> a1,
                           const Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Or(And(mask, yes), AndNot(mask, no));
}
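
// Bitwise-select sketch (illustrative): with an all-ones/all-zeros `mask`
// vector, IfVecThenElse is the classic (mask & yes) | (~mask & no) blend.
//   const Full128<uint32_t> du;
//   const auto m = VecFromMask(du, Iota(du, 0) == Set(du, 2u));
//   const auto blended = IfVecThenElse(m, Set(du, 1u), Set(du, 9u));
//   // blended = {9, 9, 1, 9}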

// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Simd<T, N, 0>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
}

// ------------------------------ BroadcastSignBit
template <typename T, size_t N>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  // This is used inside ShiftRight, so we cannot implement in terms of it.
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
  }
  return v;
}

// ------------------------------ Mask

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
                                   Mask128<TFrom, N> mask) {
  Mask128<TTo, N> to;
  static_assert(sizeof(TTo) == sizeof(TFrom), "Must have same size");
  CopyBytes<sizeof(TTo) * N>(mask.bits, to.bits);
  return to;
}

// v must be 0 or FF..FF.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  Mask128<T, N> mask;
  static_assert(sizeof(v) == sizeof(mask), "Must have same size");
  CopyBytes<sizeof(T) * N>(v.raw, mask.bits);
  return mask;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> mask) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(mask.bits, v.raw);
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> mask) {
  return VecFromMask(mask);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> FirstN(Simd<T, N, 0> /*tag*/, size_t n) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(i < n);
  }
  return m;
}
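
// Masked-prefix sketch (illustrative): FirstN is typically used for loop
// remainders; with IfThenElseZero (below) it keeps only the first n lanes.
//   const Simd<float, 4, 0> d;
//   const auto kept = IfThenElseZero(FirstN(d, 3), Iota(d, 1.0f));
//   // kept = {1, 2, 3, 0}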

// Returns mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(const Mask128<T, N> mask,
                                const Vec128<T, N> yes,
                                const Vec128<T, N> no) {
  return IfVecThenElse(VecFromMask(mask), yes, no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
                                    const Vec128<T, N> yes) {
  return IfVecThenElse(VecFromMask(mask), yes, Zero(Simd<T, N, 0>()));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
                                    const Vec128<T, N> no) {
  return IfVecThenElse(VecFromMask(mask), Zero(Simd<T, N, 0>()), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = v.raw[i] < 0 ? yes.raw[i] : no.raw[i];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(const Vec128<T, N> v) {
  return IfNegativeThenElse(v, Zero(Simd<T, N, 0>()), v);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << kBits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
    }
  }
#endif
  return v;
}
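
// Worked example (illustrative): the pre-C++20 branch reconstructs an
// arithmetic shift. For int8_t v = -128 (bits 0x80) and kBits = 1:
//   shifted = 0x80u >> 1 = 0x40, sign = 0xFF, sign_shift = 8 - 1 - 1 = 6,
//   upper = 0xFF << 6 = 0xC0, result = 0x40 | 0xC0 = 0xC0 = -64,
// which matches -128 >> 1 under two's complement arithmetic shift.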

// ------------------------------ RotateRight (ShiftRight)

namespace detail {

// For partial specialization: kBits == 0 results in an invalid shift count
template <int kBits>
struct RotateRight {
  template <typename T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
    return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
  }
};

template <>
struct RotateRight<0> {
  template <typename T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
    return v;
  }
};

}  // namespace detail

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return detail::RotateRight<kBits>()(v);
}

// ------------------------------ ShiftLeftSame

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits);  // unsigned, logical shift
    }
  }
#endif
  return v;
}

// ------------------------------ Shl

template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
                         << bits.raw[i];
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted =
          static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift = static_cast<size_t>(
          static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
    }
  }
#endif
  return v;
}

// ================================================== ARITHMETIC

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] += b.raw[i];
  }
  return a;
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] -= b.raw[i];
  }
  return a;
}

// ------------------------------ SumsOf8

template <size_t N>
HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(const Vec128<uint8_t, N> v) {
  Vec128<uint64_t, (N + 7) / 8> sums;  // zero-initialized by Vec128
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}

// ------------------------------ SaturatedAdd
template <typename T, size_t N>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] + b.raw[i]),
                hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ SaturatedSub
template <typename T, size_t N>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] - b.raw[i]),
                hwy::HighestValue<T>()));
  }
  return a;
}
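
// Saturation sketch (illustrative): for uint8_t lanes, 200 + 100 clamps to
// 255 and 5 - 9 clamps to 0 instead of wrapping:
//   const Simd<uint8_t, 8, 0> d;
//   const auto sum = SaturatedAdd(Set(d, 200), Set(d, 100));  // all 255
//   const auto diff = SaturatedSub(Set(d, 5), Set(d, 9));     // all 0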

// ------------------------------ AverageRound
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
  }
  return a;
}

// ------------------------------ Abs

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
  for (size_t i = 0; i < N; ++i) {
    const T s = a.raw[i];
    const T min = hwy::LimitsMin<T>();
    a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
  }
  return a;
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = std::abs(v.raw[i]);
  }
  return v;
}

// ------------------------------ Min/Max

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
  }
  return a;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (std::isnan(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (std::isnan(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
  }
  return a;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (std::isnan(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (std::isnan(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return Xor(v, SignBit(Simd<T, N, 0>()));
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return Zero(Simd<T, N, 0>()) - v;
}

// ------------------------------ Mul/Div

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] *= b.raw[i];
  }
  return a;
}

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<int64_t>(a.raw[i]) * b.raw[i]);
  }
  return a;
}

template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) * b.raw[i]);
  }
  return a;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] /= b.raw[i];
  }
  return a;
}

// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
  }
  return a;
}
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    // Cast to uint32_t first to prevent overflow. Otherwise the result of
    // uint16_t * uint16_t is in "int" which may overflow. In practice the
    // result is the same but this way it is also defined.
    a.raw[i] = static_cast<uint16_t>(
        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
        16);
  }
  return a;
}

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((2 * a.raw[i] * b.raw[i] + 32768) >> 16);
  }
  return a;
}

// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  Vec128<int64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const int64_t a64 = a.raw[i];
    mul.raw[i / 2] = a64 * b.raw[i];
  }
  return mul;
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  Vec128<uint64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const uint64_t a64 = a.raw[i];
    mul.raw[i / 2] = a64 * b.raw[i];
  }
  return mul;
}

template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(const Vec128<int32_t, N> a,
                                            const Vec128<int32_t, N> b) {
  Vec128<int64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const int64_t a64 = a.raw[i + 1];
    mul.raw[i / 2] = a64 * b.raw[i + 1];
  }
  return mul;
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
                                             const Vec128<uint32_t, N> b) {
  Vec128<uint64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const uint64_t a64 = a.raw[i + 1];
    mul.raw[i / 2] = a64 * b.raw[i + 1];
  }
  return mul;
}
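
// Widening-multiply sketch (illustrative): MulEven/MulOdd avoid overflow by
// returning double-wide lanes, e.g. 32x32 -> 64 bits:
//   const Simd<uint32_t, 4, 0> d;
//   const auto big = Set(d, 0xFFFFFFFFu);
//   const auto sq = MulEven(big, big);  // lanes hold 0xFFFFFFFE00000001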

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    // Zero inputs are allowed, but callers are responsible for replacing the
    // return value with something else (typically using IfThenElse). This
    // check avoids a ubsan error. The result is arbitrary.
    v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
  }
  return v;
}

template <size_t N>
HWY_API Vec128<float, N> AbsDiff(Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

template <typename T, size_t N>
HWY_API Vec128<T, N> MulAdd(const Vec128<T, N> mul, const Vec128<T, N> x,
                            const Vec128<T, N> add) {
  return mul * x + add;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> NegMulAdd(const Vec128<T, N> mul, const Vec128<T, N> x,
                               const Vec128<T, N> add) {
  return add - mul * x;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MulSub(const Vec128<T, N> mul, const Vec128<T, N> x,
                            const Vec128<T, N> sub) {
  return mul * x - sub;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> NegMulSub(const Vec128<T, N> mul, const Vec128<T, N> x,
                               const Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    const float half = v.raw[i] * 0.5f;
    uint32_t bits;
    CopyBytes<4>(&v.raw[i], &bits);
    // Initial guess based on log2(f)
    bits = 0x5F3759DF - (bits >> 1);
    CopyBytes<4>(&bits, &v.raw[i]);
    // One Newton-Raphson iteration
    v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
  }
  return v;
}
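
// Note (derivation, for reference): this is the classic "fast inverse square
// root". Reinterpreting the float as an integer approximates log2(v), so
// 0x5F3759DF - (bits >> 1) approximates the bit pattern of v^(-1/2); the
// Newton-Raphson step above then refines x via x' = x * (1.5 - 0.5 * v * x^2).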

template <typename T, size_t N>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = std::sqrt(v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point rounding

template <typename T, size_t N>
HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> a = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
    const TI rounded = static_cast<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};  // -T{0} preserves the sign
      continue;
    }
    const T rounded_f = static_cast<T>(rounded);
    // Round to even
    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
      v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
      continue;
    }
    v.raw[i] = rounded_f;
  }
  return v;
}
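
// Half-to-even sketch (illustrative): ties round to the nearest even integer,
// e.g. Round(0.5f) == 0.0f, Round(1.5f) == 2.0f, Round(2.5f) == 2.0f.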

// Round-to-nearest even.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  using T = float;
  using TI = int32_t;

  const Vec128<float, N> abs = Abs(v);
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const bool signbit = std::signbit(v.raw[i]);

    if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      // Check if too large to cast or NaN
      if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
        ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
        continue;
      }
      ret.raw[i] = static_cast<TI>(v.raw[i]);
      continue;
    }
    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
    const TI rounded = static_cast<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      ret.raw[i] = 0;
      continue;
    }
    const T rounded_f = static_cast<T>(rounded);
    // Round to even
    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
      ret.raw[i] = rounded - (signbit ? -1 : 1);
      continue;
    }
    ret.raw[i] = rounded;
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> abs = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(abs.raw[i] <= MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const TI truncated = static_cast<TI>(v.raw[i]);
    if (truncated == 0) {
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
      continue;
    }
    v.raw[i] = static_cast<T>(truncated);
  }
  return v;
}

// Toward +infinity, aka ceiling
template <typename Float, size_t N>
HWY_API Vec128<Float, N> Ceil(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool positive = v.raw[i] > Float(0.0);

    Bits bits;
    CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| < 1 => 0 or 1.
    if (exponent < 0) {
      v.raw[i] = positive ? Float{1} : Float{-0.0};
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round up
    if (positive) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
  }
  return v;
}

// Toward -infinity, aka floor
template <typename Float, size_t N>
HWY_API Vec128<Float, N> Floor(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool negative = v.raw[i] < Float(0.0);

    Bits bits;
    CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| < 1 => -1 or 0.
    if (exponent < 0) {
      v.raw[i] = negative ? Float(-1.0) : Float(0.0);
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round down
    if (negative) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point classification

template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  Mask128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
    MakeUnsigned<T> bits;
    memcpy(&bits, &v.raw[i], sizeof(T));
    bits += bits;
    bits >>= 1;  // clear sign bit
    // NaN if all exponent bits are set and the mantissa is not zero.
    ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
  }
  return ret;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  using VI = VFromD<decltype(di)>;
  using VU = VFromD<decltype(du)>;
  const VU vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VI exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
  }
  return m;
}

// ------------------------------ Lt128

// Only makes sense for full vectors of u64.
HWY_API Mask128<uint64_t> Lt128(Simd<uint64_t, 2, 0> /* tag */,
                                Vec128<uint64_t> a, const Vec128<uint64_t> b) {
  const bool lt =
      (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}

HWY_API Mask128<uint64_t> Lt128Upper(Simd<uint64_t, 2, 0> /* tag */,
                                     Vec128<uint64_t> a,
                                     const Vec128<uint64_t> b) {
  const bool lt = a.raw[1] < b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}

// ------------------------------ Min128, Max128 (Lt128)

template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}
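
// 128-bit compare sketch (illustrative): the two u64 lanes are treated as one
// unsigned 128-bit number, lane 1 being the upper half. With a = {lo=5, hi=1}
// and b = {lo=4, hi=2}, Lt128 reports a < b because the upper halves decide
// (1 < 2), regardless of 5 > 4.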

// ================================================== MEMORY

// ------------------------------ Load

template <typename T, size_t N>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */,
                          const T* HWY_RESTRICT aligned) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(aligned, v.raw);
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// ------------------------------ Store

template <typename T, size_t N>
HWY_API void Store(const Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                   T* HWY_RESTRICT aligned) {
  CopyBytes<sizeof(T) * N>(v.raw, aligned);
}

template <typename T, size_t N>
HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T, size_t N>
HWY_API void BlendedStore(const Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  for (size_t i = 0; i < N; ++i) {
    if (m.bits[i]) p[i] = v.raw[i];
  }
}

// ------------------------------ LoadInterleaved2/3/4

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
// We implement those here because scalar code is likely faster than emulation
// via shuffles.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <typename T, size_t N>
HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
}

template <typename T, size_t N>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1,
                              Vec128<T, N>& v2) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  alignas(16) T buf2[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
}

template <typename T, size_t N>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1,
                              Vec128<T, N>& v2, Vec128<T, N>& v3) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  alignas(16) T buf2[N];
  alignas(16) T buf3[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
    buf3[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
  v3 = Load(d, buf3);
}
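
// Deinterleave sketch (illustrative): splitting packed RGB bytes into three
// planar vectors; `rgb` is a caller-provided pointer to 3*16 bytes.
//   const Simd<uint8_t, 16, 0> d;
//   Vec128<uint8_t, 16> r, g, b;
//   LoadInterleaved3(d, rgb, r, g, b);  // r = {rgb[0], rgb[3], rgb[6], ...}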

// ------------------------------ StoreInterleaved2/3/4

template <typename T, size_t N>
HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
  }
}

template <typename T, size_t N>
HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               const Vec128<T, N> v2, Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
  }
}

template <typename T, size_t N>
HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               const Vec128<T, N> v2, const Vec128<T, N> v3,
                               Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
    *unaligned++ = v3.raw[i];
  }
}

// ------------------------------ Stream

template <typename T, size_t N>
HWY_API void Stream(const Vec128<T, N> v, Simd<T, N, 0> d,
                    T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ Scatter

template <typename T, size_t N, typename Offset>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  for (size_t i = 0; i < N; ++i) {
    uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i];
    CopyBytes<sizeof(T)>(&v.raw[i], base8);
  }
}

template <typename T, size_t N, typename Index>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                          T* HWY_RESTRICT base, const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  for (size_t i = 0; i < N; ++i) {
    base[index.raw[i]] = v.raw[i];
  }
}

// ------------------------------ Gather

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> /* tag */, const T* base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    const uint8_t* base8 =
        reinterpret_cast<const uint8_t*>(base) + offset.raw[i];
    CopyBytes<sizeof(T)>(base8, &v.raw[i]);
  }
  return v;
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> /* tag */,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = base[index.raw[i]];
  }
  return v;
}

// ================================================== CONVERT

// ConvertTo and DemoteTo with floating-point input and integer output truncate
// (rounding toward zero).

template <typename FromT, typename ToT, size_t N>
HWY_API Vec128<ToT, N> PromoteTo(Simd<ToT, N, 0> /* tag */,
                                 Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // For bits Y > X, floatX->floatY and intX->intY are always representable.
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
// so we overload for FromT=double and ToT={float,int32_t}.
template <size_t N>
HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
                                  Vec128<double, N> from) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Prevent ubsan errors when converting float to narrower integer/float
    if (std::isinf(from.raw[i]) ||
        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
                                             : HighestValue<float>();
      continue;
    }
    ret.raw[i] = static_cast<float>(from.raw[i]);
  }
  return ret;
}
template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
                                    Vec128<double, N> from) {
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Prevent ubsan errors when converting out-of-range doubles to int32_t
    if (std::isinf(from.raw[i]) ||
        std::fabs(from.raw[i]) >
            static_cast<double>(HighestValue<int32_t>())) {
      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<int32_t>()
                                             : HighestValue<int32_t>();
      continue;
    }
    ret.raw[i] = static_cast<int32_t>(from.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N>
HWY_API Vec128<ToT, N> DemoteTo(Simd<ToT, N, 0> /* tag */,
                                Vec128<FromT, N> from) {
  static_assert(!IsFloat<FromT>(), "FromT=double is handled above");
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Int to int: choose closest value in ToT to `from` (avoids UB)
    from.raw[i] =
        HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw[i]), LimitsMax<ToT>());
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(BitCast(du32, b));
  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
  const Vec128<uint32_t, N> a_mask = Set(du32, 0xFFFF0000);
  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}

namespace detail {

HWY_INLINE void StoreU16ToF16(const uint16_t val,
                              float16_t* HWY_RESTRICT to) {
#if HWY_NATIVE_FLOAT16
  CopyBytes<2>(&val, to);
#else
  to->bits = val;
#endif
}

HWY_INLINE uint16_t U16FromF16(const float16_t* HWY_RESTRICT from) {
#if HWY_NATIVE_FLOAT16
  uint16_t bits16;
  CopyBytes<2>(from, &bits16);
  return bits16;
#else
  return from->bits;
#endif
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
                                   const Vec128<float16_t, N> v) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const uint16_t bits16 = detail::U16FromF16(&v.raw[i]);
    const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
    const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
    const uint32_t mantissa = bits16 & 0x3FF;

    // Subnormal or zero
    if (biased_exp == 0) {
      const float subnormal =
          (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
      ret.raw[i] = sign ? -subnormal : subnormal;
      continue;
    }

    // Normalized: convert the representation directly (faster than
    // ldexp/tables).
    const uint32_t biased_exp32 = biased_exp + (127 - 15);
    const uint32_t mantissa32 = mantissa << (23 - 10);
    const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
    CopyBytes<4>(&bits32, &ret.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
                                   const Vec128<bfloat16_t, N> v) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = F32FromBF16(v.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
                                      const Vec128<float, N> v) {
  Vec128<float16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    uint32_t bits32;
    CopyBytes<4>(&v.raw[i], &bits32);
    const uint32_t sign = bits32 >> 31;
    const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
    const uint32_t mantissa32 = bits32 & 0x7FFFFF;

    const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);

    // Tiny or zero => zero.
    if (exp < -24) {
      ZeroBytes<sizeof(uint16_t)>(&ret.raw[i]);
      continue;
    }

    uint32_t biased_exp16, mantissa16;

    // exp = [-24, -15] => subnormal
    if (exp < -14) {
      biased_exp16 = 0;
      const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
      HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
      mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
                                         (mantissa32 >> (13 + sub_exp)));
    } else {
      // exp = [-14, 15]
      biased_exp16 = static_cast<uint32_t>(exp + 15);
      HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
      mantissa16 = mantissa32 >> 13;
    }

    HWY_DASSERT(mantissa16 < 1024);
    const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
    HWY_DASSERT(bits16 < 0x10000);
    const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
    detail::StoreU16ToF16(narrowed, &ret.raw[i]);
  }
  return ret;
}
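
// Demote sketch (illustrative): float -> float16 keeps 10 mantissa bits;
// 1.0f becomes bit pattern 0x3C00 (sign 0, biased exponent 15, mantissa 0),
// and magnitudes with exponent below -24 flush to zero as handled above.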

template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> /* tag */,
                                       const Vec128<float, N> v) {
  Vec128<bfloat16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = BF16FromF32(v.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N, HWY_IF_FLOAT(FromT)>
HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
                                 Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // float## -> int##: return closest representable value. We cannot exactly
    // represent LimitsMax<ToT> in FromT, so use double.
    const double f = static_cast<double>(from.raw[i]);
    if (std::isinf(from.raw[i]) ||
        std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
      ret.raw[i] =
          std::signbit(from.raw[i]) ? LimitsMin<ToT>() : LimitsMax<ToT>();
      continue;
    }
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N, HWY_IF_NOT_FLOAT(FromT)>
HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
                                 Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // int## -> float##: no check needed
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  return DemoteTo(Simd<uint8_t, N, 0>(), v);
}

// ================================================== COMBINE

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
                                   Vec128<T, N> v) {
  return LowerHalf(v);
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> UpperHalf(Simd<T, N / 2, 0> /* tag */,
                                   Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(&v.raw[N / 2], ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> /* tag */,
                                      Vec128<T, N / 2> v) {
  Vec128<T, N> ret;  // upper half is zero-initialized by Vec128
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> /* tag */, Vec128<T, N / 2> hi_half,
                             Vec128<T, N / 2> lo_half) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo_half.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi_half.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> /* tag */,
                                      const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                Vec128<T, N> lo) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[i] = lo.raw[2 * i];
  }
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[N / 2 + i] = hi.raw[2 * i];
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                               Vec128<T, N> lo) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[i] = lo.raw[2 * i + 1];
  }
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[N / 2 + i] = hi.raw[2 * i + 1];
  }
  return ret;
}

// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, size_t N, class V = Vec128<T, N>>
HWY_API V CombineShiftRightBytes(Simd<T, N, 0> /* tag */, V hi, V lo) {
  V ret;
  const uint8_t* HWY_RESTRICT lo8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<sizeof(T) * N - kBytes>(lo8 + kBytes, ret8);
  CopyBytes<kBytes>(hi.raw, ret8 + sizeof(T) * N - kBytes);
  return ret;
}
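
// Byte-extraction sketch (illustrative): this mirrors x86 palignr. With
// 16-byte vectors lo = {0..15} and hi = {16..31}, kBytes = 4 yields
// {4..19}: the top 12 bytes of lo followed by the low 4 bytes of hi.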

// ------------------------------ ShiftLeftBytes

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  Vec128<T, N> ret;
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  ZeroBytes<kBytes>(ret8);
  CopyBytes<sizeof(T) * N - kBytes>(v.raw, ret8 + kBytes);
  return ret;
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  Vec128<T, N> ret;
  const uint8_t* HWY_RESTRICT v8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<sizeof(T) * N - kBytes>(v8 + kBytes, ret8);
  ZeroBytes<kBytes>(ret8 + sizeof(T) * N - kBytes);
  return ret;
}

// ------------------------------ ShiftRightLanes
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}

// ================================================== SWIZZLE

template <typename T, size_t N>
HWY_API T GetLane(const Vec128<T, N> v) {
  return v.raw[0];
}

template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
  v.raw[i] = t;
  return v;
}

template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
  return v.raw[i];
}

template <typename T, size_t N>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  for (size_t i = 0; i < N; i += 2) {
    v.raw[i + 1] = v.raw[i];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  for (size_t i = 0; i < N; i += 2) {
    v.raw[i] = v.raw[i + 1];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
  for (size_t i = 0; i < N; i += 2) {
    odd.raw[i] = even.raw[i];
  }
  return odd;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks

template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
  MakeSigned<T> raw[N];
};

template <typename T, size_t N, typename TI>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0>, Vec128<TI, N> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
  Indices128<T, N> ret;
  CopyBytes<N * sizeof(T)>(vec.raw, ret.raw);
  return ret;
}

template <typename T, size_t N, typename TI>
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
  return IndicesFromVec(d, LoadU(Simd<TI, N, 0>(), idx));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(const Vec128<T, N> v,
                                      const Indices128<T, N> idx) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = v.raw[idx.raw[i]];
  }
  return ret;
}
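
// Permutation sketch (illustrative): reversing four lanes via indices.
//   const Simd<int32_t, 4, 0> d;
//   const int32_t idx[4] = {3, 2, 1, 0};
//   const auto rev = TableLookupLanes(Iota(d, 0), SetTableIndices(d, idx));
//   // rev = {3, 2, 1, 0}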
1867 
1868 // ------------------------------ ReverseBlocks
1869 
1870 // Single block: no change
1871 template <typename T, size_t N>
1873  const Vec128<T, N> v) {
1874  return v;
1875 }
1876 
1877 // ------------------------------ Reverse
1878 
1879 template <typename T, size_t N>
1881  Vec128<T, N> ret;
1882  for (size_t i = 0; i < N; ++i) {
1883  ret.raw[i] = v.raw[N - 1 - i];
1884  }
1885  return ret;
1886 }
1887 
1888 template <typename T, size_t N>
1890  Vec128<T, N> ret;
1891  for (size_t i = 0; i < N; i += 2) {
1892  ret.raw[i + 0] = v.raw[i + 1];
1893  ret.raw[i + 1] = v.raw[i + 0];
1894  }
1895  return ret;
1896 }
1897 
1898 template <typename T, size_t N>
1900  Vec128<T, N> ret;
1901  for (size_t i = 0; i < N; i += 4) {
1902  ret.raw[i + 0] = v.raw[i + 3];
1903  ret.raw[i + 1] = v.raw[i + 2];
1904  ret.raw[i + 2] = v.raw[i + 1];
1905  ret.raw[i + 3] = v.raw[i + 0];
1906  }
1907  return ret;
1908 }
1909 
1910 template <typename T, size_t N>
1911 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
1912  Vec128<T, N> ret;
1913  for (size_t i = 0; i < N; i += 8) {
1914  ret.raw[i + 0] = v.raw[i + 7];
1915  ret.raw[i + 1] = v.raw[i + 6];
1916  ret.raw[i + 2] = v.raw[i + 5];
1917  ret.raw[i + 3] = v.raw[i + 4];
1918  ret.raw[i + 4] = v.raw[i + 3];
1919  ret.raw[i + 5] = v.raw[i + 2];
1920  ret.raw[i + 6] = v.raw[i + 1];
1921  ret.raw[i + 7] = v.raw[i + 0];
1922  }
1923  return ret;
1924 }
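// Hedged sketch contrasting the Reverse* granularities on uint16 lanes;
// ReverseGranularityExample is a hypothetical name. Reverse reorders all N
// lanes, whereas Reverse2/Reverse4/Reverse8 reverse within groups of 2/4/8.
HWY_INLINE Vec128<uint16_t, 8> ReverseGranularityExample(Vec128<uint16_t, 8> v) {
  const Simd<uint16_t, 8, 0> d;
  const Vec128<uint16_t, 8> all = Reverse(d, v);     // {v7,v6,...,v0}
  const Vec128<uint16_t, 8> pairs = Reverse2(d, v);  // {v1,v0,v3,v2,...}
  return all + pairs;  // combined only so both results are used
}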
1925 
1926 // ================================================== BLOCKWISE
1927 
1928 // ------------------------------ Shuffle*
1929 
1930 // Swap 32-bit halves in 64-bit halves.
1931 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1932 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
1933  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1934  return Reverse2(DFromV<decltype(v)>(), v);
1935 }
1936 
1937 // Swap 64-bit halves
1938 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1939 HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
1940  Vec128<T> ret;
1941  ret.raw[3] = v.raw[1];
1942  ret.raw[2] = v.raw[0];
1943  ret.raw[1] = v.raw[3];
1944  ret.raw[0] = v.raw[2];
1945  return ret;
1946 }
1947 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1948 HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
1949  return Reverse2(DFromV<decltype(v)>(), v);
1950 }
1951 
1952 // Rotate right 32 bits
1953 template <typename T>
1954 HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
1955  Vec128<T> ret;
1956  ret.raw[3] = v.raw[0];
1957  ret.raw[2] = v.raw[3];
1958  ret.raw[1] = v.raw[2];
1959  ret.raw[0] = v.raw[1];
1960  return ret;
1961 }
1962 
1963 // Rotate left 32 bits
1964 template <typename T>
1965 HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
1966  Vec128<T> ret;
1967  ret.raw[3] = v.raw[2];
1968  ret.raw[2] = v.raw[1];
1969  ret.raw[1] = v.raw[0];
1970  ret.raw[0] = v.raw[3];
1971  return ret;
1972 }
1973 
1974 template <typename T>
1975 HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
1976  return Reverse4(DFromV<decltype(v)>(), v);
1977 }
1978 
1979 // ------------------------------ Broadcast/splat any lane
1980 
1981 template <int kLane, typename T, size_t N>
1982 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
1983  for (size_t i = 0; i < N; ++i) {
1984  v.raw[i] = v.raw[kLane];
1985  }
1986  return v;
1987 }
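// Hedged sketch: Broadcast<2> splats lane 2, so {10,20,30,40} becomes
// {30,30,30,30}. BroadcastExample is a hypothetical name.
HWY_INLINE Vec128<int32_t, 4> BroadcastExample(Vec128<int32_t, 4> v) {
  return Broadcast<2>(v);
}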
1988 
1989 // ------------------------------ TableLookupBytes, TableLookupBytesOr0
1990 
1991 template <typename T, size_t N, typename TI, size_t NI>
1992 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> v,
1993  const Vec128<TI, NI> indices) {
1994  const uint8_t* HWY_RESTRICT v_bytes =
1995  reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
1996  const uint8_t* HWY_RESTRICT idx_bytes =
1997  reinterpret_cast<const uint8_t*>(indices.raw);
1998  Vec128<TI, NI> ret;
1999  uint8_t* HWY_RESTRICT ret_bytes =
2000  reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2001  for (size_t i = 0; i < NI * sizeof(TI); ++i) {
2002  const size_t idx = idx_bytes[i];
2003  // Avoid out of bounds reads.
2004  ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
2005  }
2006  return ret;
2007 }
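// Hedged sketch of a byte shuffle; ByteReverseExample is a hypothetical name.
// Each index byte selects a source byte, and indices >= 16 produce 0 per the
// bounds check in the loop above.
HWY_INLINE Vec128<uint8_t> ByteReverseExample(Vec128<uint8_t> v) {
  const Full128<uint8_t> d8;
  alignas(16) static const uint8_t kRev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                               7,  6,  5,  4,  3,  2,  1, 0};
  return TableLookupBytes(v, Load(d8, kRev));
}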
2008 
2009 template <typename T, size_t N, typename TI, size_t NI>
2010 HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> v,
2011  const Vec128<TI, NI> indices) {
2012  // Same as TableLookupBytes, which already returns 0 if out of bounds.
2013  return TableLookupBytes(v, indices);
2014 }
2015 
2016 // ------------------------------ InterleaveLower/InterleaveUpper
2017 
2018 template <typename T, size_t N>
2019 HWY_API Vec128<T, N> InterleaveLower(const Vec128<T, N> a,
2020  const Vec128<T, N> b) {
2021  Vec128<T, N> ret;
2022  for (size_t i = 0; i < N / 2; ++i) {
2023  ret.raw[2 * i + 0] = a.raw[i];
2024  ret.raw[2 * i + 1] = b.raw[i];
2025  }
2026  return ret;
2027 }
2028 
2029 // Additional overload for the optional tag (also for 256/512).
2030 template <class V>
2031 HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2032  return InterleaveLower(a, b);
2033 }
2034 
2035 template <typename T, size_t N>
2036 HWY_API Vec128<T, N> InterleaveUpper(Simd<T, N, 0> /* tag */,
2037  const Vec128<T, N> a,
2038  const Vec128<T, N> b) {
2039  Vec128<T, N> ret;
2040  for (size_t i = 0; i < N / 2; ++i) {
2041  ret.raw[2 * i + 0] = a.raw[N / 2 + i];
2042  ret.raw[2 * i + 1] = b.raw[N / 2 + i];
2043  }
2044  return ret;
2045 }
2046 
2047 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2048 
2049 // Same as Interleave*, except that the return lanes are double-width integers;
2050 // this is necessary because the single-lane scalar cannot return two values.
2051 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2052 HWY_API VFromD<DW> ZipLower(V a, V b) {
2053  return BitCast(DW(), InterleaveLower(a, b));
2054 }
2055 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2056 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2057  return BitCast(dw, InterleaveLower(D(), a, b));
2058 }
2059 
2060 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2061 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2062  return BitCast(dw, InterleaveUpper(D(), a, b));
2063 }
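// Hedged sketch; ZipLowerExample is a hypothetical name. Zipping two uint8
// vectors yields uint16 lanes; on a little-endian host the low byte of each
// lane comes from `a` and the high byte from `b`.
HWY_INLINE Vec128<uint16_t, 8> ZipLowerExample(Vec128<uint8_t, 16> a,
                                               Vec128<uint8_t, 16> b) {
  return ZipLower(a, b);
}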
2064 
2065 // ================================================== MASK
2066 
2067 template <typename T, size_t N>
2068 HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2069  typename Mask128<T, N>::Raw or_sum = 0;
2070  for (size_t i = 0; i < N; ++i) {
2071  or_sum |= mask.bits[i];
2072  }
2073  return or_sum == 0;
2074 }
2075 
2076 template <typename T, size_t N>
2077 HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2078  using Bits = typename Mask128<T, N>::Raw;
2079  constexpr Bits kAll = static_cast<Bits>(~Bits{0});
2080  Bits and_sum = kAll;
2081  for (size_t i = 0; i < N; ++i) {
2082  and_sum &= mask.bits[i];
2083  }
2084  return and_sum == kAll;
2085 }
2086 
2087 // `bits` points to at least 8 readable bytes, not all of which need be valid.
2088 template <typename T, size_t N>
2089 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
2090  const uint8_t* HWY_RESTRICT bits) {
2091  Mask128<T, N> m;
2092  for (size_t i = 0; i < N; ++i) {
2093  const size_t bit = size_t{1} << (i & 7);
2094  const size_t idx_byte = i >> 3;
2095  m.bits[i] = Mask128<T, N>::FromBool((bits[idx_byte] & bit) != 0);
2096  }
2097  return m;
2098 }
2099 
2100 // `bits` points to at least 8 writable bytes.
2101 template <typename T, size_t N>
2102 HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
2103  uint8_t* bits) {
2104  bits[0] = 0;
2105  if (N > 8) bits[1] = 0; // N <= 16, so max two bytes
2106  for (size_t i = 0; i < N; ++i) {
2107  const size_t bit = size_t{1} << (i & 7);
2108  const size_t idx_byte = i >> 3;
2109  if (mask.bits[i]) {
2110  bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
2111  }
2112  }
2113  return N > 8 ? 2 : 1;
2114 }
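// Hedged round-trip sketch for the bit-packed mask format above;
// MaskBitsRoundTrip is a hypothetical name. StoreMaskBits packs one bit per
// lane and LoadMaskBits reconstructs an equivalent mask.
HWY_INLINE bool MaskBitsRoundTrip(Mask128<float, 4> m) {
  const Simd<float, 4, 0> d;
  uint8_t bytes[8] = {};             // >= 8 writable bytes, as required
  (void)StoreMaskBits(d, m, bytes);  // returns 1 here because N <= 8
  return AllTrue(d, LoadMaskBits(d, bytes));
}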
2115 
2116 template <typename T, size_t N>
2117 HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2118  size_t count = 0;
2119  for (size_t i = 0; i < N; ++i) {
2120  count += mask.bits[i] != 0;
2121  }
2122  return count;
2123 }
2124 
2125 template <typename T, size_t N>
2126 HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
2127  const Mask128<T, N> mask) {
2128  for (size_t i = 0; i < N; ++i) {
2129  if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
2130  }
2131  return intptr_t{-1};
2132 }
2133 
2134 // ------------------------------ Compress
2135 
2136 template <typename T>
2137 struct CompressIsPartition {
2138  enum { value = 1 };
2139 };
2140 
2141 template <typename T, size_t N>
2142 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
2143  size_t count = 0;
2144  Vec128<T, N> ret;
2145  for (size_t i = 0; i < N; ++i) {
2146  if (mask.bits[i]) {
2147  ret.raw[count++] = v.raw[i];
2148  }
2149  }
2150  for (size_t i = 0; i < N; ++i) {
2151  if (!mask.bits[i]) {
2152  ret.raw[count++] = v.raw[i];
2153  }
2154  }
2155  HWY_DASSERT(count == N);
2156  return ret;
2157 }
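// Hedged sketch of the partition guarantee (CompressIsPartition above);
// PartitionNegatives is a hypothetical name. Selected lanes are packed to the
// front and the remaining lanes follow, each group in original order.
HWY_INLINE Vec128<int32_t, 4> PartitionNegatives(Vec128<int32_t, 4> v) {
  const Simd<int32_t, 4, 0> d;
  return Compress(v, v < Zero(d));  // negative lanes first, then the rest
}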
2158 
2159 // ------------------------------ CompressNot
2160 template <typename T, size_t N>
2161 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, const Mask128<T, N> mask) {
2162  size_t count = 0;
2163  Vec128<T, N> ret;
2164  for (size_t i = 0; i < N; ++i) {
2165  if (!mask.bits[i]) {
2166  ret.raw[count++] = v.raw[i];
2167  }
2168  }
2169  for (size_t i = 0; i < N; ++i) {
2170  if (mask.bits[i]) {
2171  ret.raw[count++] = v.raw[i];
2172  }
2173  }
2174  HWY_DASSERT(count == N);
2175  return ret;
2176 }
2177 
2178 // ------------------------------ CompressBlocksNot
2179 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
2180  Mask128<uint64_t> /* m */) {
2181  return v;
2182 }
2183 
2184 // ------------------------------ CompressBits
2185 template <typename T, size_t N>
2186 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
2187  const uint8_t* HWY_RESTRICT bits) {
2188  return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
2189 }
2190 
2191 // ------------------------------ CompressStore
2192 template <typename T, size_t N>
2193 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
2194  Simd<T, N, 0> /* tag */,
2195  T* HWY_RESTRICT unaligned) {
2196  size_t count = 0;
2197  for (size_t i = 0; i < N; ++i) {
2198  if (mask.bits[i]) {
2199  unaligned[count++] = v.raw[i];
2200  }
2201  }
2202  return count;
2203 }
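// Hedged sketch; StorePositivesExample is a hypothetical name. CompressStore
// writes only the selected lanes and returns their count; `out` must have
// room for up to N lanes.
HWY_INLINE size_t StorePositivesExample(Vec128<int32_t, 4> v,
                                        int32_t* HWY_RESTRICT out) {
  const Simd<int32_t, 4, 0> d;
  return CompressStore(v, Zero(d) < v, d, out);
}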
2204 
2205 // ------------------------------ CompressBlendedStore
2206 template <typename T, size_t N>
2207 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, const Mask128<T, N> mask,
2208  Simd<T, N, 0> d,
2209  T* HWY_RESTRICT unaligned) {
2210  return CompressStore(v, mask, d, unaligned);
2211 }
2212 
2213 // ------------------------------ CompressBitsStore
2214 template <typename T, size_t N>
2215 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
2216  const uint8_t* HWY_RESTRICT bits,
2217  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
2218  const Mask128<T, N> mask = LoadMaskBits(d, bits);
2219  StoreU(Compress(v, mask), d, unaligned);
2220  return CountTrue(d, mask);
2221 }
2222 
2223 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2224 template <size_t N>
2225 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
2226  Vec128<bfloat16_t, 2 * N> a,
2227  Vec128<bfloat16_t, 2 * N> b,
2228  const Vec128<float, N> sum0,
2229  Vec128<float, N>& sum1) {
2230  const Rebind<bfloat16_t, decltype(df32)> dbf16;
2231  // Avoid ZipLower/Upper so this also works on big-endian systems.
2232  const Vec128<float, N> a0 = PromoteTo(df32, LowerHalf(dbf16, a));
2233  const Vec128<float, N> a1 = PromoteTo(df32, UpperHalf(dbf16, a));
2234  const Vec128<float, N> b0 = PromoteTo(df32, LowerHalf(dbf16, b));
2235  const Vec128<float, N> b1 = PromoteTo(df32, UpperHalf(dbf16, b));
2236  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2237  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2238 }
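// Hedged sketch; Bf16DotExample is a hypothetical name. Accumulates a
// bfloat16 dot product into two f32 partial sums, which the caller combines;
// the lane order of the partial products is unspecified, but the total is not.
HWY_INLINE Vec128<float, 4> Bf16DotExample(Vec128<bfloat16_t, 8> a,
                                           Vec128<bfloat16_t, 8> b) {
  const Simd<float, 4, 0> df32;
  Vec128<float, 4> sum1 = Zero(df32);
  const Vec128<float, 4> sum0 =
      ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
  return sum0 + sum1;
}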
2239 
2240 // ================================================== REDUCTIONS
2241 
2242 template <typename T, size_t N>
2243 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2244  T sum = T{0};
2245  for (size_t i = 0; i < N; ++i) {
2246  sum += v.raw[i];
2247  }
2248  return Set(d, sum);
2249 }
2250 template <typename T, size_t N>
2251 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2252  T min = HighestValue<T>();
2253  for (size_t i = 0; i < N; ++i) {
2254  min = HWY_MIN(min, v.raw[i]);
2255  }
2256  return Set(d, min);
2257 }
2258 template <typename T, size_t N>
2259 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2260  T max = LowestValue<T>();
2261  for (size_t i = 0; i < N; ++i) {
2262  max = HWY_MAX(max, v.raw[i]);
2263  }
2264  return Set(d, max);
2265 }
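// Hedged sketch; HorizontalSumExample is a hypothetical name. SumOfLanes
// broadcasts the total to every lane, so GetLane yields the scalar sum.
HWY_INLINE float HorizontalSumExample(Vec128<float, 4> v) {
  const Simd<float, 4, 0> d;
  return GetLane(SumOfLanes(d, v));
}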
2266 
2267 // ================================================== OPS WITH DEPENDENCIES
2268 
2269 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
2270 
2271 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
2272  const Vec128<uint64_t> b) {
2273  alignas(16) uint64_t mul[2];
2274  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
2275  return Load(Full128<uint64_t>(), mul);
2276 }
2277 
2278 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
2279  const Vec128<uint64_t> b) {
2280  alignas(16) uint64_t mul[2];
2281  const Half<Full128<uint64_t>> d2;
2282  mul[0] =
2283  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
2284  return Load(Full128<uint64_t>(), mul);
2285 }
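// Hedged sketch; High64OfProduct is a hypothetical name. MulEven forms the
// full 128-bit product of the lower 64-bit lanes via Mul128: lane 0 holds the
// low half and lane 1 the high half.
HWY_INLINE uint64_t High64OfProduct(uint64_t a, uint64_t b) {
  const Full128<uint64_t> d;
  return ExtractLane(MulEven(Set(d, a), Set(d, b)), 1);
}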
2286 
2287 // ================================================== Operator wrapper
2288 
2289 template <class V>
2290 HWY_API V Add(V a, V b) {
2291  return a + b;
2292 }
2293 template <class V>
2294 HWY_API V Sub(V a, V b) {
2295  return a - b;
2296 }
2297 
2298 template <class V>
2299 HWY_API V Mul(V a, V b) {
2300  return a * b;
2301 }
2302 template <class V>
2303 HWY_API V Div(V a, V b) {
2304  return a / b;
2305 }
2306 
2307 template <class V>
2308 V Shl(V a, V b) {
2309  return a << b;
2310 }
2311 template <class V>
2312 V Shr(V a, V b) {
2313  return a >> b;
2314 }
2315 
2316 template <class V>
2317 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
2318  return a == b;
2319 }
2320 template <class V>
2321 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
2322  return a != b;
2323 }
2324 template <class V>
2325 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
2326  return a < b;
2327 }
2328 
2329 template <class V>
2330 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
2331  return a > b;
2332 }
2333 template <class V>
2334 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
2335  return a >= b;
2336 }
2337 
2338 template <class V>
2339 HWY_API auto Le(V a, V b) -> decltype(a == b) {
2340  return a <= b;
2341 }
2342 
2343 // NOLINTNEXTLINE(google-readability-namespace-comments)
2344 } // namespace HWY_NAMESPACE
2345 } // namespace hwy