Grok  10.0.3
scalar-inl.h
Go to the documentation of this file.
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Single-element vectors and operations.
17 // External include guard in highway.h - see comment there.
18 
19 #include <stddef.h>
20 #include <stdint.h>
21 
22 #include "hwy/base.h"
23 #include "hwy/ops/shared-inl.h"
24 
26 namespace hwy {
27 namespace HWY_NAMESPACE {
28 
29 // Single instruction, single data.
30 template <typename T>
32 
33 // (Wrapper class required for overloading comparison operators.)
34 template <typename T>
35 struct Vec1 {
36  HWY_INLINE Vec1() = default;
37  Vec1(const Vec1&) = default;
38  Vec1& operator=(const Vec1&) = default;
39  HWY_INLINE explicit Vec1(const T t) : raw(t) {}
40 
41  HWY_INLINE Vec1& operator*=(const Vec1 other) {
42  return *this = (*this * other);
43  }
44  HWY_INLINE Vec1& operator/=(const Vec1 other) {
45  return *this = (*this / other);
46  }
47  HWY_INLINE Vec1& operator+=(const Vec1 other) {
48  return *this = (*this + other);
49  }
50  HWY_INLINE Vec1& operator-=(const Vec1 other) {
51  return *this = (*this - other);
52  }
53  HWY_INLINE Vec1& operator&=(const Vec1 other) {
54  return *this = (*this & other);
55  }
56  HWY_INLINE Vec1& operator|=(const Vec1 other) {
57  return *this = (*this | other);
58  }
59  HWY_INLINE Vec1& operator^=(const Vec1 other) {
60  return *this = (*this ^ other);
61  }
62 
63  T raw;
64 };
65 
66 // 0 or FF..FF, same size as Vec1.
67 template <typename T>
68 class Mask1 {
70 
71  public:
72  static HWY_INLINE Mask1<T> FromBool(bool b) {
73  Mask1<T> mask;
74  mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
75  return mask;
76  }
77 
79 };
80 
81 namespace detail {
82 
83 // Deduce Sisd<T> from Vec1<T>
84 struct Deduce1 {
85  template <typename T>
87  return Sisd<T>();
88  }
89 };
90 
91 } // namespace detail
92 
93 template <class V>
94 using DFromV = decltype(detail::Deduce1()(V()));
95 
96 template <class V>
97 using TFromV = TFromD<DFromV<V>>;
98 
99 // ------------------------------ BitCast
100 
101 template <typename T, typename FromT>
103  static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
104  T to;
105  CopyBytes<sizeof(FromT)>(&v.raw, &to);
106  return Vec1<T>(to);
107 }
108 
109 // ------------------------------ Set
110 
111 template <typename T>
113  return Vec1<T>(T(0));
114 }
115 
116 template <typename T, typename T2>
117 HWY_API Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
118  return Vec1<T>(static_cast<T>(t));
119 }
120 
121 template <typename T>
123  return Zero(d);
124 }
125 
126 template <typename T, typename T2>
127 HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
128  return Vec1<T>(static_cast<T>(first));
129 }
130 
131 // ================================================== LOGICAL
132 
133 // ------------------------------ Not
134 
135 template <typename T>
137  using TU = MakeUnsigned<T>;
138  const Sisd<TU> du;
139  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
140 }
141 
142 // ------------------------------ And
143 
144 template <typename T>
145 HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
146  using TU = MakeUnsigned<T>;
147  const Sisd<TU> du;
148  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
149 }
150 template <typename T>
152  return And(a, b);
153 }
154 
155 // ------------------------------ AndNot
156 
157 template <typename T>
158 HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
159  using TU = MakeUnsigned<T>;
160  const Sisd<TU> du;
161  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
162  BitCast(du, b).raw)));
163 }
164 
165 // ------------------------------ Or
166 
167 template <typename T>
168 HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
169  using TU = MakeUnsigned<T>;
170  const Sisd<TU> du;
171  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
172 }
173 template <typename T>
175  return Or(a, b);
176 }
177 
178 // ------------------------------ Xor
179 
180 template <typename T>
181 HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
182  using TU = MakeUnsigned<T>;
183  const Sisd<TU> du;
184  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
185 }
186 template <typename T>
188  return Xor(a, b);
189 }
190 
191 // ------------------------------ Or3
192 
193 template <typename T>
195  return Or(o1, Or(o2, o3));
196 }
197 
198 // ------------------------------ OrAnd
199 
200 template <typename T>
201 HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
202  return Or(o, And(a1, a2));
203 }
204 
205 // ------------------------------ IfVecThenElse
206 
207 template <typename T>
209  return IfThenElse(MaskFromVec(mask), yes, no);
210 }
211 
212 // ------------------------------ CopySign
213 
214 template <typename T>
215 HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
216  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
217  const auto msb = SignBit(Sisd<T>());
218  return Or(AndNot(msb, magn), And(msb, sign));
219 }
220 
221 template <typename T>
222 HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
223  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
224  return Or(abs, And(SignBit(Sisd<T>()), sign));
225 }
226 
227 // ------------------------------ BroadcastSignBit
228 
229 template <typename T>
231  // This is used inside ShiftRight, so we cannot implement in terms of it.
232  return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
233 }
234 
235 // ------------------------------ PopulationCount
236 
237 #ifdef HWY_NATIVE_POPCNT
238 #undef HWY_NATIVE_POPCNT
239 #else
240 #define HWY_NATIVE_POPCNT
241 #endif
242 
243 template <typename T>
245  return Vec1<T>(static_cast<T>(PopCount(v.raw)));
246 }
247 
248 // ------------------------------ Mask
249 
250 template <typename TFrom, typename TTo>
252  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
253  return Mask1<TTo>{m.bits};
254 }
255 
256 // v must be 0 or FF..FF.
257 template <typename T>
259  Mask1<T> mask;
260  CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
261  return mask;
262 }
263 
264 template <typename T>
266  Vec1<T> v;
267  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
268  return v;
269 }
270 
271 template <typename T>
272 Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
273  Vec1<T> v;
274  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
275  return v;
276 }
277 
278 template <typename T>
279 HWY_API Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
280  return Mask1<T>::FromBool(n != 0);
281 }
282 
283 // Returns mask ? yes : no.
284 template <typename T>
285 HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
286  const Vec1<T> no) {
287  return mask.bits ? yes : no;
288 }
289 
290 template <typename T>
292  return mask.bits ? yes : Vec1<T>(0);
293 }
294 
295 template <typename T>
297  return mask.bits ? Vec1<T>(0) : no;
298 }
299 
300 template <typename T>
302  return v.raw < 0 ? yes : no;
303 }
304 
305 template <typename T>
307  return v.raw < 0 ? Vec1<T>(0) : v;
308 }
309 
310 // ------------------------------ Mask logical
311 
312 template <typename T>
314  return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
315 }
316 
317 template <typename T>
319  const Sisd<T> d;
320  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
321 }
322 
323 template <typename T>
325  const Sisd<T> d;
326  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
327 }
328 
329 template <typename T>
331  const Sisd<T> d;
332  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
333 }
334 
335 template <typename T>
337  const Sisd<T> d;
338  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
339 }
340 
341 // ================================================== SHIFTS
342 
343 // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
344 
345 template <int kBits, typename T>
347  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
348  return Vec1<T>(
349  static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
350 }
351 
352 template <int kBits, typename T>
354  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
355 #if __cplusplus >= 202002L
356  // Signed right shift is now guaranteed to be arithmetic (rounding toward
357  // negative infinity, i.e. shifting in the sign bit).
358  return Vec1<T>(static_cast<T>(v.raw >> kBits));
359 #else
360  if (IsSigned<T>()) {
361  // Emulate arithmetic shift using only logical (unsigned) shifts, because
362  // signed shifts are still implementation-defined.
363  using TU = hwy::MakeUnsigned<T>;
364  const Sisd<TU> du;
365  const TU shifted = BitCast(du, v).raw >> kBits;
366  const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
367  const size_t sign_shift =
368  static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
369  const TU upper = static_cast<TU>(sign << sign_shift);
370  return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
371  } else { // T is unsigned
372  return Vec1<T>(static_cast<T>(v.raw >> kBits));
373  }
374 #endif
375 }
376 
377 // ------------------------------ RotateRight (ShiftRight)
378 
379 namespace detail {
380 
381 // For partial specialization: kBits == 0 results in an invalid shift count
382 template <int kBits>
383 struct RotateRight {
384  template <typename T>
386  return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
387  }
388 };
389 
390 template <>
391 struct RotateRight<0> {
392  template <typename T>
394  return v;
395  }
396 };
397 
398 } // namespace detail
399 
400 template <int kBits, typename T>
402  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
403  return detail::RotateRight<kBits>()(v);
404 }
405 
406 // ------------------------------ ShiftLeftSame (BroadcastSignBit)
407 
408 template <typename T>
410  return Vec1<T>(
411  static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
412 }
413 
414 template <typename T>
416 #if __cplusplus >= 202002L
417  // Signed right shift is now guaranteed to be arithmetic (rounding toward
418  // negative infinity, i.e. shifting in the sign bit).
419  return Vec1<T>(static_cast<T>(v.raw >> bits));
420 #else
421  if (IsSigned<T>()) {
422  // Emulate arithmetic shift using only logical (unsigned) shifts, because
423  // signed shifts are still implementation-defined.
424  using TU = hwy::MakeUnsigned<T>;
425  const Sisd<TU> du;
426  const TU shifted = BitCast(du, v).raw >> bits;
427  const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
428  const size_t sign_shift =
429  static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
430  const TU upper = static_cast<TU>(sign << sign_shift);
431  return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
432  } else { // T is unsigned
433  return Vec1<T>(static_cast<T>(v.raw >> bits));
434  }
435 #endif
436 }
437 
438 // ------------------------------ Shl
439 
440 // Single-lane => same as ShiftLeftSame except for the argument type.
441 template <typename T>
443  return ShiftLeftSame(v, static_cast<int>(bits.raw));
444 }
445 
446 template <typename T>
448  return ShiftRightSame(v, static_cast<int>(bits.raw));
449 }
450 
451 // ================================================== ARITHMETIC
452 
453 template <typename T>
455  const uint64_t a64 = static_cast<uint64_t>(a.raw);
456  const uint64_t b64 = static_cast<uint64_t>(b.raw);
457  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
458 }
460  return Vec1<float>(a.raw + b.raw);
461 }
463  return Vec1<double>(a.raw + b.raw);
464 }
465 
466 template <typename T>
468  const uint64_t a64 = static_cast<uint64_t>(a.raw);
469  const uint64_t b64 = static_cast<uint64_t>(b.raw);
470  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
471 }
473  return Vec1<float>(a.raw - b.raw);
474 }
476  return Vec1<double>(a.raw - b.raw);
477 }
478 
479 // ------------------------------ SumsOf8
480 
482  return Vec1<uint64_t>(v.raw);
483 }
484 
485 // ------------------------------ SaturatedAdd
486 
487 // Returns a + b clamped to the destination range.
488 
489 // Unsigned
491  const Vec1<uint8_t> b) {
492  return Vec1<uint8_t>(
493  static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
494 }
496  const Vec1<uint16_t> b) {
497  return Vec1<uint16_t>(
498  static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
499 }
500 
501 // Signed
503  return Vec1<int8_t>(
504  static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
505 }
507  const Vec1<int16_t> b) {
508  return Vec1<int16_t>(
509  static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
510 }
511 
512 // ------------------------------ Saturating subtraction
513 
514 // Returns a - b clamped to the destination range.
515 
516 // Unsigned
518  const Vec1<uint8_t> b) {
519  return Vec1<uint8_t>(
520  static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
521 }
523  const Vec1<uint16_t> b) {
524  return Vec1<uint16_t>(
525  static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
526 }
527 
528 // Signed
530  return Vec1<int8_t>(
531  static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
532 }
534  const Vec1<int16_t> b) {
535  return Vec1<int16_t>(
536  static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
537 }
538 
539 // ------------------------------ Average
540 
541 // Returns (a + b + 1) / 2
542 
544  const Vec1<uint8_t> b) {
545  return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
546 }
548  const Vec1<uint16_t> b) {
549  return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
550 }
551 
552 // ------------------------------ Absolute value
553 
554 template <typename T>
556  const T i = a.raw;
557  return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
558 }
560  return Vec1<float>(std::abs(a.raw));
561 }
563  return Vec1<double>(std::abs(a.raw));
564 }
565 
566 // ------------------------------ min/max
567 
568 template <typename T, HWY_IF_NOT_FLOAT(T)>
569 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
570  return Vec1<T>(HWY_MIN(a.raw, b.raw));
571 }
572 
573 template <typename T, HWY_IF_FLOAT(T)>
574 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
575  if (std::isnan(a.raw)) return b;
576  if (std::isnan(b.raw)) return a;
577  return Vec1<T>(HWY_MIN(a.raw, b.raw));
578 }
579 
580 template <typename T, HWY_IF_NOT_FLOAT(T)>
581 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
582  return Vec1<T>(HWY_MAX(a.raw, b.raw));
583 }
584 
585 template <typename T, HWY_IF_FLOAT(T)>
586 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
587  if (std::isnan(a.raw)) return b;
588  if (std::isnan(b.raw)) return a;
589  return Vec1<T>(HWY_MAX(a.raw, b.raw));
590 }
591 
592 // ------------------------------ Floating-point negate
593 
594 template <typename T, HWY_IF_FLOAT(T)>
596  return Xor(v, SignBit(Sisd<T>()));
597 }
598 
599 template <typename T, HWY_IF_NOT_FLOAT(T)>
600 HWY_API Vec1<T> Neg(const Vec1<T> v) {
601  return Zero(Sisd<T>()) - v;
602 }
603 
604 // ------------------------------ mul/div
605 
606 template <typename T, HWY_IF_FLOAT(T)>
608  return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
609 }
610 
611 template <typename T, HWY_IF_SIGNED(T)>
612 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
613  return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
614 }
615 
616 template <typename T, HWY_IF_UNSIGNED(T)>
617 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
618  return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
619 }
620 
621 template <typename T>
623  return Vec1<T>(a.raw / b.raw);
624 }
625 
626 // Returns the upper 16 bits of a * b in each lane.
628  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
629 }
631  // Cast to uint32_t first to prevent overflow. Otherwise the result of
632  // uint16_t * uint16_t is in "int" which may overflow. In practice the result
633  // is the same but this way it is also defined.
634  return Vec1<uint16_t>(static_cast<uint16_t>(
635  (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
636 }
637 
639  return Vec1<int16_t>(static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16));
640 }
641 
642 // Multiplies even lanes (0, 2 ..) and returns the double-wide result.
644  const int64_t a64 = a.raw;
645  return Vec1<int64_t>(a64 * b.raw);
646 }
648  const uint64_t a64 = a.raw;
649  return Vec1<uint64_t>(a64 * b.raw);
650 }
651 
652 // Approximate reciprocal
654  // Zero inputs are allowed, but callers are responsible for replacing the
655  // return value with something else (typically using IfThenElse). This check
656  // avoids a ubsan error. The return value is arbitrary.
657  if (v.raw == 0.0f) return Vec1<float>(0.0f);
658  return Vec1<float>(1.0f / v.raw);
659 }
660 
661 // Absolute value of difference.
663  return Abs(a - b);
664 }
665 
666 // ------------------------------ Floating-point multiply-add variants
667 
668 template <typename T>
669 HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
670  return mul * x + add;
671 }
672 
673 template <typename T>
675  const Vec1<T> add) {
676  return add - mul * x;
677 }
678 
679 template <typename T>
680 HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
681  return mul * x - sub;
682 }
683 
684 template <typename T>
686  const Vec1<T> sub) {
687  return Neg(mul) * x - sub;
688 }
689 
690 // ------------------------------ Floating-point square root
691 
692 // Approximate reciprocal square root
694  float f = v.raw;
695  const float half = f * 0.5f;
696  uint32_t bits;
697  CopyBytes<4>(&f, &bits);
698  // Initial guess based on log2(f)
699  bits = 0x5F3759DF - (bits >> 1);
700  CopyBytes<4>(&bits, &f);
701  // One Newton-Raphson iteration
702  return Vec1<float>(f * (1.5f - (half * f * f)));
703 }
704 
705 // Square root
707  return Vec1<float>(std::sqrt(v.raw));
708 }
710  return Vec1<double>(std::sqrt(v.raw));
711 }
712 
713 // ------------------------------ Floating-point rounding
714 
715 template <typename T>
717  using TI = MakeSigned<T>;
718  if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN
719  return v;
720  }
721  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
722  const TI rounded = static_cast<TI>(v.raw + bias);
723  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
724  // Round to even
725  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
726  return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
727  }
728  return Vec1<T>(static_cast<T>(rounded));
729 }
730 
731 // Round-to-nearest even.
733  using T = float;
734  using TI = int32_t;
735 
736  const T abs = Abs(v).raw;
737  const bool signbit = std::signbit(v.raw);
738 
739  if (!(abs < MantissaEnd<T>())) { // Huge or NaN
740  // Check if too large to cast or NaN
741  if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
742  return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
743  }
744  return Vec1<int32_t>(static_cast<TI>(v.raw));
745  }
746  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
747  const TI rounded = static_cast<TI>(v.raw + bias);
748  if (rounded == 0) return Vec1<int32_t>(0);
749  // Round to even
750  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
751  return Vec1<TI>(rounded - (signbit ? -1 : 1));
752  }
753  return Vec1<TI>(rounded);
754 }
755 
756 template <typename T>
758  using TI = MakeSigned<T>;
759  if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN
760  return v;
761  }
762  const TI truncated = static_cast<TI>(v.raw);
763  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
764  return Vec1<T>(static_cast<T>(truncated));
765 }
766 
767 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
768  class V>
769 V Ceiling(const V v) {
770  const Bits kExponentMask = (1ull << kExponentBits) - 1;
771  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
772  const Bits kBias = kExponentMask / 2;
773 
774  Float f = v.raw;
775  const bool positive = f > Float(0.0);
776 
777  Bits bits;
778  CopyBytes<sizeof(Bits)>(&v, &bits);
779 
780  const int exponent =
781  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
782  // Already an integer.
783  if (exponent >= kMantissaBits) return v;
784  // |v| <= 1 => 0 or 1.
785  if (exponent < 0) return positive ? V(1) : V(-0.0);
786 
787  const Bits mantissa_mask = kMantissaMask >> exponent;
788  // Already an integer
789  if ((bits & mantissa_mask) == 0) return v;
790 
791  // Clear fractional bits and round up
792  if (positive) bits += (kMantissaMask + 1) >> exponent;
793  bits &= ~mantissa_mask;
794 
795  CopyBytes<sizeof(Bits)>(&bits, &f);
796  return V(f);
797 }
798 
799 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
800  class V>
801 V Floor(const V v) {
802  const Bits kExponentMask = (1ull << kExponentBits) - 1;
803  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
804  const Bits kBias = kExponentMask / 2;
805 
806  Float f = v.raw;
807  const bool negative = f < Float(0.0);
808 
809  Bits bits;
810  CopyBytes<sizeof(Bits)>(&v, &bits);
811 
812  const int exponent =
813  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
814  // Already an integer.
815  if (exponent >= kMantissaBits) return v;
816  // |v| <= 1 => -1 or 0.
817  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
818 
819  const Bits mantissa_mask = kMantissaMask >> exponent;
820  // Already an integer
821  if ((bits & mantissa_mask) == 0) return v;
822 
823  // Clear fractional bits and round down
824  if (negative) bits += (kMantissaMask + 1) >> exponent;
825  bits &= ~mantissa_mask;
826 
827  CopyBytes<sizeof(Bits)>(&bits, &f);
828  return V(f);
829 }
830 
831 // Toward +infinity, aka ceiling
833  return Ceiling<float, uint32_t, 23, 8>(v);
834 }
836  return Ceiling<double, uint64_t, 52, 11>(v);
837 }
838 
839 // Toward -infinity, aka floor
841  return Floor<float, uint32_t, 23, 8>(v);
842 }
844  return Floor<double, uint64_t, 52, 11>(v);
845 }
846 
847 // ================================================== COMPARE
848 
849 template <typename T>
851  return Mask1<T>::FromBool(a.raw == b.raw);
852 }
853 
854 template <typename T>
856  return Mask1<T>::FromBool(a.raw != b.raw);
857 }
858 
859 template <typename T>
861  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
862  return (v & bit) == bit;
863 }
864 
865 template <typename T>
867  return Mask1<T>::FromBool(a.raw < b.raw);
868 }
869 template <typename T>
871  return Mask1<T>::FromBool(a.raw > b.raw);
872 }
873 
874 template <typename T>
876  return Mask1<T>::FromBool(a.raw <= b.raw);
877 }
878 template <typename T>
880  return Mask1<T>::FromBool(a.raw >= b.raw);
881 }
882 
883 // ------------------------------ Floating-point classification (==)
884 
885 template <typename T>
887  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
888  MakeUnsigned<T> bits;
889  memcpy(&bits, &v, sizeof(v));
890  bits += bits;
891  bits >>= 1; // clear sign bit
892  // NaN if all exponent bits are set and the mantissa is not zero.
893  return Mask1<T>::FromBool(bits > ExponentMask<T>());
894 }
895 
897  const Sisd<float> d;
898  const RebindToUnsigned<decltype(d)> du;
899  const Vec1<uint32_t> vu = BitCast(du, v);
900  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
901  return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
902 }
904  const Sisd<double> d;
905  const RebindToUnsigned<decltype(d)> du;
906  const Vec1<uint64_t> vu = BitCast(du, v);
907  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
908  return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
909 }
910 
912  const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
913  // Shift left to clear the sign bit, check whether exponent != max value.
914  return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
915 }
917  const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
918  // Shift left to clear the sign bit, check whether exponent != max value.
919  return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
920 }
921 
922 // ================================================== MEMORY
923 
924 // ------------------------------ Load
925 
926 template <typename T>
927 HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
928  T t;
929  CopyBytes<sizeof(T)>(aligned, &t);
930  return Vec1<T>(t);
931 }
932 
933 template <typename T>
935  const T* HWY_RESTRICT aligned) {
936  return IfThenElseZero(m, Load(d, aligned));
937 }
938 
939 template <typename T>
941  return Load(d, p);
942 }
943 
944 // In some use cases, "load single lane" is sufficient; otherwise avoid this.
945 template <typename T>
947  return Load(d, aligned);
948 }
949 
950 // ------------------------------ Store
951 
952 template <typename T>
953 HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
954  T* HWY_RESTRICT aligned) {
955  CopyBytes<sizeof(T)>(&v.raw, aligned);
956 }
957 
958 template <typename T>
960  return Store(v, d, p);
961 }
962 
963 template <typename T>
965  T* HWY_RESTRICT p) {
966  if (!m.bits) return;
967  StoreU(v, d, p);
968 }
969 
970 // ------------------------------ LoadInterleaved2/3/4
971 
972 // Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
973 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
974 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
975 #else
976 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
977 #endif
978 
979 template <typename T>
980 HWY_API void LoadInterleaved2(Sisd<T> d, const T* HWY_RESTRICT unaligned,
981  Vec1<T>& v0, Vec1<T>& v1) {
982  v0 = LoadU(d, unaligned + 0);
983  v1 = LoadU(d, unaligned + 1);
984 }
985 
986 template <typename T>
987 HWY_API void LoadInterleaved3(Sisd<T> d, const T* HWY_RESTRICT unaligned,
988  Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2) {
989  v0 = LoadU(d, unaligned + 0);
990  v1 = LoadU(d, unaligned + 1);
991  v2 = LoadU(d, unaligned + 2);
992 }
993 
994 template <typename T>
995 HWY_API void LoadInterleaved4(Sisd<T> d, const T* HWY_RESTRICT unaligned,
996  Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2,
997  Vec1<T>& v3) {
998  v0 = LoadU(d, unaligned + 0);
999  v1 = LoadU(d, unaligned + 1);
1000  v2 = LoadU(d, unaligned + 2);
1001  v3 = LoadU(d, unaligned + 3);
1002 }
1003 
1004 // ------------------------------ StoreInterleaved2/3/4
1005 
1006 template <typename T>
1008  T* HWY_RESTRICT unaligned) {
1009  StoreU(v0, d, unaligned + 0);
1010  StoreU(v1, d, unaligned + 1);
1011 }
1012 
1013 template <typename T>
1014 HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
1015  const Vec1<T> v2, Sisd<T> d,
1016  T* HWY_RESTRICT unaligned) {
1017  StoreU(v0, d, unaligned + 0);
1018  StoreU(v1, d, unaligned + 1);
1019  StoreU(v2, d, unaligned + 2);
1020 }
1021 
1022 template <typename T>
1023 HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
1024  const Vec1<T> v2, const Vec1<T> v3, Sisd<T> d,
1025  T* HWY_RESTRICT unaligned) {
1026  StoreU(v0, d, unaligned + 0);
1027  StoreU(v1, d, unaligned + 1);
1028  StoreU(v2, d, unaligned + 2);
1029  StoreU(v3, d, unaligned + 3);
1030 }
1031 
1032 // ------------------------------ Stream
1033 
1034 template <typename T>
1035 HWY_API void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
1036  return Store(v, d, aligned);
1037 }
1038 
1039 // ------------------------------ Scatter
1040 
1041 template <typename T, typename Offset>
1043  const Vec1<Offset> offset) {
1044  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1045  uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
1046  return Store(v, d, reinterpret_cast<T*>(base8));
1047 }
1048 
1049 template <typename T, typename Index>
1051  const Vec1<Index> index) {
1052  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1053  return Store(v, d, base + index.raw);
1054 }
1055 
1056 // ------------------------------ Gather
1057 
1058 template <typename T, typename Offset>
1060  const Vec1<Offset> offset) {
1061  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1062  const intptr_t addr =
1063  reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
1064  return Load(d, reinterpret_cast<const T*>(addr));
1065 }
1066 
1067 template <typename T, typename Index>
1069  const Vec1<Index> index) {
1070  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1071  return Load(d, base + index.raw);
1072 }
1073 
1074 // ================================================== CONVERT
1075 
1076 // ConvertTo and DemoteTo with floating-point input and integer output truncate
1077 // (rounding toward zero).
1078 
1079 template <typename FromT, typename ToT>
1081  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
1082  // For bits Y > X, floatX->floatY and intX->intY are always representable.
1083  return Vec1<ToT>(static_cast<ToT>(from.raw));
1084 }
1085 
1086 // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
1087 // so we overload for FromT=double and ToT={float,int32_t}.
1089  // Prevent ubsan errors when converting float to narrower integer/float
1090  if (std::isinf(from.raw) ||
1091  std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
1092  return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
1093  : HighestValue<float>());
1094  }
1095  return Vec1<float>(static_cast<float>(from.raw));
1096 }
1098  // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
1099  if (std::isinf(from.raw) ||
1100  std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
1101  return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
1102  : HighestValue<int32_t>());
1103  }
1104  return Vec1<int32_t>(static_cast<int32_t>(from.raw));
1105 }
1106 
1107 template <typename FromT, typename ToT>
1109  static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
1110  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
1111 
1112  // Int to int: choose closest value in ToT to `from` (avoids UB)
1113  from.raw = HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
1114  return Vec1<ToT>(static_cast<ToT>(from.raw));
1115 }
1116 
// Promotes a single float16 lane to float by decoding sign/exponent/mantissa
// from the 16-bit representation. NOTE(review): the extraction dropped the
// signature line preceding the #if — confirm against the upstream header.
1118 #if HWY_NATIVE_FLOAT16
1119  uint16_t bits16;
1120  CopyBytes<2>(&v.raw, &bits16);
1121 #else
1122  const uint16_t bits16 = v.raw.bits;
1123 #endif
1124  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
1125  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1126  const uint32_t mantissa = bits16 & 0x3FF;
1127 
1128  // Subnormal or zero
1129  if (biased_exp == 0) {
1130  const float subnormal =
1131  (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
1132  return Vec1<float>(sign ? -subnormal : subnormal);
1133  }
1134 
1135  // Normalized: convert the representation directly (faster than ldexp/tables).
1136  const uint32_t biased_exp32 = biased_exp + (127 - 15);
1137  const uint32_t mantissa32 = mantissa << (23 - 10);
1138  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1139  float out;
1140  CopyBytes<4>(&bits32, &out);
1141  return Vec1<float>(out);
1142 }
1143 
// bfloat16 -> float promotion via the shared F32FromBF16 helper.
// NOTE(review): signature line lost in extraction — see upstream header.
1145  return Set(d, F32FromBF16(v.raw));
1146 }
1147 
// float -> float16 demotion: round-to-nearest-even is NOT implemented here;
// the mantissa is truncated (`>> 13`). Handles zero/tiny (flush to zero) and
// subnormal outputs explicitly. NOTE(review): the opening of the signature
// was lost in extraction — only `const Vec1<float> v) {` remains.
1149  const Vec1<float> v) {
1150  uint32_t bits32;
1151  CopyBytes<4>(&v.raw, &bits32);
1152  const uint32_t sign = bits32 >> 31;
1153  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1154  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
1155 
1156  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
1157 
1158  // Tiny or zero => zero.
1159  Vec1<float16_t> out;
1160  if (exp < -24) {
1161 #if HWY_NATIVE_FLOAT16
1162  const uint16_t zero = 0;
1163  CopyBytes<2>(&zero, &out.raw);
1164 #else
1165  out.raw.bits = 0;
1166 #endif
1167  return out;
1168  }
1169 
1170  uint32_t biased_exp16, mantissa16;
1171 
1172  // exp = [-24, -15] => subnormal
1173  if (exp < -14) {
1174  biased_exp16 = 0;
1175  const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
1176  HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
1177  mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
1178  (mantissa32 >> (13 + sub_exp)));
1179  } else {
1180  // exp = [-14, 15]
1181  biased_exp16 = static_cast<uint32_t>(exp + 15);
1182  HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1183  mantissa16 = mantissa32 >> 13;
1184  }
1185 
1186  HWY_DASSERT(mantissa16 < 1024);
1187  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1188  HWY_DASSERT(bits16 < 0x10000);
1189 #if HWY_NATIVE_FLOAT16
1190  const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
1191  CopyBytes<2>(&narrowed, &out.raw);
1192 #else
1193  out.raw.bits = static_cast<uint16_t>(bits16);
1194 #endif
1195  return out;
1196 }
1197 
// float -> bfloat16 demotion via the shared BF16FromF32 helper.
// NOTE(review): signature line lost in extraction — see upstream header.
1199  return Set(d, BF16FromF32(v.raw));
1200 }
1201 
// ConvertTo for float sources: float## -> same-size int##, saturating to
// LimitsMin/Max and avoiding UB on inf/out-of-range inputs.
// NOTE(review): the signature line between the template header and the first
// static_assert was dropped by extraction — restore from upstream.
1202 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
1204  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1205  // float## -> int##: return closest representable value. We cannot exactly
1206  // represent LimitsMax<ToT> in FromT, so use double.
1207  const double f = static_cast<double>(from.raw);
1208  if (std::isinf(from.raw) ||
1209  std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
1210  return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
1211  : LimitsMax<ToT>());
1212  }
1213  return Vec1<ToT>(static_cast<ToT>(from.raw));
1214 }
1215 
1216 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
1217 HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1218  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1219  // int## -> float##: no check needed
1220  return Vec1<ToT>(static_cast<ToT>(from.raw));
1221 }
1222 
// Narrows a uint32 lane to uint8 by delegating to the saturating DemoteTo.
// NOTE(review): signature line lost in extraction — see upstream header.
1224  return DemoteTo(Sisd<uint8_t>(), v);
1225 }
1226 
1227 // ================================================== COMBINE
1228 // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
1229 
// LowerHalf overloads: with a single lane, the "lower half" is the vector
// itself, so both forms are identity functions.
// NOTE(review): both signature lines were dropped by extraction.
1230 template <typename T>
1232  return v;
1233 }
1234 
1235 template <typename T>
1237  return v;
1238 }
1239 
1240 // ================================================== SWIZZLE
1241 
// Returns the value of the only lane (signature line lost in extraction).
1242 template <typename T>
1244  return v.raw;
1245 }
1246 
1247 template <typename T>
1248 HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
1249  HWY_DASSERT(i == 0);
1250  (void)i;
1251  return v.raw;
1252 }
1253 
// Overwrites lane i (must be 0) with `t` and returns the updated vector.
// NOTE(review): signature line lost in extraction — see upstream header.
1254 template <typename T>
1256  HWY_DASSERT(i == 0);
1257  (void)i;
1258  v.raw = t;
1259  return v;
1260 }
1261 
// DupEven / OddEven / OddEvenBlocks / SwapAdjacentBlocks: with a single lane
// (and a single block), all of these reduce to returning one of the inputs
// unchanged. NOTE(review): signature lines were dropped by extraction.
1262 template <typename T>
1264  return v;
1265 }
1266 // DupOdd is unsupported.
1267 
1268 template <typename T>
1270  return even;
1271 }
1272 
1273 template <typename T>
1275  return even;
1276 }
1277 
1278 // ------------------------------ SwapAdjacentBlocks
1279 
1280 template <typename T>
1282  return v;
1283 }
1284 
1285 // ------------------------------ TableLookupLanes
1286 
1287 // Returned by SetTableIndices for use by TableLookupLanes.
// Indices1 holds the (single) lane index; IndicesFromVec validates it is 0,
// SetTableIndices loads it from memory, and TableLookupLanes is the identity
// because only one lane exists. NOTE(review): extraction dropped the struct's
// member declaration (upstream: MakeSigned<T> raw;) and several signature
// lines — restore from the original header.
1288 template <typename T>
1289 struct Indices1 {
1291 };
1292 
1293 template <typename T, typename TI>
1295  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
1296  HWY_DASSERT(vec.raw == 0);
1297  return Indices1<T>{vec.raw};
1298 }
1299 
1300 template <typename T, typename TI>
1302  return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
1303 }
1304 
1305 template <typename T>
1307  return v;
1308 }
1309 
1310 // ------------------------------ ReverseBlocks
1311 
1312 // Single block: no change
// Single block: reversing block order is the identity (signature line lost).
1313 template <typename T>
1315  return v;
1316 }
1317 
1318 // ------------------------------ Reverse
1319 
// Reverse and the Reverse2/4/8 variants: one lane, so each is the identity.
// Per the comment below, the 2/4/8 variants must not be called on scalar.
// NOTE(review): signature lines were dropped by extraction.
1320 template <typename T>
1322  return v;
1323 }
1324 
1325 // Must not be called:
1326 template <typename T>
1328  return v;
1329 }
1330 
1331 template <typename T>
1333  return v;
1334 }
1335 
1336 template <typename T>
1338  return v;
1339 }
1340 
1341 // ================================================== BLOCKWISE
1342 // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1343 
1344 // ------------------------------ Broadcast/splat any lane
1345 
// Broadcast lane kLane to all lanes; only lane 0 exists, so this is the
// identity (enforced at compile time). Signature line lost in extraction.
1346 template <int kLane, typename T>
1348  static_assert(kLane == 0, "Scalar only has one lane");
1349  return v;
1350 }
1351 
1352 // ------------------------------ TableLookupBytes, TableLookupBytesOr0
1353 
// Byte shuffle within a single lane: each output byte i is the input byte
// selected by indices' byte i. NOTE(review): signature line was dropped by
// extraction; out-of-range index bytes would read past in_bytes — the caller
// contract presumably restricts them to [0, sizeof(T)) — TODO confirm.
1354 template <typename T, typename TI>
1356  uint8_t in_bytes[sizeof(T)];
1357  uint8_t idx_bytes[sizeof(T)];
1358  uint8_t out_bytes[sizeof(T)];
1359  CopyBytes<sizeof(T)>(&in, &in_bytes);
1360  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1361  for (size_t i = 0; i < sizeof(T); ++i) {
1362  out_bytes[i] = in_bytes[idx_bytes[i]];
1363  }
1364  TI out;
1365  CopyBytes<sizeof(TI)>(&out_bytes, &out);
1366  return Vec1<TI>{out};
1367 }
1368 
// As TableLookupBytes, but an index byte with the high bit (0x80) set yields
// a zero output byte instead of a lookup. Signature line lost in extraction.
1369 template <typename T, typename TI>
1371  uint8_t in_bytes[sizeof(T)];
1372  uint8_t idx_bytes[sizeof(T)];
1373  uint8_t out_bytes[sizeof(T)];
1374  CopyBytes<sizeof(T)>(&in, &in_bytes);
1375  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1376  for (size_t i = 0; i < sizeof(T); ++i) {
1377  out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1378  }
1379  TI out;
1380  CopyBytes<sizeof(TI)>(&out_bytes, &out);
1381  return Vec1<TI>{out};
1382 }
1383 
1384 // ------------------------------ ZipLower
1385 
// ZipLower overloads (u8/u16/u32 and i8/i16/i32): interleave `a` (low half)
// with `b` (high half) into one double-width lane, i.e. (b << width) + a.
// NOTE(review): the extraction dropped every overload's signature line; only
// bodies and trailing parameter lines remain — restore from upstream.
1387  return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
1388 }
1390  const Vec1<uint16_t> b) {
1391  return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
1392 }
1394  const Vec1<uint32_t> b) {
1395  return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
1396 }
1398  return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
1399 }
1401  return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
1402 }
1404  return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
1405 }
1406 
1407 template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
1408 HWY_API VW ZipLower(Sisd<TW> /* tag */, Vec1<T> a, Vec1<T> b) {
1409  return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
1410 }
1411 
1412 // ================================================== MASK
1413 
1414 template <typename T>
1415 HWY_API bool AllFalse(Sisd<T> /* tag */, const Mask1<T> mask) {
1416  return mask.bits == 0;
1417 }
1418 
1419 template <typename T>
1420 HWY_API bool AllTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1421  return mask.bits != 0;
1422 }
1423 
1424 // `p` points to at least 8 readable bytes, not all of which need be valid.
// Builds a mask from bit 0 of bits[0]; remaining bits/bytes are ignored.
// NOTE(review): the opening of the signature was lost in extraction.
1425 template <typename T>
1427  const uint8_t* HWY_RESTRICT bits) {
1428  return Mask1<T>::FromBool((bits[0] & 1) != 0);
1429 }
1430 
1431 // `p` points to at least 8 writable bytes.
1432 template <typename T>
1433 HWY_API size_t StoreMaskBits(Sisd<T> d, const Mask1<T> mask, uint8_t* bits) {
1434  *bits = AllTrue(d, mask);
1435  return 1;
1436 }
1437 
1438 template <typename T>
1439 HWY_API size_t CountTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1440  return mask.bits == 0 ? 0 : 1;
1441 }
1442 
1443 template <typename T>
1444 HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1445  return mask.bits == 0 ? -1 : 0;
1446 }
1447 
1448 // ------------------------------ Compress, CompressBits
1449 
// Trait consumed by generic code: nonzero `value` means Compress produces a
// partition of the lanes (trivially true for a single lane). Kept as an enum
// deliberately; metaprogramming elsewhere may depend on its exact form.
1450 template <typename T>
1451 struct CompressIsPartition {
1452  enum { value = 1 };
1453 };
1454 
// Compress / CompressNot: with one lane there is nothing to move, so both
// return the input. NOTE(review): signature lines lost in extraction.
1455 template <typename T>
1457  // A single lane is already partitioned by definition.
1458  return v;
1459 }
1460 
1461 template <typename T>
1463  // A single lane is already partitioned by definition.
1464  return v;
1465 }
1466 
1467 // ------------------------------ CompressStore
// CompressStore: always stores the lane and returns how many lanes were
// selected (0 or 1). CompressBlendedStore: stores only when the mask is set,
// leaving memory untouched otherwise. NOTE(review): both signatures were
// truncated by extraction — only the trailing parameter lines remain.
1468 template <typename T>
1470  T* HWY_RESTRICT unaligned) {
1471  StoreU(Compress(v, mask), d, unaligned);
1472  return CountTrue(d, mask);
1473 }
1474 
1475 // ------------------------------ CompressBlendedStore
1476 template <typename T>
1478  T* HWY_RESTRICT unaligned) {
1479  if (!mask.bits) return 0;
1480  StoreU(v, d, unaligned);
1481  return 1;
1482 }
1483 
1484 // ------------------------------ CompressBits
// With a single lane there is nothing to compress; `bits` is ignored.
1485 template <typename T>
1486 HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
1487  return v;
1488 }
1489 
1490 // ------------------------------ CompressBitsStore
1491 template <typename T>
1492 HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
1493  Sisd<T> d, T* HWY_RESTRICT unaligned) {
1494  const Mask1<T> mask = LoadMaskBits(d, bits);
1495  StoreU(Compress(v, mask), d, unaligned);
1496  return CountTrue(d, mask);
1497 }
1498 
1499 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1500 
// Widens both bf16 inputs to float and accumulates a*b into sum0 via MulAdd;
// sum1 is unused for the single-lane case. NOTE(review): the opening of the
// signature was lost in extraction — restore from upstream.
1502  Vec1<bfloat16_t> a,
1503  Vec1<bfloat16_t> b,
1504  const Vec1<float> sum0,
1505  Vec1<float>& /* sum1 */) {
1506  return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
1507  Vec1<float>(F32FromBF16(b.raw)), sum0);
1508 }
1509 
1510 // ================================================== REDUCTIONS
1511 
1512 // Sum of all lanes, i.e. the only one.
// SumOfLanes / MinOfLanes / MaxOfLanes: reductions over one lane are the
// identity. NOTE(review): signature lines lost in extraction.
1513 template <typename T>
1515  return v;
1516 }
1517 template <typename T>
1519  return v;
1520 }
1521 template <typename T>
1523  return v;
1524 }
1525 
1526 // ================================================== Operator wrapper
1527 
1528 template <class V>
1529 HWY_API V Add(V a, V b) {
1530  return a + b;
1531 }
1532 template <class V>
1533 HWY_API V Sub(V a, V b) {
1534  return a - b;
1535 }
1536 
1537 template <class V>
1538 HWY_API V Mul(V a, V b) {
1539  return a * b;
1540 }
1541 template <class V>
1542 HWY_API V Div(V a, V b) {
1543  return a / b;
1544 }
1545 
// Named wrapper for operator<< (shift left by per-lane amounts).
template <class V>
V Shl(V a, V b) {
  V shifted = a << b;
  return shifted;
}
// Named wrapper for operator>> (shift right by per-lane amounts).
template <class V>
V Shr(V a, V b) {
  V shifted = a >> b;
  return shifted;
}
1554 
1555 template <class V>
1556 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1557  return a == b;
1558 }
1559 template <class V>
1560 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
1561  return a != b;
1562 }
1563 template <class V>
1564 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1565  return a < b;
1566 }
1567 
1568 template <class V>
1569 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1570  return a > b;
1571 }
1572 template <class V>
1573 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1574  return a >= b;
1575 }
1576 
1577 template <class V>
1578 HWY_API auto Le(V a, V b) -> decltype(a == b) {
1579  return a <= b;
1580 }
1581 
1582 // NOLINTNEXTLINE(google-readability-namespace-comments)
1583 } // namespace HWY_NAMESPACE
1584 } // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DASSERT(condition)
Definition: base.h:191
Definition: scalar-inl.h:68
Raw bits
Definition: scalar-inl.h:78
hwy::MakeUnsigned< T > Raw
Definition: scalar-inl.h:69
static HWY_INLINE Mask1< T > FromBool(bool b)
Definition: scalar-inl.h:72
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
V Ceiling(const V v)
Definition: scalar-inl.h:769
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:831
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:839
constexpr float HighestValue< float >()
Definition: base.h:580
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
constexpr float LowestValue< float >()
Definition: base.h:567
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: scalar-inl.h:1289
MakeSigned< T > raw
Definition: scalar-inl.h:1290
Definition: ops/shared-inl.h:40
Definition: scalar-inl.h:35
T raw
Definition: scalar-inl.h:63
Vec1 & operator=(const Vec1 &)=default
HWY_INLINE Vec1 & operator*=(const Vec1 other)
Definition: scalar-inl.h:41
HWY_INLINE Vec1 & operator-=(const Vec1 other)
Definition: scalar-inl.h:50
HWY_INLINE Vec1 & operator+=(const Vec1 other)
Definition: scalar-inl.h:47
Vec1(const Vec1 &)=default
HWY_INLINE Vec1()=default
HWY_INLINE Vec1 & operator/=(const Vec1 other)
Definition: scalar-inl.h:44
HWY_INLINE Vec1 & operator&=(const Vec1 other)
Definition: scalar-inl.h:53
HWY_INLINE Vec1(const T t)
Definition: scalar-inl.h:39
HWY_INLINE Vec1 & operator^=(const Vec1 other)
Definition: scalar-inl.h:59
HWY_INLINE Vec1 & operator|=(const Vec1 other)
Definition: scalar-inl.h:56
Definition: scalar-inl.h:84
Sisd< T > operator()(Vec1< T >) const
Definition: scalar-inl.h:86
HWY_INLINE Vec1< T > operator()(const Vec1< T > v) const
Definition: scalar-inl.h:393
Definition: emu128-inl.h:438
HWY_INLINE Vec1< T > operator()(const Vec1< T > v) const
Definition: scalar-inl.h:385