Grok  10.0.3
base.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #ifndef HIGHWAY_HWY_BASE_H_
17 #define HIGHWAY_HWY_BASE_H_
18 
19 // For SIMD module implementations and their callers, target-independent.
20 
21 #include <stddef.h>
22 #include <stdint.h>
23 
25 #include "hwy/highway_export.h"
26 
27 #if HWY_ARCH_X86
28 #include <atomic>
29 #endif
30 
31 //------------------------------------------------------------------------------
32 // Compiler-specific definitions
33 
34 #define HWY_STR_IMPL(macro) #macro
35 #define HWY_STR(macro) HWY_STR_IMPL(macro)
36 
37 #if HWY_COMPILER_MSVC
38 
39 #include <intrin.h>
40 
41 #define HWY_RESTRICT __restrict
42 #define HWY_INLINE __forceinline
43 #define HWY_NOINLINE __declspec(noinline)
44 #define HWY_FLATTEN
45 #define HWY_NORETURN __declspec(noreturn)
46 #define HWY_LIKELY(expr) (expr)
47 #define HWY_UNLIKELY(expr) (expr)
48 #define HWY_PRAGMA(tokens) __pragma(tokens)
49 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
50 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
51 #define HWY_MAYBE_UNUSED
52 #define HWY_HAS_ASSUME_ALIGNED 0
53 #if (_MSC_VER >= 1700)
54 #define HWY_MUST_USE_RESULT _Check_return_
55 #else
56 #define HWY_MUST_USE_RESULT
57 #endif
58 
59 #else
60 
61 #define HWY_RESTRICT __restrict__
62 #define HWY_INLINE inline __attribute__((always_inline))
63 #define HWY_NOINLINE __attribute__((noinline))
64 #define HWY_FLATTEN __attribute__((flatten))
65 #define HWY_NORETURN __attribute__((noreturn))
66 #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
67 #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
68 #define HWY_PRAGMA(tokens) _Pragma(#tokens)
69 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
70 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
71 // Encountered "attribute list cannot appear here" when using the C++17
72 // [[maybe_unused]], so only use the old style attribute for now.
73 #define HWY_MAYBE_UNUSED __attribute__((unused))
74 #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
75 
76 #endif // !HWY_COMPILER_MSVC
77 
78 //------------------------------------------------------------------------------
79 // Builtin/attributes
80 
81 // Enables error-checking of format strings.
82 #if HWY_HAS_ATTRIBUTE(__format__)
83 #define HWY_FORMAT(idx_fmt, idx_arg) \
84  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
85 #else
86 #define HWY_FORMAT(idx_fmt, idx_arg)
87 #endif
88 
89 // Returns a void* pointer which the compiler then assumes is N-byte aligned.
90 // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
91 //
92 // The assignment semantics are required by GCC/Clang. ICC provides an in-place
93 // __assume_aligned, whereas MSVC's __assume appears unsuitable.
94 #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
95 #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
96 #else
97 #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
98 #endif
99 
100 // Clang and GCC require attributes on each function into which SIMD intrinsics
101 // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
102 // automatic annotation via pragmas.
103 #if HWY_COMPILER_CLANG
104 #define HWY_PUSH_ATTRIBUTES(targets_str) \
105  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
106  apply_to = function))
107 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
108 #elif HWY_COMPILER_GCC
109 #define HWY_PUSH_ATTRIBUTES(targets_str) \
110  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
111 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
112 #else
113 #define HWY_PUSH_ATTRIBUTES(targets_str)
114 #define HWY_POP_ATTRIBUTES
115 #endif
116 
117 //------------------------------------------------------------------------------
118 // Macros
119 
120 #define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
121 
122 #define HWY_CONCAT_IMPL(a, b) a##b
123 #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
124 
125 #define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
126 #define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
127 
128 // Compile-time fence to prevent undesirable code reordering. On Clang x86, the
129 // typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
130 // does, without generating code.
131 #if HWY_ARCH_X86
132 #define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
133 #else
134 // TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
135 #define HWY_FENCE
136 #endif
137 
138 // 4 instances of a given literal value, useful as input to LoadDup128.
139 #define HWY_REP4(literal) literal, literal, literal, literal
140 
141 #define HWY_ABORT(format, ...) \
142  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
143 
144 // Always enabled.
145 #define HWY_ASSERT(condition) \
146  do { \
147  if (!(condition)) { \
148  HWY_ABORT("Assert %s", #condition); \
149  } \
150  } while (0)
151 
152 #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
153 #define HWY_IS_MSAN 1
154 #else
155 #define HWY_IS_MSAN 0
156 #endif
157 
158 #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
159 #define HWY_IS_ASAN 1
160 #else
161 #define HWY_IS_ASAN 0
162 #endif
163 
164 #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
165 #define HWY_IS_TSAN 1
166 #else
167 #define HWY_IS_TSAN 0
168 #endif
169 
170 // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
171 // You can disable MSAN by adding this attribute to the function that fails.
172 #if HWY_IS_MSAN
173 #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
174 #else
175 #define HWY_ATTR_NO_MSAN
176 #endif
177 
178 // For enabling HWY_DASSERT and shortening tests in slower debug builds
179 #if !defined(HWY_IS_DEBUG_BUILD)
180 // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
181 // MSVC defines NDEBUG (if not, could instead check _DEBUG).
182 #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
183  HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
184 #define HWY_IS_DEBUG_BUILD 1
185 #else
186 #define HWY_IS_DEBUG_BUILD 0
187 #endif
188 #endif // HWY_IS_DEBUG_BUILD
189 
190 #if HWY_IS_DEBUG_BUILD
191 #define HWY_DASSERT(condition) HWY_ASSERT(condition)
192 #else
193 #define HWY_DASSERT(condition) \
194  do { \
195  } while (0)
196 #endif
197 
198 namespace hwy {
199 
200 //------------------------------------------------------------------------------
201 // kMaxVectorSize (undocumented, pending removal)
202 
203 #if HWY_ARCH_X86
204 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
205 #elif HWY_ARCH_RVV && defined(__riscv_vector)
206 // Not actually an upper bound on the size.
207 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
208 #else
209 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
210 #endif
211 
212 //------------------------------------------------------------------------------
213 // Alignment
214 
215 // Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
216 // should be allocated dynamically via aligned_allocator.h because Lanes() may
217 // exceed the stack size.
218 #if HWY_ARCH_X86
219 #define HWY_ALIGN_MAX alignas(64)
220 #elif HWY_ARCH_RVV && defined(__riscv_vector)
221 #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
222 #else
223 #define HWY_ALIGN_MAX alignas(16)
224 #endif
225 
226 //------------------------------------------------------------------------------
227 // Lane types
228 
229 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
230 // by concatenating base type and bits.
231 
232 #if HWY_ARCH_ARM && (__ARM_FP & 2)
233 #define HWY_NATIVE_FLOAT16 1
234 #else
235 #define HWY_NATIVE_FLOAT16 0
236 #endif
237 
238 #pragma pack(push, 1)
239 
240 #if HWY_NATIVE_FLOAT16
241 using float16_t = __fp16;
242 // Clang does not allow __fp16 arguments, but scalar.h requires LaneType
243 // arguments, so use a wrapper.
244 // TODO(janwas): replace with _Float16 when that is supported?
245 #else
246 struct float16_t {
247  uint16_t bits;
248 };
249 #endif
250 
251 struct bfloat16_t {
252  uint16_t bits;
253 };
254 
255 #pragma pack(pop)
256 
257 using float32_t = float;
258 using float64_t = double;
259 
260 #pragma pack(push, 1)
261 
262 // Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
263 // https://reviews.llvm.org/D86310
264 struct alignas(16) uint128_t {
265  uint64_t lo; // little-endian layout
266  uint64_t hi;
267 };
268 
269 // 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
270 // field is to be compared (Lt128Upper instead of Lt128).
271 struct alignas(16) K64V64 {
272  uint64_t value; // little-endian layout
273  uint64_t key;
274 };
275 
276 #pragma pack(pop)
277 
278 static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
279  const uint128_t& b) {
280  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
281 }
282 // Required for std::greater.
283 static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
284  const uint128_t& b) {
285  return b < a;
286 }
287 
288 static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
289  const K64V64& b) {
290  return a.key < b.key;
291 }
292 // Required for std::greater.
293 static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
294  const K64V64& b) {
295  return b < a;
296 }
297 
298 //------------------------------------------------------------------------------
299 // Controlling overload resolution (SFINAE)
300 
// Primary template: has no nested `type`, so naming EnableIfT<false>::type is
// a substitution failure that removes the overload (SFINAE).
template <bool Condition>
struct EnableIfT {};
// Specialization for a true condition: exposes `type` (void).
template <>
struct EnableIfT<true> {
  using type = void;
};

// Shorthand for EnableIfT<Condition>::type. Typically used as a defaulted
// pointer template argument: `EnableIf<cond>* = nullptr`.
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
310 
311 template <typename T, typename U>
312 struct IsSameT {
313  enum { value = 0 };
314 };
315 
316 template <typename T>
317 struct IsSameT<T, T> {
318  enum { value = 1 };
319 };
320 
321 template <typename T, typename U>
322 HWY_API constexpr bool IsSame() {
323  return IsSameT<T, U>::value;
324 }
325 
326 // Insert into template/function arguments to enable this overload only for
327 // vectors of AT MOST this many bits.
328 //
329 // Note that enabling for exactly 128 bits is unnecessary because a function can
330 // simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
331 // sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
332 #define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
333 #define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
334 #define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
335 #define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
336 #define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
337 #define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
338 #define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr
339 
// Enable an overload depending on the category of lane type T. All helpers are
// qualified with hwy:: so that the macros also expand correctly at call sites
// outside namespace hwy.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
345 
346 #define HWY_IF_LANE_SIZE(T, bytes) \
347  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
348 #define HWY_IF_NOT_LANE_SIZE(T, bytes) \
349  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
350 #define HWY_IF_LANE_SIZE_LT(T, bytes) \
351  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
352 
353 #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
354  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
355 
356 // Empty struct used as a size tag type.
357 template <size_t N>
358 struct SizeTag {};
359 
// Strips a top-level `const` from T. Primary template: T is not const, so it
// is passed through unchanged.
template <class T>
struct RemoveConstT {
  using type = T;
};
// Specialization matching `const T`: yields the unqualified T.
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

// Shorthand for RemoveConstT<T>::type.
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
371 
372 //------------------------------------------------------------------------------
373 // Type relations
374 
375 namespace detail {
376 
377 template <typename T>
378 struct Relations;
379 template <>
380 struct Relations<uint8_t> {
381  using Unsigned = uint8_t;
382  using Signed = int8_t;
383  using Wide = uint16_t;
384 };
385 template <>
386 struct Relations<int8_t> {
387  using Unsigned = uint8_t;
388  using Signed = int8_t;
389  using Wide = int16_t;
390 };
391 template <>
392 struct Relations<uint16_t> {
393  using Unsigned = uint16_t;
394  using Signed = int16_t;
395  using Wide = uint32_t;
396  using Narrow = uint8_t;
397 };
398 template <>
399 struct Relations<int16_t> {
400  using Unsigned = uint16_t;
401  using Signed = int16_t;
402  using Wide = int32_t;
403  using Narrow = int8_t;
404 };
405 template <>
406 struct Relations<uint32_t> {
407  using Unsigned = uint32_t;
408  using Signed = int32_t;
409  using Float = float;
410  using Wide = uint64_t;
411  using Narrow = uint16_t;
412 };
413 template <>
414 struct Relations<int32_t> {
415  using Unsigned = uint32_t;
416  using Signed = int32_t;
417  using Float = float;
418  using Wide = int64_t;
419  using Narrow = int16_t;
420 };
421 template <>
422 struct Relations<uint64_t> {
423  using Unsigned = uint64_t;
424  using Signed = int64_t;
425  using Float = double;
426  using Wide = uint128_t;
427  using Narrow = uint32_t;
428 };
429 template <>
430 struct Relations<int64_t> {
431  using Unsigned = uint64_t;
432  using Signed = int64_t;
433  using Float = double;
434  using Narrow = int32_t;
435 };
436 template <>
439  using Narrow = uint64_t;
440 };
441 template <>
443  using Unsigned = uint16_t;
444  using Signed = int16_t;
445  using Float = float16_t;
446  using Wide = float;
447 };
448 template <>
450  using Unsigned = uint16_t;
451  using Signed = int16_t;
452  using Wide = float;
453 };
454 template <>
455 struct Relations<float> {
456  using Unsigned = uint32_t;
457  using Signed = int32_t;
458  using Float = float;
459  using Wide = double;
460  using Narrow = float16_t;
461 };
462 template <>
463 struct Relations<double> {
464  using Unsigned = uint64_t;
465  using Signed = int64_t;
466  using Float = double;
467  using Narrow = float;
468 };
469 
470 template <size_t N>
472 template <>
473 struct TypeFromSize<1> {
474  using Unsigned = uint8_t;
475  using Signed = int8_t;
476 };
477 template <>
478 struct TypeFromSize<2> {
479  using Unsigned = uint16_t;
480  using Signed = int16_t;
481 };
482 template <>
483 struct TypeFromSize<4> {
484  using Unsigned = uint32_t;
485  using Signed = int32_t;
486  using Float = float;
487 };
488 template <>
489 struct TypeFromSize<8> {
490  using Unsigned = uint64_t;
491  using Signed = int64_t;
492  using Float = double;
493 };
494 template <>
495 struct TypeFromSize<16> {
497 };
498 
499 } // namespace detail
500 
501 // Aliases for types of a different category, but the same size.
502 template <typename T>
504 template <typename T>
506 template <typename T>
508 
509 // Aliases for types of the same category, but different size.
510 template <typename T>
512 template <typename T>
514 
515 // Obtain type from its size [bytes].
516 template <size_t N>
518 template <size_t N>
520 template <size_t N>
522 
523 //------------------------------------------------------------------------------
524 // Type traits
525 
526 template <typename T>
527 HWY_API constexpr bool IsFloat() {
528  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
529  // from a float, not compared.
530  return IsSame<T, float>() || IsSame<T, double>();
531 }
532 
533 template <typename T>
534 HWY_API constexpr bool IsSigned() {
535  return T(0) > T(-1);
536 }
537 template <>
538 constexpr bool IsSigned<float16_t>() {
539  return true;
540 }
541 template <>
542 constexpr bool IsSigned<bfloat16_t>() {
543  return true;
544 }
545 
546 // Largest/smallest representable integer values.
547 template <typename T>
548 HWY_API constexpr T LimitsMax() {
549  static_assert(!IsFloat<T>(), "Only for integer types");
550  using TU = MakeUnsigned<T>;
551  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
552  : static_cast<TU>(~0ull));
553 }
554 template <typename T>
555 HWY_API constexpr T LimitsMin() {
556  static_assert(!IsFloat<T>(), "Only for integer types");
557  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
558 }
559 
560 // Largest/smallest representable value (integer or float). This naming avoids
561 // confusion with numeric_limits<float>::min() (the smallest positive value).
562 template <typename T>
563 HWY_API constexpr T LowestValue() {
564  return LimitsMin<T>();
565 }
566 template <>
567 constexpr float LowestValue<float>() {
568  return -3.402823466e+38F;
569 }
570 template <>
571 constexpr double LowestValue<double>() {
572  return -1.7976931348623158e+308;
573 }
574 
575 template <typename T>
576 HWY_API constexpr T HighestValue() {
577  return LimitsMax<T>();
578 }
579 template <>
580 constexpr float HighestValue<float>() {
581  return 3.402823466e+38F;
582 }
583 template <>
584 constexpr double HighestValue<double>() {
585  return 1.7976931348623158e+308;
586 }
587 
588 // Returns width in bits of the mantissa field in IEEE binary32/64.
589 template <typename T>
590 constexpr int MantissaBits() {
591  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
592  return 0;
593 }
594 template <>
595 constexpr int MantissaBits<float>() {
596  return 23;
597 }
598 template <>
599 constexpr int MantissaBits<double>() {
600  return 52;
601 }
602 
603 // Returns the (left-shifted by one bit) IEEE binary32/64 representation with
604 // the largest possible (biased) exponent field. Used by IsInf.
605 template <typename T>
607  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
608 }
609 
610 // Returns bitmask of the sign bit in IEEE binary32/64.
611 template <typename T>
613  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
614 }
615 
616 // Returns bitmask of the exponent field in IEEE binary32/64.
617 template <typename T>
619  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
620 }
621 
622 // Returns bitmask of the mantissa field in IEEE binary32/64.
623 template <typename T>
625  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
626 }
627 
628 // Returns 1 << mantissa_bits as a floating-point number. All integers whose
629 // absolute value are less than this can be represented exactly.
630 template <typename T>
631 constexpr T MantissaEnd() {
632  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
633  return 0;
634 }
635 template <>
636 constexpr float MantissaEnd<float>() {
637  return 8388608.0f; // 1 << 23
638 }
639 template <>
640 constexpr double MantissaEnd<double>() {
641  // floating point literal with p52 requires C++17.
642  return 4503599627370496.0; // 1 << 52
643 }
644 
645 // Returns width in bits of the exponent field in IEEE binary32/64.
646 template <typename T>
647 constexpr int ExponentBits() {
648  // Exponent := remaining bits after deducting sign and mantissa.
649  return 8 * sizeof(T) - 1 - MantissaBits<T>();
650 }
651 
652 // Returns largest value of the biased exponent field in IEEE binary32/64,
653 // right-shifted so that the LSB is bit zero. Example: 0xFF for float.
654 // This is expressed as a signed integer for more efficient comparison.
655 template <typename T>
657  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
658 }
659 
660 //------------------------------------------------------------------------------
661 // Helper functions
662 
// Returns a / b rounded up, for non-negative a and positive b. The sum
// a + b - 1 is evaluated in the common type of T1 and T2, so it must not
// overflow; the quotient is converted to T1.
template <typename T1, typename T2>
constexpr T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}

// Rounds `what` up to the next multiple of `align`. Works for any `align`; if
// a power of two, compiler emits ADD+AND.
constexpr size_t RoundUpTo(size_t what, size_t align) {
  return align * DivCeil(what, align);
}
672 
673 // Undefined results for x == 0.
674 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
675 #if HWY_COMPILER_MSVC
676  unsigned long index; // NOLINT
677  _BitScanForward(&index, x);
678  return index;
679 #else // HWY_COMPILER_MSVC
680  return static_cast<size_t>(__builtin_ctz(x));
681 #endif // HWY_COMPILER_MSVC
682 }
683 
684 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
685 #if HWY_COMPILER_MSVC
686 #if HWY_ARCH_X86_64
687  unsigned long index; // NOLINT
688  _BitScanForward64(&index, x);
689  return index;
690 #else // HWY_ARCH_X86_64
691  // _BitScanForward64 not available
692  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
693  unsigned long index; // NOLINT
694  if (lsb == 0) {
695  uint32_t msb = static_cast<uint32_t>(x >> 32u);
696  _BitScanForward(&index, msb);
697  return 32 + index;
698  } else {
699  _BitScanForward(&index, lsb);
700  return index;
701  }
702 #endif // HWY_ARCH_X86_64
703 #else // HWY_COMPILER_MSVC
704  return static_cast<size_t>(__builtin_ctzll(x));
705 #endif // HWY_COMPILER_MSVC
706 }
707 
708 // Undefined results for x == 0.
709 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
710 #if HWY_COMPILER_MSVC
711  unsigned long index; // NOLINT
712  _BitScanReverse(&index, x);
713  return 31 - index;
714 #else // HWY_COMPILER_MSVC
715  return static_cast<size_t>(__builtin_clz(x));
716 #endif // HWY_COMPILER_MSVC
717 }
718 
719 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
720 #if HWY_COMPILER_MSVC
721 #if HWY_ARCH_X86_64
722  unsigned long index; // NOLINT
723  _BitScanReverse64(&index, x);
724  return 63 - index;
725 #else // HWY_ARCH_X86_64
726  // _BitScanReverse64 not available
727  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
728  unsigned long index; // NOLINT
729  if (msb == 0) {
730  const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
731  _BitScanReverse(&index, lsb);
732  return 63 - index;
733  } else {
734  _BitScanReverse(&index, msb);
735  return 31 - index;
736  }
737 #endif // HWY_ARCH_X86_64
738 #else // HWY_COMPILER_MSVC
739  return static_cast<size_t>(__builtin_clzll(x));
740 #endif // HWY_COMPILER_MSVC
741 }
742 
743 HWY_API size_t PopCount(uint64_t x) {
744 #if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
745  return static_cast<size_t>(__builtin_popcountll(x));
746  // This instruction has a separate feature flag, but is often called from
747  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
748  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
749  // for AVX, so check for that.
750 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
751  return _mm_popcnt_u64(x);
752 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
753  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
754  _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
755 #else
756  x -= ((x >> 1) & 0x5555555555555555ULL);
757  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
758  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
759  x += (x >> 8);
760  x += (x >> 16);
761  x += (x >> 32);
762  return static_cast<size_t>(x & 0x7Fu);
763 #endif
764 }
765 
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
//
// Returns floor(log2(x)) by counting how many right-shifts reduce x to 1.
// x must be positive: the recursion only stops once the value equals 1.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x != TI{1}
             ? static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1)
             : 0;
}

// Returns ceil(log2(x)): zero for x == 1, otherwise one more than
// FloorLog2(x - 1). x must be positive.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x != TI{1}
             ? static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1)
             : 0;
}
782 
783 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
784 #pragma intrinsic(_umul128)
785 #endif
786 
787 // 64 x 64 = 128 bit multiplication
788 HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
789 #if defined(__SIZEOF_INT128__)
790  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
791  *upper = (uint64_t)(product >> 64);
792  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
793 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
794  return _umul128(a, b, upper);
795 #else
796  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
797  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
798  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
799  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
800  const uint64_t hi_hi = (a >> 32) * (b >> 32);
801  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
802  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
803  return (t << 32) | (lo_lo & kLo32);
804 #endif
805 }
806 
807 #if HWY_COMPILER_MSVC
808 #pragma intrinsic(memcpy)
809 #pragma intrinsic(memset)
810 #endif
811 
812 // The source/destination must not overlap/alias.
813 template <size_t kBytes, typename From, typename To>
814 HWY_API void CopyBytes(const From* from, To* to) {
815 #if HWY_COMPILER_MSVC
816  memcpy(to, from, kBytes);
817 #else
818  __builtin_memcpy(to, from, kBytes);
819 #endif
820 }
821 
822 template <size_t kBytes, typename To>
823 HWY_API void ZeroBytes(To* to) {
824 #if HWY_COMPILER_MSVC
825  memset(to, 0, kBytes);
826 #else
827  __builtin_memset(to, 0, kBytes);
828 #endif
829 }
830 
832  uint32_t bits = bf.bits;
833  bits <<= 16;
834  float f;
835  CopyBytes<4>(&bits, &f);
836  return f;
837 }
838 
840  uint32_t bits;
841  CopyBytes<4>(&f, &bits);
842  bfloat16_t bf;
843  bf.bits = static_cast<uint16_t>(bits >> 16);
844  return bf;
845 }
846 
848  Abort(const char* file, int line, const char* format, ...);
849 
850 } // namespace hwy
851 
852 #endif // HIGHWAY_HWY_BASE_H_
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_NORETURN
Definition: base.h:65
#define HWY_API
Definition: base.h:120
#define HWY_MAYBE_UNUSED
Definition: base.h:73
#define HWY_DLLEXPORT
Definition: highway_export.h:13
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
constexpr T MantissaEnd()
Definition: base.h:631
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:831
HWY_API void ZeroBytes(To *to)
Definition: base.h:823
constexpr HWY_API T LimitsMin()
Definition: base.h:555
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:839
typename detail::TypeFromSize< N >::Float FloatFromSize
Definition: base.h:521
typename RemoveConstT< T >::type RemoveConst
Definition: base.h:370
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
constexpr float HighestValue< float >()
Definition: base.h:580
constexpr HWY_API T LimitsMax()
Definition: base.h:548
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition: base.h:519
constexpr T1 DivCeil(T1 a, T2 b)
Definition: base.h:664
constexpr float MantissaEnd< float >()
Definition: base.h:636
double float64_t
Definition: base.h:258
constexpr bool IsSigned< bfloat16_t >()
Definition: base.h:542
constexpr MakeUnsigned< T > MantissaMask()
Definition: base.h:624
constexpr size_t FloorLog2(TI x)
Definition: base.h:770
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:709
constexpr bool IsSigned< float16_t >()
Definition: base.h:538
constexpr double HighestValue< double >()
Definition: base.h:584
constexpr int MantissaBits< double >()
Definition: base.h:599
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
constexpr HWY_API bool IsFloat()
Definition: base.h:527
static HWY_MAYBE_UNUSED bool operator>(const uint128_t &a, const uint128_t &b)
Definition: base.h:283
float float32_t
Definition: base.h:257
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
constexpr double MantissaEnd< double >()
Definition: base.h:640
constexpr int MantissaBits()
Definition: base.h:590
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:674
constexpr float LowestValue< float >()
Definition: base.h:567
constexpr HWY_API bool IsSame()
Definition: base.h:322
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:719
constexpr double LowestValue< double >()
Definition: base.h:571
static HWY_MAYBE_UNUSED bool operator<(const uint128_t &a, const uint128_t &b)
Definition: base.h:278
constexpr MakeSigned< T > MaxExponentField()
Definition: base.h:656
constexpr HWY_API T LowestValue()
Definition: base.h:563
constexpr HWY_API T HighestValue()
Definition: base.h:576
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize
Definition: base.h:209
constexpr HWY_API bool IsSigned()
Definition: base.h:534
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Float MakeFloat
Definition: base.h:507
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
constexpr int MantissaBits< float >()
Definition: base.h:595
constexpr MakeUnsigned< T > SignMask()
Definition: base.h:612
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char *file
HWY_DLLEXPORT HWY_NORETURN void int const char * format
Definition: base.h:848
HWY_DLLEXPORT HWY_NORETURN void int line
Definition: base.h:848
constexpr size_t RoundUpTo(size_t what, size_t align)
Definition: base.h:669
typename detail::Relations< T >::Narrow MakeNarrow
Definition: base.h:513
constexpr MakeUnsigned< T > ExponentMask()
Definition: base.h:618
constexpr int ExponentBits()
Definition: base.h:647
constexpr MakeSigned< T > MaxExponentTimes2()
Definition: base.h:606
void type
Definition: base.h:305
Definition: base.h:302
Definition: base.h:312
@ value
Definition: base.h:313
Definition: base.h:271
uint64_t value
Definition: base.h:272
uint64_t key
Definition: base.h:273
T type
Definition: base.h:366
Definition: base.h:361
T type
Definition: base.h:362
Definition: base.h:358
Definition: base.h:251
uint16_t bits
Definition: base.h:252
int16_t Signed
Definition: base.h:451
float Wide
Definition: base.h:452
uint16_t Unsigned
Definition: base.h:450
double Float
Definition: base.h:466
uint64_t Unsigned
Definition: base.h:464
int64_t Signed
Definition: base.h:465
float Narrow
Definition: base.h:467
int16_t Signed
Definition: base.h:444
float Wide
Definition: base.h:446
uint16_t Unsigned
Definition: base.h:443
uint32_t Unsigned
Definition: base.h:456
double Wide
Definition: base.h:459
float Float
Definition: base.h:458
int32_t Signed
Definition: base.h:457
uint16_t Unsigned
Definition: base.h:400
int16_t Signed
Definition: base.h:401
int32_t Wide
Definition: base.h:402
int8_t Narrow
Definition: base.h:403
uint32_t Unsigned
Definition: base.h:415
int64_t Wide
Definition: base.h:418
float Float
Definition: base.h:417
int16_t Narrow
Definition: base.h:419
int32_t Signed
Definition: base.h:416
int32_t Narrow
Definition: base.h:434
double Float
Definition: base.h:433
uint64_t Unsigned
Definition: base.h:431
int64_t Signed
Definition: base.h:432
int16_t Wide
Definition: base.h:389
int8_t Signed
Definition: base.h:388
uint8_t Unsigned
Definition: base.h:387
uint64_t Narrow
Definition: base.h:439
uint8_t Narrow
Definition: base.h:396
int16_t Signed
Definition: base.h:394
uint32_t Wide
Definition: base.h:395
uint16_t Unsigned
Definition: base.h:393
uint32_t Unsigned
Definition: base.h:407
uint64_t Wide
Definition: base.h:410
uint16_t Narrow
Definition: base.h:411
float Float
Definition: base.h:409
int32_t Signed
Definition: base.h:408
uint32_t Narrow
Definition: base.h:427
int64_t Signed
Definition: base.h:424
uint64_t Unsigned
Definition: base.h:423
double Float
Definition: base.h:425
int8_t Signed
Definition: base.h:382
uint8_t Unsigned
Definition: base.h:381
uint16_t Wide
Definition: base.h:383
Definition: base.h:378
int8_t Signed
Definition: base.h:475
uint8_t Unsigned
Definition: base.h:474
int16_t Signed
Definition: base.h:480
uint16_t Unsigned
Definition: base.h:479
int32_t Signed
Definition: base.h:485
uint32_t Unsigned
Definition: base.h:484
float Float
Definition: base.h:486
double Float
Definition: base.h:492
int64_t Signed
Definition: base.h:491
uint64_t Unsigned
Definition: base.h:490
Definition: base.h:471
Definition: base.h:246
uint16_t bits
Definition: base.h:247
Definition: base.h:264
uint64_t lo
Definition: base.h:265
uint64_t hi
Definition: base.h:266