Grok  10.0.3
ops/shared-inl.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Per-target definitions shared by ops/*.h and user code.
17 
18 #include <cmath>
19 
20 #include "hwy/base.h"
21 
22 // Separate header because foreach_target.h re-enables its include guard.
23 #include "hwy/ops/set_macros-inl.h"
24 
25 // Relies on the external include guard in highway.h.
27 namespace hwy {
28 namespace HWY_NAMESPACE {
29 
// Highway operations are implemented as overloaded functions selected using an
// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
// shift count applied to scalable vectors. Instead of referring to Simd<>
// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
// full vector, or fractions/groups if the argument is negative/positive),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
// cap. For constexpr-size vectors, N is the actual number of lanes. This
// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
template <typename Lane, size_t N, int kPow2>
struct Simd {
  constexpr Simd() = default;
  using T = Lane;
  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");

  // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
  // warns when using enums and non-enums in the same expression. Cannot be
  // static constexpr function (another MSVC limitation).
  static constexpr size_t kPrivateN = N;
  static constexpr int kPrivatePow2 = kPow2;

  // Returns how many lanes of NewT are needed to hold the N * sizeof(T) bytes
  // described by this tag.
  template <typename NewT>
  static constexpr size_t NewN() {
    // Round up to correctly handle scalars with N=1.
    return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
  }

#if HWY_HAVE_SCALABLE
  // Returns the signed power-of-two adjustment between lane sizes: positive
  // when NewT is larger than T, otherwise the negated CeilLog2 of the inverse
  // ratio (zero for equal sizes).
  template <typename NewT>
  static constexpr int Pow2Ratio() {
    return (sizeof(NewT) > sizeof(T))
               ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
               : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
  }
#endif

  // Widening/narrowing ops change the number of lanes and/or their type.
  // To initialize such vectors, we need the corresponding tag types:

// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
#if HWY_HAVE_SCALABLE
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
#else
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2>;
#endif

  // Change lane type while keeping the same vector size, e.g. for MulEven.
  template <typename NewT>
  using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;

// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
// Round up to correctly handle scalars with N=1.
#if HWY_HAVE_SCALABLE
  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
#else
  using Half = Simd<T, (N + 1) / 2, kPow2>;
#endif

// Twice the lanes while keeping the same lane type, e.g. for Combine.
#if HWY_HAVE_SCALABLE
  using Twice = Simd<T, 2 * N, kPow2 + 1>;
#else
  using Twice = Simd<T, 2 * N, kPow2>;
#endif
};
99 
100 namespace detail {
101 
102 template <typename T, size_t N, int kPow2>
103 constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
104  return N == HWY_LANES(T) && kPow2 == 0;
105 }
106 
// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
// Positive shifts only enlarge N when HWY_TARGET == HWY_RVV; on every other
// target a non-negative pow2 returns N unchanged.
constexpr size_t ScaleByPower(size_t N, int pow2) {
#if HWY_TARGET == HWY_RVV
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#else
  return pow2 >= 0 ? N : (N >> (-pow2));
#endif
}
118 
119 // Struct wrappers enable validation of arguments via static_assert.
120 template <typename T, int kPow2>
122  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
123 #if HWY_TARGET == HWY_RVV
124  // Only RVV supports register groups.
125  using type = Simd<T, HWY_LANES(T), kPow2>;
126 #elif HWY_HAVE_SCALABLE
127  // For SVE[2], only allow full or fractions.
128  using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
129 #elif HWY_TARGET == HWY_SCALAR
130  using type = Simd<T, /*N=*/1, 0>;
131 #else
132  // Only allow full or fractions.
133  using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
134 #endif
135 };
136 
137 template <typename T, size_t kLimit>
139  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
140  // Safely handle non-power-of-two inputs by rounding down, which is allowed by
141  // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
142  static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
143  using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
144 };
145 
146 template <typename T, size_t kNumLanes>
148  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
149  static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
151 };
152 
153 } // namespace detail
154 
155 // Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
156 // e.g. 1D loops where the application does not care about the vector size) or a
157 // fraction/multiple of one. Multiples are the same as full vectors for all
158 // targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
159 // value of type promotion and demotion.
160 template <typename T, int kPow2 = 0>
162 
163 // Alias for a tag describing a vector with *up to* kLimit active lanes, even on
164 // targets with scalable vectors and HWY_SCALAR. The runtime lane count
165 // `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
166 // typically used for 1D loops with a relatively low application-defined upper
167 // bound, e.g. for 8x8 DCTs. However, it is better if data structures are
168 // designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
169 // chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
170 // this would enable vector-length-agnostic loops using ScalableTag).
171 template <typename T, size_t kLimit>
173 
174 // Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
175 // even on targets with scalable vectors. Requires `kNumLanes` to be a power of
176 // two not exceeding `HWY_LANES(T)`.
177 //
178 // NOTE: if the application does not need to support HWY_SCALAR (+), use this
179 // instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
180 // This is useful for data structures that rely on exactly 128-bit SIMD, but
181 // these are discouraged because they cannot benefit from wider vectors.
182 // Instead, applications would ideally define a larger problem size and loop
183 // over it with the (unknown size) vectors from ScalableTag.
184 //
185 // + e.g. if the baseline is known to support SIMD, or the application requires
186 // ops such as TableLookupBytes not supported by HWY_SCALAR.
187 template <typename T, size_t kNumLanes>
189 
// Lane type of the tag D, i.e. its nested `T`.
template <class D>
using TFromD = typename D::T;
192 
// Tag for the same number of lanes as D, but with the LaneType T.
// Forwards to the member alias template D::Rebind.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
196 
197 template <class D>
199 template <class D>
201 template <class D>
203 
// Tag for the same total size as D, but with the LaneType T.
// Forwards to the member alias template D::Repartition.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
207 
208 template <class D>
210 template <class D>
212 
// Tag for the same lane type as D, but half the lanes.
// Forwards to the member alias D::Half.
template <class D>
using Half = typename D::Half;
216 
// Tag for the same lane type as D, but twice the lanes.
// Forwards to the member alias D::Twice.
template <class D>
using Twice = typename D::Twice;
220 
221 template <typename T>
222 using Full32 = Simd<T, 4 / sizeof(T), 0>;
223 
224 template <typename T>
225 using Full64 = Simd<T, 8 / sizeof(T), 0>;
226 
227 template <typename T>
228 using Full128 = Simd<T, 16 / sizeof(T), 0>;
229 
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)

// MSVC workaround: use PrivateN directly instead of MaxLanes.
// These compare D::kPrivateN * sizeof(lane) against 16 bytes (128 bits).
#define HWY_IF_LT128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
#define HWY_IF_GE128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr

// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
250 
251 template <class D>
252 HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
253  return D::kPrivatePow2;
254 }
255 
// Template-parameter constraint that is satisfied only if the tag D has a
// kPow2 of at least MIN. MSVC requires the explicit <D>.
#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
258 
259 #if HWY_HAVE_SCALABLE
260 
261 // Upper bound on the number of lanes. Intended for template arguments and
262 // reducing code size (e.g. for SSE4, we know at compile-time that vectors will
263 // not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
264 // actual size for allocating storage. WARNING: MSVC might not be able to deduce
265 // arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
266 template <class D>
267 HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
268  return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
269  D::kPrivatePow2);
270 }
271 
272 #else
273 // Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
274 // is not an option, nor does a member function work.
275 template <class D>
276 HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
277  return D::kPrivateN;
278 }
279 
280 // (Potentially) non-constant actual size of the vector at runtime, subject to
281 // the limit imposed by the Simd. Useful for advancing loop counters.
282 // Targets with scalable vectors define this themselves.
283 template <typename T, size_t N, int kPow2>
285  return N;
286 }
287 
288 #endif // !HWY_HAVE_SCALABLE
289 
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else
template <class V>
using VecArg = V;
#endif
308 
309 // NOLINTNEXTLINE(google-readability-namespace-comments)
310 } // namespace HWY_NAMESPACE
311 } // namespace hwy
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_MAYBE_UNUSED
Definition: base.h:73
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:111
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition: ops/shared-inl.h:103
V VecArg
Definition: ops/shared-inl.h:306
constexpr HWY_API size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_INLINE constexpr HWY_MAYBE_UNUSED int Pow2(D)
Definition: ops/shared-inl.h:252
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: ops/shared-inl.h:172
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: ops/shared-inl.h:202
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:211
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:161
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_INLINE constexpr HWY_MAYBE_UNUSED size_t MaxLanes(D)
Definition: ops/shared-inl.h:276
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition: ops/shared-inl.h:188
typename D::Half Half
Definition: ops/shared-inl.h:215
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
N
Definition: rvv-inl.h:1742
typename D::T TFromD
Definition: ops/shared-inl.h:191
Definition: aligned_allocator.h:27
constexpr size_t FloorLog2(TI x)
Definition: base.h:770
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_LANES(T)
Definition: set_macros-inl.h:85
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: ops/shared-inl.h:40
constexpr Simd()=default
Simd< NewT, N, kPow2 > Rebind
Definition: ops/shared-inl.h:75
static constexpr size_t NewN()
Definition: ops/shared-inl.h:52
static constexpr int kPrivatePow2
Definition: ops/shared-inl.h:49
static constexpr size_t kPrivateN
Definition: ops/shared-inl.h:48
Lane T
Definition: ops/shared-inl.h:42
Definition: ops/shared-inl.h:138
static constexpr size_t kLimitPow2
Definition: ops/shared-inl.h:142
Definition: ops/shared-inl.h:147
Definition: ops/shared-inl.h:121