Grok  10.0.3
transform-inl.h
Go to the documentation of this file.
1 // Copyright 2022 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Per-target include guard
17 #if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
18  defined(HWY_TARGET_TOGGLE)
19 #ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
20 #undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
21 #else
22 #define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
23 #endif
24 
25 #include "hwy/highway.h"
26 
28 namespace hwy {
29 namespace HWY_NAMESPACE {
30 
31 // These functions avoid having to write a loop plus remainder handling in the
32 // (unfortunately still common) case where arrays are not aligned/padded. If the
33 // inputs are known to be aligned/padded, it is more efficient to write a single
34 // loop using Load(). We do not provide a TransformAlignedPadded because it
35 // would be more verbose than such a loop.
36 //
37 // Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
38 // generic lambda if using C++14. Due to apparent limitations of Clang on
39 // Windows, it is currently necessary to add HWY_ATTR before the opening { of
40 // the lambda to avoid errors about "always_inline function .. requires target".
41 //
42 // If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
43 // we use `MaskedLoad` and `BlendedStore` to read/write the final partial
44 // vector.
45 
46 // Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
47 // where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
48 // the value of its lane i is i, and increases by `Lanes(d)` after every call.
49 // Note that some of these indices may be `>= count`, but the elements that
50 // `func` returns in those lanes will not be written to `out`.
51 template <class D, class Func, typename T = TFromD<D>>
52 void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
53  const RebindToUnsigned<D> du;
54  using TU = TFromD<decltype(du)>;
55  const size_t N = Lanes(d);
56 
57  size_t idx = 0;
58  Vec<decltype(du)> vidx = Iota(du, 0);
59  for (; idx + N <= count; idx += N) {
60  StoreU(func(d, vidx), d, out + idx);
61  vidx = Add(vidx, Set(du, static_cast<TU>(N)));
62  }
63 
64  // `count` was a multiple of the vector length `N`: already done.
65  if (HWY_UNLIKELY(idx == count)) return;
66 
67 #if HWY_MEM_OPS_MIGHT_FAULT
68  // Proceed one by one.
69  const CappedTag<T, 1> d1;
70  const RebindToUnsigned<decltype(d1)> du1;
71  for (; idx < count; ++idx) {
72  StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
73  }
74 #else
75  const size_t remaining = count - idx;
76  HWY_DASSERT(0 != remaining && remaining < N);
77  const Mask<D> mask = FirstN(d, remaining);
78  BlendedStore(func(d, vidx), mask, d, out + idx);
79 #endif
80 }
81 
82 // Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
83 // array elements by a constant.
84 template <class D, class Func, typename T = TFromD<D>>
85 void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
86  const size_t N = Lanes(d);
87 
88  size_t idx = 0;
89  for (; idx + N <= count; idx += N) {
90  const Vec<D> v = LoadU(d, inout + idx);
91  StoreU(func(d, v), d, inout + idx);
92  }
93 
94  // `count` was a multiple of the vector length `N`: already done.
95  if (HWY_UNLIKELY(idx == count)) return;
96 
97 #if HWY_MEM_OPS_MIGHT_FAULT
98  // Proceed one by one.
99  const CappedTag<T, 1> d1;
100  for (; idx < count; ++idx) {
101  using V1 = Vec<decltype(d1)>;
102  const V1 v = LoadU(d1, inout + idx);
103  StoreU(func(d1, v), d1, inout + idx);
104  }
105 #else
106  const size_t remaining = count - idx;
107  HWY_DASSERT(0 != remaining && remaining < N);
108  const Mask<D> mask = FirstN(d, remaining);
109  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
110  BlendedStore(func(d, v), mask, d, inout + idx);
111 #endif
112 }
113 
114 // Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
115 // multiplying array elements by those of another array.
116 template <class D, class Func, typename T = TFromD<D>>
117 void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
118  const T* HWY_RESTRICT in1, const Func& func) {
119  const size_t N = Lanes(d);
120 
121  size_t idx = 0;
122  for (; idx + N <= count; idx += N) {
123  const Vec<D> v = LoadU(d, inout + idx);
124  const Vec<D> v1 = LoadU(d, in1 + idx);
125  StoreU(func(d, v, v1), d, inout + idx);
126  }
127 
128  // `count` was a multiple of the vector length `N`: already done.
129  if (HWY_UNLIKELY(idx == count)) return;
130 
131 #if HWY_MEM_OPS_MIGHT_FAULT
132  // Proceed one by one.
133  const CappedTag<T, 1> d1;
134  for (; idx < count; ++idx) {
135  using V1 = Vec<decltype(d1)>;
136  const V1 v = LoadU(d1, inout + idx);
137  const V1 v1 = LoadU(d1, in1 + idx);
138  StoreU(func(d1, v, v1), d1, inout + idx);
139  }
140 #else
141  const size_t remaining = count - idx;
142  HWY_DASSERT(0 != remaining && remaining < N);
143  const Mask<D> mask = FirstN(d, remaining);
144  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
145  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
146  BlendedStore(func(d, v, v1), mask, d, inout + idx);
147 #endif
148 }
149 
150 // Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
151 // usage: FMA of elements from three arrays, stored into the first array.
152 template <class D, class Func, typename T = TFromD<D>>
153 void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
154  const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
155  const Func& func) {
156  const size_t N = Lanes(d);
157 
158  size_t idx = 0;
159  for (; idx + N <= count; idx += N) {
160  const Vec<D> v = LoadU(d, inout + idx);
161  const Vec<D> v1 = LoadU(d, in1 + idx);
162  const Vec<D> v2 = LoadU(d, in2 + idx);
163  StoreU(func(d, v, v1, v2), d, inout + idx);
164  }
165 
166  // `count` was a multiple of the vector length `N`: already done.
167  if (HWY_UNLIKELY(idx == count)) return;
168 
169 #if HWY_MEM_OPS_MIGHT_FAULT
170  // Proceed one by one.
171  const CappedTag<T, 1> d1;
172  for (; idx < count; ++idx) {
173  using V1 = Vec<decltype(d1)>;
174  const V1 v = LoadU(d1, inout + idx);
175  const V1 v1 = LoadU(d1, in1 + idx);
176  const V1 v2 = LoadU(d1, in2 + idx);
177  StoreU(func(d1, v, v1, v2), d1, inout + idx);
178  }
179 #else
180  const size_t remaining = count - idx;
181  HWY_DASSERT(0 != remaining && remaining < N);
182  const Mask<D> mask = FirstN(d, remaining);
183  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
184  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
185  const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
186  BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
187 #endif
188 }
189 
190 template <class D, typename T = TFromD<D>>
191 void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
192  const size_t N = Lanes(d);
193  const Vec<D> old_v = Set(d, old_t);
194  const Vec<D> new_v = Set(d, new_t);
195 
196  size_t idx = 0;
197  for (; idx + N <= count; idx += N) {
198  Vec<D> v = LoadU(d, inout + idx);
199  StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
200  }
201 
202  // `count` was a multiple of the vector length `N`: already done.
203  if (HWY_UNLIKELY(idx == count)) return;
204 
205 #if HWY_MEM_OPS_MIGHT_FAULT
206  // Proceed one by one.
207  const CappedTag<T, 1> d1;
208  const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
209  const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
210  for (; idx < count; ++idx) {
211  using V1 = Vec<decltype(d1)>;
212  const V1 v1 = LoadU(d1, inout + idx);
213  StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
214  }
215 #else
216  const size_t remaining = count - idx;
217  HWY_DASSERT(0 != remaining && remaining < N);
218  const Mask<D> mask = FirstN(d, remaining);
219  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
220  BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
221 #endif
222 }
223 
224 template <class D, class Func, typename T = TFromD<D>>
225 void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
226  const Func& func) {
227  const size_t N = Lanes(d);
228  const Vec<D> new_v = Set(d, new_t);
229 
230  size_t idx = 0;
231  for (; idx + N <= count; idx += N) {
232  Vec<D> v = LoadU(d, inout + idx);
233  StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
234  }
235 
236  // `count` was a multiple of the vector length `N`: already done.
237  if (HWY_UNLIKELY(idx == count)) return;
238 
239 #if HWY_MEM_OPS_MIGHT_FAULT
240  // Proceed one by one.
241  const CappedTag<T, 1> d1;
242  const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
243  for (; idx < count; ++idx) {
244  using V1 = Vec<decltype(d1)>;
245  const V1 v = LoadU(d1, inout + idx);
246  StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
247  }
248 #else
249  const size_t remaining = count - idx;
250  HWY_DASSERT(0 != remaining && remaining < N);
251  const Mask<D> mask = FirstN(d, remaining);
252  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
253  BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
254 #endif
255 }
256 
257 // NOLINTNEXTLINE(google-readability-namespace-comments)
258 } // namespace HWY_NAMESPACE
259 } // namespace hwy
261 
262 #endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_UNLIKELY(expr)
Definition: base.h:67
d
Definition: rvv-inl.h:1742
void Generate(D d, T *HWY_RESTRICT out, size_t count, const Func &func)
Definition: transform-inl.h:52
void ReplaceIf(D d, T *HWY_RESTRICT inout, size_t count, T new_t, const Func &func)
Definition: transform-inl.h:225
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
constexpr HWY_API size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: ops/shared-inl.h:172
void Transform2(D d, T *HWY_RESTRICT inout, size_t count, const T *HWY_RESTRICT in1, const T *HWY_RESTRICT in2, const Func &func)
Definition: transform-inl.h:153
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
void Replace(D d, T *HWY_RESTRICT inout, size_t count, T new_t, T old_t)
Definition: transform-inl.h:191
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
void Transform(D d, T *HWY_RESTRICT inout, size_t count, const Func &func)
Definition: transform-inl.h:85
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
decltype(MaskFromVec(Zero(D()))) Mask
Definition: generic_ops-inl.h:38
N
Definition: rvv-inl.h:1742
void Transform1(D d, T *HWY_RESTRICT inout, size_t count, const T *HWY_RESTRICT in1, const Func &func)
Definition: transform-inl.h:117
const vfloat64m1_t v
Definition: rvv-inl.h:1742
typename D::T TFromD
Definition: ops/shared-inl.h:191
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
Definition: aligned_allocator.h:27
FuncOutput(*)(const void *, FuncInput) Func
Definition: nanobenchmark.h:105
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()