generic_ops-inl.h
1 // Copyright 2021 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Target-independent types/functions defined after target-specific ops.
17 
18 // Relies on the external include guard in highway.h.
19 HWY_BEFORE_NAMESPACE();
20 namespace hwy {
21 namespace HWY_NAMESPACE {
22 
23 // The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
24 template <class V>
25 using LaneType = decltype(GetLane(V()));
26 
27 // Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
28 // type of functions that do not take a vector argument, or as an argument type
29 // if the function only has a template argument for D, or for explicit type
30 // names instead of auto. This may be a built-in type.
31 template <class D>
32 using Vec = decltype(Zero(D()));
33 
34 // Mask type. Useful as the return type of functions that do not take a mask
35 // argument, or as an argument type if the function only has a template argument
36 // for D, or for explicit type names instead of auto.
37 template <class D>
38 using Mask = decltype(MaskFromVec(Zero(D())));
39 
40 // Returns the closest value to v within [lo, hi].
41 template <class V>
42 HWY_API V Clamp(const V v, const V lo, const V hi) {
43  return Min(Max(lo, v), hi);
44 }
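// Usage sketch (illustrative, not part of this header): clamps every lane of
// an array to [0, 1]. Assumes it is compiled between HWY_BEFORE_NAMESPACE()
// and HWY_AFTER_NAMESPACE() inside HWY_NAMESPACE, and that `count` is a
// multiple of Lanes(d); the helper name is hypothetical.
void ClampTo01(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
               size_t count) {
  const ScalableTag<float> d;
  const auto lo = Zero(d);
  const auto hi = Set(d, 1.0f);
  for (size_t i = 0; i < count; i += Lanes(d)) {
    StoreU(Clamp(LoadU(d, in + i), lo, hi), d, out + i);
  }
}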
45 
46 // CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
47 // and RVV has its own implementation of -Lanes.
48 #if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
49 
50 template <size_t kLanes, class D, class V = VFromD<D>>
51 HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
52  constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
53  static_assert(kBytes < 16, "Shift count is per-block");
54  return CombineShiftRightBytes<kBytes>(d, hi, lo);
55 }
56 
57 #endif
58 
59 // Returns lanes with the most significant bit set and all other bits zero.
60 template <class D>
61 HWY_API Vec<D> SignBit(D d) {
62  const RebindToUnsigned<decltype(d)> du;
63  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
64 }
65 
66 // Returns quiet NaN.
67 template <class D>
68 HWY_API Vec<D> NaN(D d) {
69  const RebindToSigned<D> di;
70  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
71  // mantissa MSB (to indicate quiet) would be sufficient.
72  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
73 }
74 
75 // Returns positive infinity.
76 template <class D>
77 HWY_API Vec<D> Inf(D d) {
78  const RebindToUnsigned<D> du;
79  using T = TFromD<D>;
80  using TU = TFromD<decltype(du)>;
81  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
82  return BitCast(d, Set(du, max_x2 >> 1));
83 }
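// Usage sketch (illustrative, not part of this header): SignBit enables
// branch-free sign manipulation, e.g. absolute value via bit masking, while
// NaN(d) and Inf(d) are convenient sentinels for initializing reductions.
template <class D>
Vec<D> AbsViaSignBit(D d, Vec<D> v) {
  return AndNot(SignBit(d), v);  // clear the sign bit of every lane
}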
84 
85 // ------------------------------ SafeFillN
86 
87 template <class D, typename T = TFromD<D>>
88 HWY_API void SafeFillN(const size_t num, const T value, D d,
89  T* HWY_RESTRICT to) {
90 #if HWY_MEM_OPS_MIGHT_FAULT
91  (void)d;
92  for (size_t i = 0; i < num; ++i) {
93  to[i] = value;
94  }
95 #else
96  BlendedStore(Set(d, value), FirstN(d, num), d, to);
97 #endif
98 }
99 
100 // ------------------------------ SafeCopyN
101 
102 template <class D, typename T = TFromD<D>>
103 HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
104  T* HWY_RESTRICT to) {
105 #if HWY_MEM_OPS_MIGHT_FAULT
106  (void)d;
107  for (size_t i = 0; i < num; ++i) {
108  to[i] = from[i];
109  }
110 #else
111  const Mask<D> mask = FirstN(d, num);
112  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
113 #endif
114 }
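// Usage sketch (illustrative, not part of this header): copies `count`
// elements by whole vectors and uses SafeCopyN for the 0..Lanes(d)-1
// remainder, so nothing is read or written past the end of either array.
void CopyWithTail(const float* HWY_RESTRICT from, float* HWY_RESTRICT to,
                  size_t count) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    StoreU(LoadU(d, from + i), d, to + i);
  }
  SafeCopyN(count - i, d, from + i, to + i);  // remaining tail, possibly empty
}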
115 
116 // "Include guard": skip if native instructions are available. The generic
117 // implementation is currently shared between x86_* and wasm_*, and is too large
118 // to duplicate.
119 
120 #if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
121 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
122 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
123 #else
124 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
125 #endif
126 
127 // ------------------------------ LoadInterleaved2
128 
129 template <typename T, size_t N, class V>
130 HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
131  V& v0, V& v1) {
132  const V A = LoadU(d, unaligned + 0 * N); // v1[1] v0[1] v1[0] v0[0]
133  const V B = LoadU(d, unaligned + 1 * N);
134  v0 = ConcatEven(d, B, A);
135  v1 = ConcatOdd(d, B, A);
136 }
137 
138 template <typename T, class V>
139 HWY_API void LoadInterleaved2(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
140  V& v0, V& v1) {
141  v0 = LoadU(d, unaligned + 0);
142  v1 = LoadU(d, unaligned + 1);
143 }
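// Usage sketch (illustrative, not part of this header): de-interleaves
// complex numbers stored as re,im pairs into separate planes. Assumes `count`
// (the number of complex values) is a multiple of Lanes(d).
void SplitComplex(const float* HWY_RESTRICT interleaved,
                  float* HWY_RESTRICT re, float* HWY_RESTRICT im,
                  size_t count) {
  const ScalableTag<float> d;
  Vec<ScalableTag<float>> vre, vim;
  for (size_t i = 0; i < count; i += Lanes(d)) {
    LoadInterleaved2(d, interleaved + 2 * i, vre, vim);
    StoreU(vre, d, re + i);
    StoreU(vim, d, im + i);
  }
}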
144 
145 // ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
146 
147 namespace detail {
148 
149 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
150 template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
151 HWY_API void LoadTransposedBlocks3(Simd<T, N, 0> d,
152  const T* HWY_RESTRICT unaligned, V& A, V& B,
153  V& C) {
154  A = LoadU(d, unaligned + 0 * N);
155  B = LoadU(d, unaligned + 1 * N);
156  C = LoadU(d, unaligned + 2 * N);
157 }
158 
159 } // namespace detail
160 
161 template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
162 HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
163  V& v0, V& v1, V& v2) {
164  const RebindToUnsigned<decltype(d)> du;
165  // Compact notation so these fit on one line: 12 := v1[2].
166  V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
167  V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
168  V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
169  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
170  // Compress all lanes belonging to v0 into consecutive lanes.
171  constexpr uint8_t Z = 0x80;
172  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
173  Z, Z, Z, Z, Z, Z, Z, Z};
174  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5,
175  8, 11, 14, Z, Z, Z, Z, Z};
176  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
177  Z, Z, Z, 1, 4, 7, 10, 13};
178  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
179  Z, Z, Z, Z, Z, Z, Z, Z};
180  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6,
181  9, 12, 15, Z, Z, Z, Z, Z};
182  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
183  Z, Z, Z, 2, 5, 8, 11, 14};
184  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
185  Z, Z, Z, Z, Z, Z, Z, Z};
186  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7,
187  10, 13, Z, Z, Z, Z, Z, Z};
188  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
189  Z, Z, 0, 3, 6, 9, 12, 15};
190  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
191  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
192  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
193  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
194  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
195  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
196  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
197  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
198  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
199  v0 = Or3(v0L, v0M, v0U);
200  v1 = Or3(v1L, v1M, v1U);
201  v2 = Or3(v2L, v2M, v2U);
202 }
203 
204 // 8-bit lanes x8
205 template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
206  HWY_IF_LANES_PER_BLOCK(T, N, 8)>
207 HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
208  V& v0, V& v1, V& v2) {
209  const RebindToUnsigned<decltype(d)> du;
210  V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
211  V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
212  V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
213  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
214  // Compress all lanes belonging to v0 into consecutive lanes.
215  constexpr uint8_t Z = 0x80;
216  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
217  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
218  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
219  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
220  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
221  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
222  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
223  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
224  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
225  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
226  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
227  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
228  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
229  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
230  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
231  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
232  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
233  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
234  v0 = Or3(v0L, v0M, v0U);
235  v1 = Or3(v1L, v1M, v1U);
236  v2 = Or3(v2L, v2M, v2U);
237 }
238 
239 // 16-bit lanes x8
240 template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
241  HWY_IF_LANES_PER_BLOCK(T, N, 8)>
242 HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
243  V& v0, V& v1, V& v2) {
244  const RebindToUnsigned<decltype(d)> du;
245  V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
246  V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
247  V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
248  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
249  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
250  // but each element of the array contains two byte indices for a lane.
251  constexpr uint16_t Z = 0x8080;
252  alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
253  Z, Z, Z, Z};
254  alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302,
255  0x0908, 0x0F0E, Z, Z};
256  alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z,
257  Z, Z, 0x0504, 0x0B0A};
258  alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
259  Z, Z, Z, Z};
260  alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504,
261  0x0B0A, Z, Z, Z};
262  alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z,
263  Z, 0x0100, 0x0706, 0x0D0C};
264  alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
265  Z, Z, Z, Z};
266  alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706,
267  0x0D0C, Z, Z, Z};
268  alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z,
269  Z, 0x0302, 0x0908, 0x0F0E};
270  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
271  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
272  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
273  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
274  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
275  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
276  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
277  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
278  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
279  v0 = Or3(v0L, v0M, v0U);
280  v1 = Or3(v1L, v1M, v1U);
281  v2 = Or3(v2L, v2M, v2U);
282 }
283 
284 template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
285 HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
286  V& v0, V& v1, V& v2) {
287  V A; // v0[1] v2[0] v1[0] v0[0]
288  V B; // v1[2] v0[2] v2[1] v1[1]
289  V C; // v2[3] v1[3] v0[3] v2[2]
290  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
291 
292  const V vxx_02_03_xx = OddEven(C, B);
293  v0 = detail::Shuffle1230(A, vxx_02_03_xx);
294 
295  // Shuffle2301 takes the upper/lower halves of the output from one input, so
296  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
297  // OddEven because it may have higher throughput than Shuffle.
298  const V vxx_xx_10_11 = OddEven(A, B);
299  const V v12_13_xx_xx = OddEven(B, C);
300  v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx);
301 
302  const V vxx_20_21_xx = OddEven(B, A);
303  v2 = detail::Shuffle3012(vxx_20_21_xx, C);
304 }
305 
306 template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
307 HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
308  V& v0, V& v1, V& v2) {
309  V A; // v1[0] v0[0]
310  V B; // v0[1] v2[0]
311  V C; // v2[1] v1[1]
312  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
313  v0 = OddEven(B, A);
314  v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
315  v2 = OddEven(C, B);
316 }
317 
318 template <typename T, class V>
319 HWY_API void LoadInterleaved3(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
320  V& v0, V& v1, V& v2) {
321  v0 = LoadU(d, unaligned + 0);
322  v1 = LoadU(d, unaligned + 1);
323  v2 = LoadU(d, unaligned + 2);
324 }
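// Usage sketch (illustrative, not part of this header): splits packed RGB
// bytes into planar channels. Assumes `pixels` is a multiple of Lanes(d).
void DeinterleaveRGB(const uint8_t* HWY_RESTRICT rgb, uint8_t* HWY_RESTRICT r,
                     uint8_t* HWY_RESTRICT g, uint8_t* HWY_RESTRICT b,
                     size_t pixels) {
  const ScalableTag<uint8_t> d;
  Vec<ScalableTag<uint8_t>> vr, vg, vb;
  for (size_t i = 0; i < pixels; i += Lanes(d)) {
    LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    StoreU(vr, d, r + i);
    StoreU(vg, d, g + i);
    StoreU(vb, d, b + i);
  }
}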
325 
326 // ------------------------------ LoadInterleaved4
327 
328 namespace detail {
329 
330 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
331 template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
332 HWY_API void LoadTransposedBlocks4(Simd<T, N, 0> d,
333  const T* HWY_RESTRICT unaligned, V& A, V& B,
334  V& C, V& D) {
335  A = LoadU(d, unaligned + 0 * N);
336  B = LoadU(d, unaligned + 1 * N);
337  C = LoadU(d, unaligned + 2 * N);
338  D = LoadU(d, unaligned + 3 * N);
339 }
340 
341 } // namespace detail
342 
343 template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
344 HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
345  V& v0, V& v1, V& v2, V& v3) {
346  const Repartition<uint64_t, decltype(d)> d64;
347  using V64 = VFromD<decltype(d64)>;
348  // 16 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
349  // Here int[i] means the four interleaved values of the i-th 4-tuple and
350  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
351  V A; // int[13..10] int[3..0]
352  V B; // int[17..14] int[7..4]
353  V C; // int[1b..18] int[b..8]
354  V D; // int[1f..1c] int[f..c]
355  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
356 
357  // For brevity, the comments only list the lower block (upper = lower + 0x10)
358  const V v5140 = InterleaveLower(d, A, B); // int[5,1,4,0]
359  const V vd9c8 = InterleaveLower(d, C, D); // int[d,9,c,8]
360  const V v7362 = InterleaveUpper(d, A, B); // int[7,3,6,2]
361  const V vfbea = InterleaveUpper(d, C, D); // int[f,b,e,a]
362 
363  const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0]
364  const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8]
365  const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1]
366  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9]
367 
368  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0]
369  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8]
370  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0]
371  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8]
372 
373  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
374  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
375  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
376  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
377 }
378 
379 template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>
380 HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
381  V& v0, V& v1, V& v2, V& v3) {
382  // In the last step, we interleave by half of the block size, which is usually
383  // 8 bytes but half that for 8-bit x8 vectors.
384  using TW = hwy::UnsignedFromSize<sizeof(T) * N == 8 ? 4 : 8>;
385  const Repartition<TW, decltype(d)> dw;
386  using VW = VFromD<decltype(dw)>;
387 
388  // (Comments are for 256-bit vectors.)
389  // 8 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
390  V A; // v3210[9]v3210[8] v3210[1]v3210[0]
391  V B; // v3210[b]v3210[a] v3210[3]v3210[2]
392  V C; // v3210[d]v3210[c] v3210[5]v3210[4]
393  V D; // v3210[f]v3210[e] v3210[7]v3210[6]
394  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
395 
396  const V va820 = InterleaveLower(d, A, B); // v3210[a,8] v3210[2,0]
397  const V vec64 = InterleaveLower(d, C, D); // v3210[e,c] v3210[6,4]
398  const V vb931 = InterleaveUpper(d, A, B); // v3210[b,9] v3210[3,1]
399  const V vfd75 = InterleaveUpper(d, C, D); // v3210[f,d] v3210[7,5]
400 
401  const VW v10_b830 = // v10[b..8] v10[3..0]
402  BitCast(dw, InterleaveLower(d, va820, vb931));
403  const VW v10_fc74 = // v10[f..c] v10[7..4]
404  BitCast(dw, InterleaveLower(d, vec64, vfd75));
405  const VW v32_b830 = // v32[b..8] v32[3..0]
406  BitCast(dw, InterleaveUpper(d, va820, vb931));
407  const VW v32_fc74 = // v32[f..c] v32[7..4]
408  BitCast(dw, InterleaveUpper(d, vec64, vfd75));
409 
410  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
411  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
412  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
413  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
414 }
415 
416 template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
417 HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
418  V& v0, V& v1, V& v2, V& v3) {
419  V A; // v3210[4] v3210[0]
420  V B; // v3210[5] v3210[1]
421  V C; // v3210[6] v3210[2]
422  V D; // v3210[7] v3210[3]
423  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
424  const V v10_ev = InterleaveLower(d, A, C); // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
425  const V v10_od = InterleaveLower(d, B, D); // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
426  const V v32_ev = InterleaveUpper(d, A, C); // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
427  const V v32_od = InterleaveUpper(d, B, D); // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
428 
429  v0 = InterleaveLower(d, v10_ev, v10_od);
430  v1 = InterleaveUpper(d, v10_ev, v10_od);
431  v2 = InterleaveLower(d, v32_ev, v32_od);
432  v3 = InterleaveUpper(d, v32_ev, v32_od);
433 }
434 
435 template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
436 HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
437  V& v0, V& v1, V& v2, V& v3) {
438  V A, B, C, D;
439  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
440  v0 = InterleaveLower(d, A, C);
441  v1 = InterleaveUpper(d, A, C);
442  v2 = InterleaveLower(d, B, D);
443  v3 = InterleaveUpper(d, B, D);
444 }
445 
446 // Any T x1
447 template <typename T, class V>
448 HWY_API void LoadInterleaved4(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
449  V& v0, V& v1, V& v2, V& v3) {
450  v0 = LoadU(d, unaligned + 0);
451  v1 = LoadU(d, unaligned + 1);
452  v2 = LoadU(d, unaligned + 2);
453  v3 = LoadU(d, unaligned + 3);
454 }
455 
456 // ------------------------------ StoreInterleaved2
457 
458 namespace detail {
459 
460 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
461 template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
462 HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd<T, N, 0> d,
463  T* HWY_RESTRICT unaligned) {
464  StoreU(A, d, unaligned + 0 * N);
465  StoreU(B, d, unaligned + 1 * N);
466 }
467 
468 } // namespace detail
469 
470 // >= 128 bit vector
471 template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
472 HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
473  T* HWY_RESTRICT unaligned) {
474  const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0]
475  const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[N/2] v0[N/2]
476  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
477 }
478 
479 // 64 bits
480 template <typename T>
481 HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
482  Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
483  // Use full vectors to reduce the number of stores.
484  const Full128<T> d_full;
485  const Vec128<T> v0{part0.raw};
486  const Vec128<T> v1{part1.raw};
487  const auto v10 = InterleaveLower(d_full, v0, v1);
488  StoreU(v10, d_full, unaligned);
489 }
490 
491 // <= 32 bits
492 template <typename T, size_t N, HWY_IF_LE32(T, N)>
493 HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
494  const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
495  T* HWY_RESTRICT unaligned) {
496  // Use full vectors to reduce the number of stores.
497  const Full128<T> d_full;
498  const Vec128<T> v0{part0.raw};
499  const Vec128<T> v1{part1.raw};
500  const auto v10 = InterleaveLower(d_full, v0, v1);
501  alignas(16) T buf[16 / sizeof(T)];
502  StoreU(v10, d_full, buf);
503  CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
504 }
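// Usage sketch (illustrative, not part of this header): the inverse of
// LoadInterleaved2, packing separate re/im planes into interleaved pairs.
// Assumes `count` is a multiple of Lanes(d).
void MergeComplex(const float* HWY_RESTRICT re, const float* HWY_RESTRICT im,
                  float* HWY_RESTRICT interleaved, size_t count) {
  const ScalableTag<float> d;
  for (size_t i = 0; i < count; i += Lanes(d)) {
    StoreInterleaved2(LoadU(d, re + i), LoadU(d, im + i), d,
                      interleaved + 2 * i);
  }
}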
505 
506 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
507 // TableLookupBytes)
508 
509 namespace detail {
510 
511 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
512 template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
513 HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C,
514  Simd<T, N, 0> d,
515  T* HWY_RESTRICT unaligned) {
516  StoreU(A, d, unaligned + 0 * N);
517  StoreU(B, d, unaligned + 1 * N);
518  StoreU(C, d, unaligned + 2 * N);
519 }
520 
521 } // namespace detail
522 
523 // >= 128-bit vector, 8-bit lanes
524 template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
525  HWY_IF_GE128(T, N)>
526 HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
527  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
528  const RebindToUnsigned<decltype(d)> du;
529  const auto k5 = Set(du, 5);
530  const auto k6 = Set(du, 6);
531 
532  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
533  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
534  // to their place, with 0x80 so lanes to be filled from other vectors are 0
535  // to enable blending by ORing together.
536  alignas(16) static constexpr uint8_t tbl_v0[16] = {
537  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
538  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
539  alignas(16) static constexpr uint8_t tbl_v1[16] = {
540  0x80, 0, 0x80, 0x80, 1, 0x80, //
541  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
542  // The interleaved vectors will be named A, B, C; temporaries with suffix
543  // 0..2 indicate which input vector's lanes they hold.
544  const auto shuf_A0 = LoadDup128(du, tbl_v0);
545  const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5)
546  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
547  const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
548  const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
549  const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
550  const V A = BitCast(d, A0 | A1 | A2);
551 
552  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
553  const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
554  const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
555  const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
556  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
557  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
558  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
559  const V B = BitCast(d, B0 | B1 | B2);
560 
561  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
562  const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
563  const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
564  const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
565  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
566  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
567  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
568  const V C = BitCast(d, C0 | C1 | C2);
569 
570  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
571 }
572 
573 // >= 128-bit vector, 16-bit lanes
574 template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
575  HWY_IF_GE128(T, N)>
576 HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
577  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
578  const Repartition<uint8_t, decltype(d)> du8;
579  const auto k2 = Set(du8, 2 * sizeof(T));
580  const auto k3 = Set(du8, 3 * sizeof(T));
581 
582  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
583  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
584  // filled from other vectors are 0 for blending. Note that these are byte
585  // indices for 16-bit lanes.
586  alignas(16) static constexpr uint8_t tbl_v1[16] = {
587  0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
588  2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
589  alignas(16) static constexpr uint8_t tbl_v2[16] = {
590  0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
591  0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
592 
593  // The interleaved vectors will be named A, B, C; temporaries with suffix
594  // 0..2 indicate which input vector's lanes they hold.
595  const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0.
596  // .2..1..0
597  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
598  const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0..
599 
600  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
601  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
602  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
603  const V A = BitCast(d, A0 | A1 | A2);
604 
605  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
606  const auto shuf_B0 = shuf_A1 + k3; // 5..4..3.
607  const auto shuf_B1 = shuf_A2 + k3; // ..4..3..
608  const auto shuf_B2 = shuf_A0 + k2; // .4..3..2
609  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
610  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
611  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
612  const V B = BitCast(d, B0 | B1 | B2);
613 
614  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
615  const auto shuf_C0 = shuf_B1 + k3; // ..7..6..
616  const auto shuf_C1 = shuf_B2 + k3; // .7..6..5
617  const auto shuf_C2 = shuf_B0 + k2; // 7..6..5.
618  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
619  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
620  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
621  const V C = BitCast(d, C0 | C1 | C2);
622 
623  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
624 }
625 
626 // >= 128-bit vector, 32-bit lanes
627 template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 4),
628  HWY_IF_GE128(T, N)>
629 HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
630  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
631  const RepartitionToWide<decltype(d)> dw;
632 
633  const V v10_v00 = InterleaveLower(d, v0, v1);
634  const V v01_v20 = OddEven(v0, v2);
635  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
636  const V A = BitCast(
637  d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
638 
639  const V v1_321 = ShiftRightLanes<1>(d, v1);
640  const V v0_32 = ShiftRightLanes<2>(d, v0);
641  const V v21_v11 = OddEven(v2, v1_321);
642  const V v12_v02 = OddEven(v1_321, v0_32);
643  // B: v1[2],v0[2], v2[1],v1[1]
644  const V B = BitCast(
645  d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
646 
647  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
648  const V v23_v13 = OddEven(v2, v1_321);
649  const V v03_v22 = OddEven(v0, v2);
650  // C: v2[3],v1[3],v0[3], v2[2]
651  const V C = BitCast(
652  d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
653 
654  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
655 }
656 
657 // >= 128-bit vector, 64-bit lanes
658 template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
659  HWY_IF_GE128(T, N)>
660 HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
661  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
662  const V A = InterleaveLower(d, v0, v1);
663  const V B = OddEven(v0, v2);
664  const V C = InterleaveUpper(d, v1, v2);
665  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
666 }
667 
668 // 64-bit vector, 8-bit lanes
669 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
670 HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
671  const Vec64<T> part2, Full64<T> d,
672  T* HWY_RESTRICT unaligned) {
673  constexpr size_t N = 16 / sizeof(T);
674  // Use full vectors for the shuffles and first result.
675  const Full128<uint8_t> du;
676  const Full128<T> d_full;
677  const auto k5 = Set(du, 5);
678  const auto k6 = Set(du, 6);
679 
680  const Vec128<T> v0{part0.raw};
681  const Vec128<T> v1{part1.raw};
682  const Vec128<T> v2{part2.raw};
683 
684  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
685  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
686  // filled from other vectors are 0 for blending.
687  alignas(16) static constexpr uint8_t tbl_v0[16] = {
688  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
689  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
690  alignas(16) static constexpr uint8_t tbl_v1[16] = {
691  0x80, 0, 0x80, 0x80, 1, 0x80, //
692  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
693  // The interleaved vectors will be named A, B, C; temporaries with suffix
694  // 0..2 indicate which input vector's lanes they hold.
695  const auto shuf_A0 = Load(du, tbl_v0);
696  const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
697  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
698  const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
699  const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
700  const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
701  const auto A = BitCast(d_full, A0 | A1 | A2);
702  StoreU(A, d_full, unaligned + 0 * N);
703 
704  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
705  const auto shuf_B0 = shuf_A2 + k6; // ..7..6..
706  const auto shuf_B1 = shuf_A0 + k5; // .7..6..5
707  const auto shuf_B2 = shuf_A1 + k5; // 7..6..5.
708  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
709  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
710  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
711  const Vec64<T> B{(B0 | B1 | B2).raw};
712  StoreU(B, d, unaligned + 1 * N);
713 }
714 
715 // 64-bit vector, 16-bit lanes
716 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
717 HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
718  const Vec64<T> part2, Full64<T> dh,
719  T* HWY_RESTRICT unaligned) {
720  const Full128<T> d;
721  const Full128<uint8_t> du8;
722  constexpr size_t N = 16 / sizeof(T);
723  const auto k2 = Set(du8, 2 * sizeof(T));
724  const auto k3 = Set(du8, 3 * sizeof(T));
725 
726  const Vec128<T> v0{part0.raw};
727  const Vec128<T> v1{part1.raw};
728  const Vec128<T> v2{part2.raw};
729 
730  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
731  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
732  // to their place, with 0x80 so lanes to be filled from other vectors are 0
733  // to enable blending by ORing together.
734  alignas(16) static constexpr uint8_t tbl_v1[16] = {
735  0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
736  2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
737  alignas(16) static constexpr uint8_t tbl_v2[16] = {
738  0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
739  0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
740 
741  // The interleaved vectors will be named A, B; temporaries with suffix
742  // 0..2 indicate which input vector's lanes they hold.
743  const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
744  // .2..1..0
745  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
746  const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0..
747 
748  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
749  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
750  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
751  const Vec128<T> A = BitCast(d, A0 | A1 | A2);
752  StoreU(A, d, unaligned + 0 * N);
753 
754  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
755  const auto shuf_B0 = shuf_A1 + k3; // ..3.
756  const auto shuf_B1 = shuf_A2 + k3; // .3..
757  const auto shuf_B2 = shuf_A0 + k2; // 3..2
758  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
759  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
760  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
761  const Vec128<T> B = BitCast(d, B0 | B1 | B2);
762  StoreU(Vec64<T>{B.raw}, dh, unaligned + 1 * N);
763 }
764 
765 // 64-bit vector, 32-bit lanes
766 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
767 HWY_API void StoreInterleaved3(const Vec64<T> v0, const Vec64<T> v1,
768  const Vec64<T> v2, Full64<T> d,
769  T* HWY_RESTRICT unaligned) {
770  // (same code as 128-bit vector, 64-bit lanes)
771  constexpr size_t N = 2;
772  const Vec64<T> v10_v00 = InterleaveLower(d, v0, v1);
773  const Vec64<T> v01_v20 = OddEven(v0, v2);
774  const Vec64<T> v21_v11 = InterleaveUpper(d, v1, v2);
775  StoreU(v10_v00, d, unaligned + 0 * N);
776  StoreU(v01_v20, d, unaligned + 1 * N);
777  StoreU(v21_v11, d, unaligned + 2 * N);
778 }
779 
780 // 64-bit lanes are handled by the N=1 case below.
781 
782 // <= 32-bit vector, 8-bit lanes
783 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>
784 HWY_API void StoreInterleaved3(const Vec128<T, N> part0,
785  const Vec128<T, N> part1,
786  const Vec128<T, N> part2, Simd<T, N, 0> /*tag*/,
787  T* HWY_RESTRICT unaligned) {
788  // Use full vectors for the shuffles and result.
789  const Full128<uint8_t> du;
790  const Full128<T> d_full;
791 
792  const Vec128<T> v0{part0.raw};
793  const Vec128<T> v1{part1.raw};
794  const Vec128<T> v2{part2.raw};
795 
796  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
797  // so lanes to be filled from other vectors are 0 to enable blending by ORing
798  // together.
799  alignas(16) static constexpr uint8_t tbl_v0[16] = {
800  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
801  0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
802  // The interleaved vector will be named A; temporaries with suffix
803  // 0..2 indicate which input vector's lanes they hold.
804  const auto shuf_A0 = Load(du, tbl_v0);
805  const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
806  const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
807  const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
808  const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
809  const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
810  const Vec128<T> A = BitCast(d_full, A0 | A1 | A2);
811  alignas(16) T buf[16 / sizeof(T)];
812  StoreU(A, d_full, buf);
813  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
814 }
815 
816 // 32-bit vector, 16-bit lanes
817 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
818 HWY_API void StoreInterleaved3(const Vec128<T, 2> part0,
819  const Vec128<T, 2> part1,
820  const Vec128<T, 2> part2, Simd<T, 2, 0> /*tag*/,
821  T* HWY_RESTRICT unaligned) {
822  constexpr size_t N = 4 / sizeof(T);
823  // Use full vectors for the shuffles and result.
824  const Full128<uint8_t> du8;
825  const Full128<T> d_full;
826 
827  const Vec128<T> v0{part0.raw};
828  const Vec128<T> v1{part1.raw};
829  const Vec128<T> v2{part2.raw};
830 
831  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
832  // so lanes to be filled from other vectors are 0 to enable blending by ORing
833  // together.
834  alignas(16) static constexpr uint8_t tbl_v2[16] = {
835  0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
836  0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
837  // The interleaved vector will be named A; temporaries with suffix
838  // 0..2 indicate which input vector's lanes they hold.
839  const auto shuf_A2 = // ..1..0..
840  Load(du8, tbl_v2);
841  const auto shuf_A1 = // ...1..0.
842  CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
843  const auto shuf_A0 = // ....1..0
844  CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
845  const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
846  const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
847  const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
848  const auto A = BitCast(d_full, A0 | A1 | A2);
849  alignas(16) T buf[16 / sizeof(T)];
850  StoreU(A, d_full, buf);
851  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
852 }
853 
854 // Single-element vector, any lane size: just store directly
855 template <typename T>
856 HWY_API void StoreInterleaved3(const Vec128<T, 1> v0, const Vec128<T, 1> v1,
857  const Vec128<T, 1> v2, Simd<T, 1, 0> d,
858  T* HWY_RESTRICT unaligned) {
859  StoreU(v0, d, unaligned + 0);
860  StoreU(v1, d, unaligned + 1);
861  StoreU(v2, d, unaligned + 2);
862 }
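// Usage sketch (illustrative, not part of this header): swaps the R and B
// channels of packed RGB pixels in place by pairing LoadInterleaved3 with
// StoreInterleaved3. Assumes `pixels` is a multiple of Lanes(d).
void SwapRB(uint8_t* HWY_RESTRICT rgb, size_t pixels) {
  const ScalableTag<uint8_t> d;
  Vec<ScalableTag<uint8_t>> r, g, b;
  for (size_t i = 0; i < pixels; i += Lanes(d)) {
    LoadInterleaved3(d, rgb + 3 * i, r, g, b);
    StoreInterleaved3(b, g, r, d, rgb + 3 * i);  // store with R/B exchanged
  }
}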
863 
864 // ------------------------------ StoreInterleaved4
865 
866 namespace detail {
867 
868 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
869 template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
870 HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D,
871  Simd<T, N, 0> d,
872  T* HWY_RESTRICT unaligned) {
873  StoreU(A, d, unaligned + 0 * N);
874  StoreU(B, d, unaligned + 1 * N);
875  StoreU(C, d, unaligned + 2 * N);
876  StoreU(D, d, unaligned + 3 * N);
877 }
878 
879 } // namespace detail
880 
881 // >= 128-bit vector, 8..32-bit lanes
882 template <typename T, size_t N, class V, HWY_IF_NOT_LANE_SIZE(T, 8),
883  HWY_IF_GE128(T, N)>
884 HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
885  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
886  const RepartitionToWide<decltype(d)> dw;
887  const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
888  const auto v32L = ZipLower(dw, v2, v3);
889  const auto v10U = ZipUpper(dw, v0, v1);
890  const auto v32U = ZipUpper(dw, v2, v3);
891  // The interleaved vectors are A, B, C, D.
892  const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210
893  const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L));
894  const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U));
895  const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U));
896  detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
897 }
898 
899 // >= 128-bit vector, 64-bit lanes
900 template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
901  HWY_IF_GE128(T, N)>
902 HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
903  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
904  // The interleaved vectors are A, B, C, D.
905  const auto A = InterleaveLower(d, v0, v1); // v1[0] v0[0]
906  const auto B = InterleaveLower(d, v2, v3);
907  const auto C = InterleaveUpper(d, v0, v1);
908  const auto D = InterleaveUpper(d, v2, v3);
909  detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
910 }
911 
912 // 64-bit vector, 8..32-bit lanes
913 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
914 HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
915  const Vec64<T> part2, const Vec64<T> part3,
916  Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
917  constexpr size_t N = 16 / sizeof(T);
918  // Use full vectors to reduce the number of stores.
919  const Full128<T> d_full;
920  const RepartitionToWide<decltype(d_full)> dw;
921  const Vec128<T> v0{part0.raw};
922  const Vec128<T> v1{part1.raw};
923  const Vec128<T> v2{part2.raw};
924  const Vec128<T> v3{part3.raw};
925  const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0]
926  const auto v32 = ZipLower(dw, v2, v3);
927  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
928  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
929  StoreU(A, d_full, unaligned + 0 * N);
930  StoreU(B, d_full, unaligned + 1 * N);
931 }
932 
933 // 64-bit vector, 64-bit lane
934 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
935 HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
936  const Vec64<T> part2, const Vec64<T> part3,
937  Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
938  constexpr size_t N = 16 / sizeof(T);
939  // Use full vectors to reduce the number of stores.
940  const Full128<T> d_full;
941  const Vec128<T> v0{part0.raw};
942  const Vec128<T> v1{part1.raw};
943  const Vec128<T> v2{part2.raw};
944  const Vec128<T> v3{part3.raw};
945  const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0]
946  const auto B = InterleaveLower(d_full, v2, v3);
947  StoreU(A, d_full, unaligned + 0 * N);
948  StoreU(B, d_full, unaligned + 1 * N);
949 }
950 
951 // <= 32-bit vectors
952 template <typename T, size_t N, HWY_IF_LE32(T, N)>
953 HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
954  const Vec128<T, N> part1,
955  const Vec128<T, N> part2,
956  const Vec128<T, N> part3, Simd<T, N, 0> /*tag*/,
957  T* HWY_RESTRICT unaligned) {
958  // Use full vectors to reduce the number of stores.
959  const Full128<T> d_full;
960  const RepartitionToWide<decltype(d_full)> dw;
961  const Vec128<T> v0{part0.raw};
962  const Vec128<T> v1{part1.raw};
963  const Vec128<T> v2{part2.raw};
964  const Vec128<T> v3{part3.raw};
965  const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
966  const auto v32 = ZipLower(dw, v2, v3);
967  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
968  alignas(16) T buf[16 / sizeof(T)];
969  StoreU(v3210, d_full, buf);
970  CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
971 }
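// Usage sketch (illustrative, not part of this header): packs planar R, G, B
// plus a constant alpha into interleaved RGBA. Assumes `pixels` is a multiple
// of Lanes(d).
void PackRGBA(const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
              const uint8_t* HWY_RESTRICT b, uint8_t* HWY_RESTRICT rgba,
              size_t pixels) {
  const ScalableTag<uint8_t> d;
  const auto alpha = Set(d, uint8_t{0xFF});
  for (size_t i = 0; i < pixels; i += Lanes(d)) {
    StoreInterleaved4(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i), alpha,
                      d, rgba + 4 * i);
  }
}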
972 
973 #endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED
974 
975 // ------------------------------ AESRound
976 
977 // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
978 #if HWY_TARGET != HWY_SCALAR
979 
980 // Define for white-box testing, even if native instructions are available.
981 namespace detail {
982 
983 // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
984 // Vector Permute Instructions" and the accompanying assembly language
985 // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
986 // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
987 //
988 // A brute-force 256 byte table lookup can also be made constant-time, and
989 // possibly competitive on NEON, but this is more performance-portable
990 // especially for x86 and large vectors.
991 template <class V> // u8
992 HWY_INLINE V SubBytes(V state) {
993  const DFromV<V> du;
994  const auto mask = Set(du, 0xF);
995 
996  // Change polynomial basis to GF(2^4)
997  {
998  alignas(16) static constexpr uint8_t basisL[16] = {
999  0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
1000  0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
1001  alignas(16) static constexpr uint8_t basisU[16] = {
1002  0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
1003  0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
1004  const auto sL = And(state, mask);
1005  const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
1006  const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
1007  const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
1008  state = Xor(gf4L, gf4U);
1009  }
1010 
1011  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
1012  // cause TableLookupBytesOr0 to return 0.
1013  alignas(16) static constexpr uint8_t kZetaInv[16] = {
1014  0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
1015  alignas(16) static constexpr uint8_t kInv[16] = {
1016  0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
1017  const auto tbl = LoadDup128(du, kInv);
1018  const auto sL = And(state, mask); // L=low nibble, U=upper
1019  const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
1020  const auto sX = Xor(sU, sL);
1021  const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
1022  const auto invU = TableLookupBytes(tbl, sU);
1023  const auto invX = TableLookupBytes(tbl, sX);
1024  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
1025  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
1026 
1027  // Linear skew (cannot bake 0x63 bias into the table because out* indices
1028  // may have the infinity flag set).
1029  alignas(16) static constexpr uint8_t kAffineL[16] = {
1030  0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
1031  0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
1032  alignas(16) static constexpr uint8_t kAffineU[16] = {
1033  0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
1034  0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
1035  const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
1036  const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
1037  return Xor(Xor(affL, affU), Set(du, 0x63));
1038 }
1039 
1040 } // namespace detail
1041 
1042 #endif // HWY_TARGET != HWY_SCALAR
1043 
1044 // "Include guard": skip if native AES instructions are available.
1045 #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
1046 #ifdef HWY_NATIVE_AES
1047 #undef HWY_NATIVE_AES
1048 #else
1049 #define HWY_NATIVE_AES
1050 #endif
1051 
1052 // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
1053 #if HWY_TARGET != HWY_SCALAR
1054 
1055 namespace detail {
1056 
1057 template <class V> // u8
1058 HWY_API V ShiftRows(const V state) {
1059  const DFromV<V> du;
1060  alignas(16) static constexpr uint8_t kShiftRow[16] = {
1061  0, 5, 10, 15, // transposed: state is column major
1062  4, 9, 14, 3, //
1063  8, 13, 2, 7, //
1064  12, 1, 6, 11};
1065  const auto shift_row = LoadDup128(du, kShiftRow);
1066  return TableLookupBytes(state, shift_row);
1067 }
1068 
1069 template <class V> // u8
1070 HWY_API V MixColumns(const V state) {
1071  const DFromV<V> du;
1072  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
1073  // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3.
1074  // 1 2 3 1 // d are on diagonal, no permutation needed.
1075  // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
1076  // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
1077  alignas(16) static constexpr uint8_t k2301[16] = {
1078  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
1079  alignas(16) static constexpr uint8_t k1230[16] = {
1080  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
1081  const RebindToSigned<decltype(du)> di; // can only do signed comparisons
1082  const auto msb = Lt(BitCast(di, state), Zero(di));
1083  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
1084  const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8).
1085  const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
1086  const auto d_s2301 = Xor(d, s2301);
1087  const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
1088  const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
1089  return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
1090 }
1091 
1092 } // namespace detail
1093 
1094 template <class V> // u8
1095 HWY_API V AESRound(V state, const V round_key) {
1096  // Intel docs swap the first two steps, but it does not matter because
1097  // ShiftRows is a permutation and SubBytes is independent of lane index.
1098  state = detail::SubBytes(state);
1099  state = detail::ShiftRows(state);
1100  state = detail::MixColumns(state);
1101  state = Xor(state, round_key); // AddRoundKey
1102  return state;
1103 }
1104 
1105 template <class V> // u8
1106 HWY_API V AESLastRound(V state, const V round_key) {
1107 // Like AESRound, but without MixColumns.
1108  state = detail::SubBytes(state);
1109  state = detail::ShiftRows(state);
1110  state = Xor(state, round_key); // AddRoundKey
1111  return state;
1112 }
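// Usage sketch (illustrative, not part of this header): AES-128 encryption of
// the independent 16-byte block(s) held in `state`, given an externally
// expanded key schedule of 11 round keys stored as 11*16 consecutive bytes.
// Key expansion itself is not provided here; the helper name is hypothetical.
template <class V, class D = DFromV<V>>  // V: u8 vector
V AES128EncryptBlocks(D du8, V state, const uint8_t* HWY_RESTRICT round_keys) {
  state = Xor(state, LoadDup128(du8, round_keys));  // initial AddRoundKey
  for (size_t r = 1; r < 10; ++r) {
    // SubBytes + ShiftRows + MixColumns + AddRoundKey
    state = AESRound(state, LoadDup128(du8, round_keys + 16 * r));
  }
  // The final round omits MixColumns.
  return AESLastRound(state, LoadDup128(du8, round_keys + 16 * 10));
}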
1113 
1114 // Constant-time implementation inspired by
1115 // https://www.bearssl.org/constanttime.html, but about half the cost because we
1116 // use 64x64 multiplies and 128-bit XORs.
1117 template <class V>
1118 HWY_API V CLMulLower(V a, V b) {
1119  const DFromV<V> d;
1120  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
1121  const auto k1 = Set(d, 0x1111111111111111ULL);
1122  const auto k2 = Set(d, 0x2222222222222222ULL);
1123  const auto k4 = Set(d, 0x4444444444444444ULL);
1124  const auto k8 = Set(d, 0x8888888888888888ULL);
1125  const auto a0 = And(a, k1);
1126  const auto a1 = And(a, k2);
1127  const auto a2 = And(a, k4);
1128  const auto a3 = And(a, k8);
1129  const auto b0 = And(b, k1);
1130  const auto b1 = And(b, k2);
1131  const auto b2 = And(b, k4);
1132  const auto b3 = And(b, k8);
1133 
1134  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
1135  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
1136  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
1137  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
1138  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
1139  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
1140  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
1141  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
1142  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
1143 }
1144 
1145 template <class V>
1146 HWY_API V CLMulUpper(V a, V b) {
1147  const DFromV<V> d;
1148  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
1149  const auto k1 = Set(d, 0x1111111111111111ULL);
1150  const auto k2 = Set(d, 0x2222222222222222ULL);
1151  const auto k4 = Set(d, 0x4444444444444444ULL);
1152  const auto k8 = Set(d, 0x8888888888888888ULL);
1153  const auto a0 = And(a, k1);
1154  const auto a1 = And(a, k2);
1155  const auto a2 = And(a, k4);
1156  const auto a3 = And(a, k8);
1157  const auto b0 = And(b, k1);
1158  const auto b1 = And(b, k2);
1159  const auto b2 = And(b, k4);
1160  const auto b3 = And(b, k8);
1161 
1162  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
1163  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
1164  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
1165  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
1166  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
1167  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
1168  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
1169  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
1170  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
1171 }
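// Usage sketch (illustrative, not part of this header): full 64x64 -> 128 bit
// carry-less multiplication of two scalars, as used in GHASH or CRC folding.
// Assumes the target supports 128-bit (Full128) vectors and that the low half
// of the product ends up in lane 0 and the high half in lane 1.
void CarrylessMul64(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT lo_hi) {
  const Full128<uint64_t> d;  // one 128-bit block
  const auto product = CLMulLower(Set(d, a), Set(d, b));
  StoreU(product, d, lo_hi);  // lo_hi[0] = low 64 bits, lo_hi[1] = high 64 bits
}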
1172 
1173 #endif // HWY_NATIVE_AES
1174 #endif // HWY_TARGET != HWY_SCALAR
1175 
1176 // "Include guard": skip if native POPCNT-related instructions are available.
1177 #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
1178 #ifdef HWY_NATIVE_POPCNT
1179 #undef HWY_NATIVE_POPCNT
1180 #else
1181 #define HWY_NATIVE_POPCNT
1182 #endif
1183 
1184 #undef HWY_MIN_POW2_FOR_128
1185 #if HWY_TARGET == HWY_RVV
1186 #define HWY_MIN_POW2_FOR_128 1
1187 #else
1188 // All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
1189 // guarantee 128 bits anyway.
1190 #define HWY_MIN_POW2_FOR_128 0
1191 #endif
1192 
1193 // This algorithm requires vectors to be at least 16 bytes, which is the case
1194 // for LMUL >= 2. If not, use the fallback below.
1195 template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
1196  HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
1197 HWY_API V PopulationCount(V v) {
1198  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
1199  const D d;
1200  HWY_ALIGN constexpr uint8_t kLookup[16] = {
1201  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1202  };
1203  const auto lo = And(v, Set(d, 0xF));
1204  const auto hi = ShiftRight<4>(v);
1205  const auto lookup = LoadDup128(d, kLookup);
1206  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
1207 }
1208 
1209 // RVV has a specialization that avoids the Set().
1210 #if HWY_TARGET != HWY_RVV
1211 // Slower fallback for capped vectors.
1212 template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1), HWY_IF_LT128_D(D)>
1213 HWY_API V PopulationCount(V v) {
1214  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
1215  const D d;
1216  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
1217  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
1218  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
1219  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
1220 }
1221 #endif // HWY_TARGET != HWY_RVV
1222 
1223 template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
1224 HWY_API V PopulationCount(V v) {
1225  static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
1226  const D d;
1227  const Repartition<uint8_t, decltype(d)> d8;
1228  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
1229  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
1230 }
1231 
1232 template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
1233 HWY_API V PopulationCount(V v) {
1234  static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
1235  const D d;
1236  Repartition<uint16_t, decltype(d)> d16;
1237  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
1238  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
1239 }
1240 
1241 #if HWY_HAVE_INTEGER64
1242 template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
1243 HWY_API V PopulationCount(V v) {
1244  static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
1245  const D d;
1246  Repartition<uint32_t, decltype(d)> d32;
1247  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
1248  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
1249 }
1250 #endif
1251 
1252 #endif // HWY_NATIVE_POPCNT
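// Usage sketch (illustrative, not part of this header): total number of set
// bits in an array of u64 words. Assumes HWY_HAVE_INTEGER64, that `num` is a
// multiple of Lanes(d), and the SumOfLanes overload that takes a tag.
uint64_t CountSetBits(const uint64_t* HWY_RESTRICT words, size_t num) {
  const ScalableTag<uint64_t> d;
  auto sum = Zero(d);
  for (size_t i = 0; i < num; i += Lanes(d)) {
    sum = Add(sum, PopulationCount(LoadU(d, words + i)));  // per-lane counts
  }
  return GetLane(SumOfLanes(d, sum));
}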
1253 
1254 // NOLINTNEXTLINE(google-readability-namespace-comments)
1255 } // namespace HWY_NAMESPACE
1256 } // namespace hwy
1257 HWY_AFTER_NAMESPACE();