rvv-inl.h
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// RISC-V V vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// Enables the overload if Pow2 is in [min, max].
#define HWY_RVV_IF_POW2_IN(D, min, max) \
  hwy::EnableIf<(min) <= Pow2(D()) && Pow2(D()) <= (max)>* = nullptr

template <typename T, size_t N, int kPow2>
constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
  // argument enables fractional LMUL < 1. Limit to 64 because that is the
  // largest value for which vbool##_t are defined.
  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
}

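// Spot checks (illustration only; not part of the original source): MLEN is
// the number of vector bits per mask bit, i.e. SEW divided by LMUL.
static_assert(MLenFromD(ScalableTag<uint8_t>()) == 8, "e8m1 -> vbool8");
static_assert(MLenFromD(ScalableTag<uint16_t, 1>()) == 8, "e16m2 -> vbool8");
static_assert(MLenFromD(ScalableTag<uint8_t, -2>()) == 32, "e8mf4 -> vbool32");
static_assert(MLenFromD(ScalableTag<uint64_t>()) == 64, "e64m1 -> vbool64");
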
// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
// The first two arguments are SEW and SHIFT such that SEW >> SHIFT = MLEN.
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
  X_MACRO(64, 0, 64, NAME, OP) \
  X_MACRO(32, 0, 32, NAME, OP) \
  X_MACRO(16, 0, 16, NAME, OP) \
  X_MACRO(8, 0, 8, NAME, OP) \
  X_MACRO(8, 1, 4, NAME, OP) \
  X_MACRO(8, 2, 2, NAME, OP) \
  X_MACRO(8, 3, 1, NAME, OP)

// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
//
// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP

// LMULS = _TRUNC: truncatable (not the smallest LMUL)
#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _LE2: <= 2
#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)

// LMULS = _EXT: not the largest LMUL
#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)

// LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 one less
// than the minimum the ISA supports, with all other parameters (vector type
// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
// returns half of what it usually would.
//
// Notice that we can only add overloads whenever there is a D argument: those
// are unique with respect to non-virtual-LMUL overloads because their kPow2
// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
// defining another overload with the same LMUL would be an error. Thus we have
// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
// functions that take a D.

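// Example (illustration only; not part of the original source): a tag such as
// ScalableTag<uint64_t, -1> requests LMUL=1/2, which RVV does not provide for
// SEW=64. The _VIRT entries below back it with the m1 vector type
// (vuint64m1_t), and Lanes() reports half of the m1 lane count.
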
#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)

// ALL + VIRT
#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// LE2 + VIRT
#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// EXT + VIRT
#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// DEMOTE + VIRT
#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// SEW for unsigned:
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)

// SEW for signed:
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)

// SEW for float:
#if HWY_HAVE_FLOAT16
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
#else
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
#endif
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)

// Commonly used type/SEW groups:
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)

// For all combinations of SEW:
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)

// Commonly used type categories:
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)

// Assemble types for use in x-macros
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t

}  // namespace detail

// Until we have full intrinsic support for fractional LMUL, mixed-precision
// code can use LMUL 1..8 (adequate unless they need many registers).
#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <> \
  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
    using Lane = HWY_RVV_T(BASE, SEW); \
    using type = ScalableTag<Lane, SHIFT>; \
  };

HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
#undef HWY_SPECIALIZE

// ------------------------------ Lanes

// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
// vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    size_t actual = v##OP##SEW##LMUL(); \
    /* Common case of full vectors: avoid any extra instructions. */ \
    /* actual includes LMUL, so do not shift again. */ \
    if (detail::IsFull(d)) return actual; \
    /* Check for virtual LMUL, e.g. "uint16mf8_t" (not provided by */ \
    /* intrinsics). In this case the actual LMUL is 1/4, so divide by */ \
    /* another factor of two. */ \
    if (detail::ScaleByPower(128 / SEW, SHIFT) == 1) actual >>= 1; \
    return HWY_MIN(actual, N); \
  }

HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL_VIRT)
#undef HWY_RVV_LANES

// bfloat16_t is stored as uint16_t, so query lanes via the equivalent u16 tag.
template <size_t N, int kPow2>
HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag */) {
  return Lanes(Simd<uint16_t, N, kPow2>());
}

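// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): on RVV, Lanes() is a runtime value because it depends on
// VLEN, so loops query it instead of assuming a fixed vector size.
template <class D>
HWY_INLINE size_t ExampleVectorCount(D d, size_t num_elements) {
  const size_t n = Lanes(d);  // e.g. 4 for float with LMUL=1 and VLEN=128
  return (num_elements + n - 1) / n;  // number of full/partial vectors
}
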
// ------------------------------ Common x-macros

// Last argument to most intrinsics. Use when the op has no d arg of its own,
// which means there is no user-specified cap.
#define HWY_RVV_AVL(SEW, SHIFT) \
  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())

// vector = f(vector), e.g. Not
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, scalar), e.g. detail::AddS
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(mask)
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
    return vm##OP##_m_b##MLEN(m, ~0ull); \
  }

// ================================================== INIT

// ------------------------------ Set

#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
    return v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
#undef HWY_RVV_SET

// Treat bfloat16_t as uint16_t (using the previously defined Set overloads);
// required for Zero and VFromD.
template <size_t N, int kPow2>
decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
                                                 bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

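// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): Set broadcasts a scalar to all lanes, and VFromD names
// the vector type produced for a given tag.
template <class D>
HWY_INLINE VFromD<D> ExampleBroadcast(D d, TFromD<D> value) {
  return Set(d, value);  // all Lanes(d) lanes equal `value`
}
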
// ------------------------------ Zero

template <typename T, size_t N, int kPow2>
HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
  return Set(d, T(0));
}

// ------------------------------ Undefined

// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
// by it gives unpredictable results. It should only be used for maskoff, so
// keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \
    return v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \
  }

HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
#undef HWY_RVV_UNDEFINED
}  // namespace detail

template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ BitCast

namespace detail {

// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
#undef HWY_RVV_TRUNC

// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
       HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
#undef HWY_RVV_EXT

// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
// the same as the actual input type.
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
       HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v; \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
#undef HWY_RVV_EXT_VIRT

// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
// by _ALL_VIRT.

// There is no reinterpret from u8 <-> u8, so just return.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vuint8##LMUL##_t v) { \
    return v; \
  } \
  template <size_t N> \
  HWY_API vuint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v; \
  }

// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vint8##LMUL##_t v) { \
    return vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API vint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
  }

// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
  }

// Additional versions for virtual LMUL using LMULH for byte vectors.
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
  }

HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)

#undef HWY_RVV_CAST_U8
#undef HWY_RVV_CAST_I8
#undef HWY_RVV_CAST_U
#undef HWY_RVV_CAST_IF
#undef HWY_RVV_CAST_VIRT_U
#undef HWY_RVV_CAST_VIRT_IF

template <size_t N, int kPow2>
HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
    Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
}

namespace detail {

template <class V, class DU = RebindToUnsigned<DFromV<V>>>
HWY_INLINE VFromD<DU> BitCastToUnsigned(V v) {
  return BitCast(DU(), v);
}

}  // namespace detail

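// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): BitCast reinterprets lane bits without conversion; a
// round trip through the unsigned type returns the original value.
template <class DF>
HWY_INLINE VFromD<DF> ExampleBitCastRoundTrip(DF df, VFromD<DF> v) {
  const RebindToUnsigned<DF> du;
  return BitCast(df, BitCast(du, v));  // bit-identical to v
}
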
// ------------------------------ Iota

namespace detail {

#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    return v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
#undef HWY_RVV_IOTA

template <class D, class DU = RebindToUnsigned<D>>
HWY_INLINE VFromD<DU> Iota0(const D /*d*/) {
  return BitCastToUnsigned(Iota0(DU()));
}

}  // namespace detail

// ================================================== LOGICAL

// ------------------------------ Not

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Not(const V v) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Not(BitCast(DU(), v)));
}

// ------------------------------ And

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Or

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Xor

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ AndNot

template <class V>
HWY_API V AndNot(const V not_a, const V b) {
  return And(Not(not_a), b);
}

// ------------------------------ Or3

template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ CopySign

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  // RVV can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}

// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)

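// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): detail::Iota0 plus detail::AddS is the building block
// behind the public Iota pattern {first, first+1, ...} for unsigned lanes.
template <class D, HWY_IF_UNSIGNED_D(D)>
HWY_INLINE VFromD<D> ExampleIota(D d, TFromD<D> first) {
  return detail::AddS(detail::Iota0(d), first);
}
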
// ------------------------------ Sub
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)

// ------------------------------ SaturatedAdd

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)

// ------------------------------ SaturatedSub

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)

// ------------------------------ AverageRound

// TODO(janwas): check vxrm rounding mode
HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)

// ------------------------------ ShiftLeft[Same]

// Intrinsics do not define .vi forms, so use .vx instead.
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, HWY_RVV_AVL(SEW, SHIFT)); \
  } \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
                                        HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)

// ------------------------------ ShiftRight[Same]

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)

#undef HWY_RVV_SHIFT

// ------------------------------ SumsOf8 (ShiftRight, Add)
template <class VU8>
HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
  const DFromV<VU8> du8;
  const RepartitionToWide<decltype(du8)> du16;
  const RepartitionToWide<decltype(du16)> du32;
  const RepartitionToWide<decltype(du32)> du64;
  using VU16 = VFromD<decltype(du16)>;

  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
}

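// Reference semantics (illustration only, hypothetical scalar helper; not
// part of the original header): each u64 output lane of SumsOf8 equals the
// sum of the corresponding group of 8 input bytes.
HWY_INLINE uint64_t ExampleSumOf8Bytes(const uint8_t* HWY_RESTRICT bytes) {
  uint64_t sum = 0;
  for (size_t i = 0; i < 8; ++i) sum += bytes[i];
  return sum;
}
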
// ------------------------------ RotateRight
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}

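// Example (illustration only, hypothetical helper; not part of the original
// header): rotating u32 lanes right by 8 moves the low byte to the top, e.g.
// 0x11223344 -> 0x44112233.
template <class V>
HWY_INLINE V ExampleRotateByteRight(V v) {
  return RotateRight<8>(v);
}
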
// ------------------------------ Shl
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, bits, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)

#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits), \
                                        HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)

// ------------------------------ Shr

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)

#undef HWY_RVV_SHIFT_II
#undef HWY_RVV_SHIFT_VV

// ------------------------------ Min

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)

// ------------------------------ Max

namespace detail {

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)

}  // namespace detail

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)

// ------------------------------ Mul

// Only for internal use (Highway only promises Mul for 16/32-bit inputs).
// Used by MulLower.
namespace detail {
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI16(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_UI32(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)

// ------------------------------ MulHigh

// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
// Used by MulEven; vwmul does not work for m8.
namespace detail {
HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)

// ------------------------------ MulFixedPoint15
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulFixedPoint15, smul, _ALL)

// ------------------------------ Div
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)

// ------------------------------ ApproximateReciprocal
HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)

// ------------------------------ Sqrt
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)

// ------------------------------ ApproximateReciprocalSqrt
HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)

// ------------------------------ MulAdd
// Note: op is still named vv, not vvv.
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
       HWY_RVV_V(BASE, SEW, LMUL) add) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)

// ------------------------------ NegMulAdd
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)

// ------------------------------ MulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)

// ------------------------------ NegMulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)

#undef HWY_RVV_FMA

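// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): MulAdd computes mul*x + add in one fused instruction,
// here as the classic a*x + y kernel.
template <class D, HWY_IF_FLOAT_D(D)>
HWY_INLINE VFromD<D> ExampleAxpy(D d, TFromD<D> a, VFromD<D> x, VFromD<D> y) {
  return MulAdd(Set(d, a), x, y);
}
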
// ================================================== COMPARE

// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
// vboolXX_t is a power of two divisor for vector bits. SEW 8 / LMUL 1 = 1/8th
// of all bits; SEW 8 / LMUL 4 = half of all bits.

// mask = f(vector, vector)
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b, \
                                                  HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(vector, scalar)
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return v##OP##_##CHAR##SEW##LMUL##_b##MLEN(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// ------------------------------ Eq
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
}  // namespace detail

// ------------------------------ Ne
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
}  // namespace detail

// ------------------------------ Lt
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)

namespace detail {
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
}  // namespace detail

// ------------------------------ Le
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)

#undef HWY_RVV_RETM_ARGVV
#undef HWY_RVV_RETM_ARGVS

// ------------------------------ Gt/Ge

template <class V>
HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);
}

template <class V>
HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
  return Lt(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
  return detail::NeS(And(a, bit), 0);
}

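// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): TestBit yields a mask of lanes whose bit kBit is set.
template <int kBit, class D>
HWY_INLINE auto ExampleHasBit(D d, VFromD<D> v) -> decltype(TestBit(v, v)) {
  return TestBit(v, Set(d, static_cast<TFromD<D>>(uint64_t{1} << kBit)));
}
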
// ------------------------------ Not
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not)

// ------------------------------ And

// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
    return vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)

// ------------------------------ AndNot
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn)

// ------------------------------ Or
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)

// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)

#undef HWY_RVV_RETM_ARGMM

// ------------------------------ IfThenElse
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
       HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)

#undef HWY_RVV_IF_THEN_ELSE

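// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): IfThenElse selects yes-lanes where the mask is true,
// e.g. a branch-free per-lane maximum.
template <class D>
HWY_INLINE VFromD<D> ExampleSelectMax(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt(a, b), b, a);
}
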
// ------------------------------ IfThenElseZero
template <class M, class V>
HWY_API V IfThenElseZero(const M mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse

#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                  LMULH, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return v##OP##_##CHAR##SEW##LMUL(m, no, 0, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)

#undef HWY_RVV_IF_THEN_ZERO_ELSE

// ------------------------------ MaskFromVec

template <class V>
HWY_API auto MaskFromVec(const V v) -> decltype(Eq(v, v)) {
  return detail::NeS(v, 0);
}

template <class D>
using MFromD = decltype(MaskFromVec(Zero(D())));

template <class D, typename MFrom>
HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
  // No need to check lane size/LMUL are the same: if not, casting MFrom to
  // MFromD<D> would fail.
  return mask;
}

// ------------------------------ VecFromMask

namespace detail {
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_M(MLEN) m) { \
    return v##OP##_##CHAR##SEW##LMUL##_m(m, v0, v0, 1, \
                                         HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, SubS, sub_vx, _ALL)
#undef HWY_RVV_VEC_FROM_MASK
}  // namespace detail

template <class D, HWY_IF_NOT_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return detail::SubS(Zero(d), mask);
}

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
}

// ------------------------------ IfVecThenElse (MaskFromVec)

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ ZeroIfNegative
template <class V>
HWY_API V ZeroIfNegative(const V v) {
  return IfThenZeroElse(detail::LtS(v, 0), v);
}

// ------------------------------ BroadcastSignBit
template <class V>
HWY_API V BroadcastSignBit(const V v) {
  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
}

// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
template <class V>
HWY_API V IfNegativeThenElse(V v, V yes, V no) {
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;

  MFromD<decltype(d)> m =
      MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
  return IfThenElse(m, yes, no);
}

// ------------------------------ FindFirstTrue

#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return vfirst_m_b##MLEN(m, Lanes(d)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
#undef HWY_RVV_FIND_FIRST_TRUE

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, MFromD<D> m) {
  return FindFirstTrue(d, m) < 0;
}

// ------------------------------ AllTrue

#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return AllFalse(d, vmnot_m_b##MLEN(m, Lanes(d))); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
#undef HWY_RVV_ALL_TRUE

// ------------------------------ CountTrue

#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return vcpop_m_b##MLEN(m, Lanes(d)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
#undef HWY_RVV_COUNT_TRUE

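// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): count lanes equal to a given value; CountTrue consumes
// the comparison mask.
template <class D>
HWY_INLINE size_t ExampleCountEqual(D d, VFromD<D> v, TFromD<D> value) {
  return CountTrue(d, Eq(v, Set(d, value)));
}
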
// ================================================== MEMORY

// ------------------------------ Load

#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
       const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
#undef HWY_RVV_LOAD

// There is no native BF16, treat as uint16_t.
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(
    Simd<bfloat16_t, N, kPow2> d, const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
                   Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ LoadU

// RVV only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ MaskedLoad

#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
       const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, Zero(d), p, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
#undef HWY_RVV_MASKED_LOAD

// ------------------------------ Store

#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
#undef HWY_RVV_STORE

// ------------------------------ BlendedStore

#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
#undef HWY_RVV_BLENDED_STORE

namespace detail {

#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count); \
  }
HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
#undef HWY_RVV_STOREN

}  // namespace detail

// ------------------------------ StoreU

// RVV only requires lane alignment, not natural alignment of the entire
// vector.
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

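// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): a vectorized copy, assuming count is a multiple of
// Lanes(d). RVV only requires lane alignment, so unaligned pointers are fine.
template <class D>
HWY_INLINE void ExampleCopy(D d, const TFromD<D>* HWY_RESTRICT from,
                            size_t count, TFromD<D>* HWY_RESTRICT to) {
  const size_t n = Lanes(d);
  for (size_t i = 0; i < count; i += n) {
    Store(Load(d, from + i), d, to + i);
  }
}
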
// ------------------------------ Stream
template <class V, class D, typename T>
HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ ScatterOffset

#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
                    HWY_RVV_V(int, SEW, LMUL) offset) { \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        base, detail::BitCastToUnsigned(offset), v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
#undef HWY_RVV_SCATTER

// ------------------------------ ScatterIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
}

// ------------------------------ GatherOffset

#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
       const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
       HWY_RVV_V(int, SEW, LMUL) offset) { \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        base, detail::BitCastToUnsigned(offset), Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
#undef HWY_RVV_GATHER

// ------------------------------ GatherIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<3>(index));
}

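// Usage sketch (illustration only, hypothetical helper; not part of the
// original header): gather table[index[i]] into each lane; GatherIndex
// scales the indices by the lane size internally.
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> ExampleTableLookup(D d,
                                        const TFromD<D>* HWY_RESTRICT table,
                                        VFromD<RebindToSigned<D>> index) {
  return GatherIndex(d, table, index);
}
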
// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, unaligned, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT)
#undef HWY_RVV_LOAD2

// ------------------------------ LoadInterleaved3

#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v2) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, unaligned, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT)
#undef HWY_RVV_LOAD3

// ------------------------------ LoadInterleaved4

#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned, \
      HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \
      HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, &v3, aligned, \
                                        Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT)
#undef HWY_RVV_LOAD4

1481 // ------------------------------ StoreInterleaved2
1482 
1483 #define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1484  MLEN, NAME, OP) \
1485  template <size_t N> \
1486  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \
1487  HWY_RVV_V(BASE, SEW, LMUL) v1, \
1488  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1489  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
1490  v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, Lanes(d)); \
1491  }
1492 // Segments are limited to 8 registers, so we can only go up to LMUL=2.
1494 #undef HWY_RVV_STORE2
1495 
1496 // ------------------------------ StoreInterleaved3
1497 
1498 #define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1499  MLEN, NAME, OP) \
1500  template <size_t N> \
1501  HWY_API void NAME( \
1502  HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
1503  HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1504  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
1505  v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, v2, Lanes(d)); \
1506  }
1507 // Segments are limited to 8 registers, so we can only go up to LMUL=2.
1509 #undef HWY_RVV_STORE3
1510 
1511 // ------------------------------ StoreInterleaved4
1512 
1513 #define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1514  MLEN, NAME, OP) \
1515  template <size_t N> \
1516  HWY_API void NAME( \
1517  HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
1518  HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \
1519  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1520  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \
1521  v##OP##e##SEW##_v_##CHAR##SEW##LMUL(aligned, v0, v1, v2, v3, Lanes(d)); \
1522  }
1523 // Segments are limited to 8 registers, so we can only go up to LMUL=2.
1524 HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)
1525 #undef HWY_RVV_STORE4
1526 
1527 // ================================================== CONVERT
1528 
1529 // ------------------------------ PromoteTo
1530 
1531 // SEW is for the input so we can use F16 (no-op if not supported).
1532 #define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1533  SHIFT, MLEN, NAME, OP) \
1534  template <size_t N> \
1535  HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
1536  HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1537  return OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
1538  }
1539 
1540 HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1541 HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1542 HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1543 HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1544 HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1545 HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1546 HWY_RVV_FOREACH_F16(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1547 HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1548 #undef HWY_RVV_PROMOTE
1549 
1550 // The above X-macro cannot handle 4x promotion nor type switching.
1551 // TODO(janwas): use BASE2 arg to allow the latter.
1552 #define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
1553  SHIFT, ADD) \
1554  template <size_t N> \
1555  HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
1556  PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
1557  HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
1558  return OP##CHAR##BITS##LMUL(v, Lanes(d)); \
1559  }
1560 
1561 #define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1562  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
1563  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
1564  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
1565  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
1566  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
1567 
1568 #define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1569  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
1570  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
1571  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
1572  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
1573  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
1574 
1575 HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
1576 HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
1577 
1578 // i32 to f64
1579 HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
1580 
1581 #undef HWY_RVV_PROMOTE_X4
1582 #undef HWY_RVV_PROMOTE_X2
1583 #undef HWY_RVV_PROMOTE
1584 
1585 // Unsigned to signed: cast for unsigned promote.
1586 template <size_t N, int kPow2>
1587 HWY_API auto PromoteTo(Simd<int16_t, N, kPow2> d,
1588  VFromD<Rebind<uint8_t, decltype(d)>> v)
1589  -> VFromD<decltype(d)> {
1590  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1591 }
1592 
1593 template <size_t N, int kPow2>
1594 HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1595  VFromD<Rebind<uint8_t, decltype(d)>> v)
1596  -> VFromD<decltype(d)> {
1597  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1598 }
1599 
1600 template <size_t N, int kPow2>
1601 HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1602  VFromD<Rebind<uint16_t, decltype(d)>> v)
1603  -> VFromD<decltype(d)> {
1604  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1605 }
1606 
1607 template <size_t N, int kPow2>
1608 HWY_API auto PromoteTo(Simd<float, N, kPow2> d,
1609  VFromD<Rebind<bfloat16_t, decltype(d)>> v)
1610  -> VFromD<decltype(d)> {
1611  const RebindToSigned<decltype(d)> di32;
1612  const Rebind<uint16_t, decltype(d)> du16;
1613  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
1614 }
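// Worked example of the bfloat16 trick above: bf16 is the upper half of the
// binary32 bit pattern, so zero-extending to u32 and shifting left by 16
// reconstructs the float exactly, e.g. bf16 0x3F80 -> u32 0x00003F80 ->
// ShiftLeft<16> -> 0x3F800000, which is 1.0f.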
1615 
1616 // ------------------------------ DemoteTo U
1617 
1618 // SEW is for the source so we can use _DEMOTE.
1619 #define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1620  MLEN, NAME, OP) \
1621  template <size_t N> \
1622  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1623  HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1624  return OP##CHAR##SEWH##LMULH(v, 0, Lanes(d)); \
1625  } \
1626  template <size_t N> \
1627  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME##Shr16( \
1628  HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1629  return OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \
1630  }
1631 
1632 // Unsigned -> unsigned (also used for bf16)
1633 namespace detail {
1634 HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1635 HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1636 } // namespace detail
1637 
1638 // SEW is for the source so we can use _DEMOTE.
1639 #define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1640  SHIFT, MLEN, NAME, OP) \
1641  template <size_t N> \
1642  HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
1643  HWY_RVV_D(uint, SEWH, N, SHIFT - 1) d, HWY_RVV_V(int, SEW, LMUL) v) { \
1644  /* First clamp negative numbers to zero to match x86 packus. */ \
1645  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0))); \
1646  }
1647 HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1648 HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1649 #undef HWY_RVV_DEMOTE_I_TO_U
1650 
1651 template <size_t N>
1652 HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
1653  return vnclipu_wx_u8mf8(DemoteTo(Simd<uint16_t, N, -2>(), v), 0, Lanes(d));
1654 }
1655 template <size_t N>
1656 HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
1657  return vnclipu_wx_u8mf4(DemoteTo(Simd<uint16_t, N, -1>(), v), 0, Lanes(d));
1658 }
1659 template <size_t N>
1660 HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
1661  return vnclipu_wx_u8mf2(DemoteTo(Simd<uint16_t, N, 0>(), v), 0, Lanes(d));
1662 }
1663 template <size_t N>
1664 HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
1665  return vnclipu_wx_u8m1(DemoteTo(Simd<uint16_t, N, 1>(), v), 0, Lanes(d));
1666 }
1667 template <size_t N>
1668 HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
1669  return vnclipu_wx_u8m2(DemoteTo(Simd<uint16_t, N, 2>(), v), 0, Lanes(d));
1670 }
1671 
1672 HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
1673  const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
1674  return vnclipu_wx_u8mf8(vnclipu_wx_u16mf4(v, 0, avl), 0, avl);
1675 }
1676 HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
1677  const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
1678  return vnclipu_wx_u8mf4(vnclipu_wx_u16mf2(v, 0, avl), 0, avl);
1679 }
1680 HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
1681  const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
1682  return vnclipu_wx_u8mf2(vnclipu_wx_u16m1(v, 0, avl), 0, avl);
1683 }
1684 HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
1685  const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
1686  return vnclipu_wx_u8m1(vnclipu_wx_u16m2(v, 0, avl), 0, avl);
1687 }
1688 HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
1689  const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
1690  return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
1691 }
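// Note on the chains above: vnclipu narrows by exactly one power of two per
// instruction (2*SEW -> SEW), so u32 -> u8 takes two steps via u16, each with
// unsigned saturation: e.g. 0x12345678 clamps to 0xFFFF, then to 0xFF.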
1692 
1693 // ------------------------------ DemoteTo I
1694 
1695 HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1696 HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1697 
1698 template <size_t N>
1699 HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) {
1700  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v));
1701 }
1702 template <size_t N>
1703 HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) {
1704  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v));
1705 }
1706 template <size_t N>
1707 HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) {
1708  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v));
1709 }
1710 template <size_t N>
1711 HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) {
1712  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v));
1713 }
1714 template <size_t N>
1715 HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
1716  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
1717 }
1718 
1719 #undef HWY_RVV_DEMOTE
1720 
1721 // ------------------------------ DemoteTo F
1722 
1723 // SEW is for the source so we can use _DEMOTE.
1724 #define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1725  SHIFT, MLEN, NAME, OP) \
1726  template <size_t N> \
1727  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1728  HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1729  return OP##SEWH##LMULH(v, Lanes(d)); \
1730  }
1731 
1732 #if HWY_HAVE_FLOAT16
1733 HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1734  _DEMOTE_VIRT)
1735 #endif
1736 HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1737  _DEMOTE_VIRT)
1738 #undef HWY_RVV_DEMOTE_F
1739 
1740 // TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
1741 template <size_t N>
1742 HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) {
1743  return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
1744 }
1745 template <size_t N>
1746 HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) {
1747  return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
1748 }
1749 template <size_t N>
1750 HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) {
1751  return vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
1752 }
1753 template <size_t N>
1754 HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) {
1755  return vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
1756 }
1757 template <size_t N>
1758 HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) {
1759  return vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
1760 }
1761 
1762 template <size_t N, int kPow2>
1763 HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
1764  Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
1765  const RebindToUnsigned<decltype(d)> du16;
1766  const Rebind<uint32_t, decltype(d)> du32;
1767  return detail::DemoteToShr16(du16, BitCast(du32, v));
1768 }
1769 
1770 // ------------------------------ ConvertTo F
1771 
1772 #define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1773  SHIFT, MLEN, NAME, OP) \
1774  template <size_t N> \
1775  HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
1776  HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
1777  return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
1778  } \
1779  /* Truncates (rounds toward zero). */ \
1780  template <size_t N> \
1781  HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
1782  HWY_RVV_V(BASE, SEW, LMUL) v) { \
1783  return vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \
1784  } \
1785 // API only requires f32 but we provide f64 for internal use.
1786 HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
1787 #undef HWY_RVV_CONVERT
1788 
1789 // Uses default rounding mode. Must be separate because there is no D arg.
1790 #define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1791  SHIFT, MLEN, NAME, OP) \
1792  HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1793  return vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
1794  }
1795 HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
1796 #undef HWY_RVV_NEAREST
1797 
1798 // ================================================== COMBINE
1799 
1800 namespace detail {
1801 
1802 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes
1803 // offsets are implicitly relative to the start of their 128-bit block.
1804 template <typename T, size_t N, int kPow2>
1805 HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) {
1806  size_t lpb = 16 / sizeof(T);
1807  if (IsFull(d)) return lpb;
1808  // Also honor the user-specified (constexpr) N limit.
1809  lpb = HWY_MIN(lpb, N);
1810  // No fraction, we're done.
1811  if (kPow2 >= 0) return lpb;
1812  // Fractional LMUL: Lanes(d) may be smaller than lpb, so honor that.
1813  return HWY_MIN(lpb, Lanes(d));
1814 }
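// Example: for T=uint32_t and a full vector, this returns 16/4 = 4 lanes per
// 128-bit block, independent of the actual VLEN.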
1815 
1816 template <class D, class V>
1817 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
1818  using T = MakeUnsigned<TFromD<D>>;
1819  return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
1820 }
1821 
1822 template <size_t kLanes, class D>
1823 HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
1824  const RebindToUnsigned<D> du;
1825  const RebindToSigned<D> di;
1826  const auto idx_mod = AndS(Iota0(du), LanesPerBlock(du) - 1);
1827  return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
1828 }
1829 
1830 // vector = f(vector, vector, size_t)
1831 #define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1832  MLEN, NAME, OP) \
1833  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1834  NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
1835  size_t lanes) { \
1836  return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \
1837  HWY_RVV_AVL(SEW, SHIFT)); \
1838  }
1839 
1840 HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideUp, slideup, _ALL)
1841 HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideDown, slidedown, _ALL)
1842 
1843 #undef HWY_RVV_SLIDE
1844 
1845 } // namespace detail
1846 
1847 // ------------------------------ ConcatUpperLower
1848 template <class D, class V>
1849 HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
1850  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
1851 }
1852 
1853 // ------------------------------ ConcatLowerLower
1854 template <class D, class V>
1855 HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
1856  return detail::SlideUp(lo, hi, Lanes(d) / 2);
1857 }
1858 
1859 // ------------------------------ ConcatUpperUpper
1860 template <class D, class V>
1861 HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
1862  // Move upper half into lower
1863  const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
1864  return ConcatUpperLower(d, hi, lo_down);
1865 }
1866 
1867 // ------------------------------ ConcatLowerUpper
1868 template <class D, class V>
1869 HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
1870  // Move half of both inputs to the other half
1871  const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2);
1872  const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
1873  return ConcatUpperLower(d, hi_up, lo_down);
1874 }
1875 
1876 // ------------------------------ Combine
1877 template <class D2, class V>
1878 HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
1879  return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
1880  Lanes(d2) / 2);
1881 }
1882 
1883 // ------------------------------ ZeroExtendVector
1884 
1885 template <class D2, class V>
1886 HWY_API VFromD<D2> ZeroExtendVector(D2 d2, const V lo) {
1887  return Combine(d2, Xor(lo, lo), lo);
1888 }
1889 
1890 // ------------------------------ Lower/UpperHalf
1891 
1892 namespace detail {
1893 
1894 // RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
1895 // that SEW = sizeof(T)*8 and LMUL = 1 << Pow2().
1896 template <class D>
1897 constexpr bool IsSupportedLMUL(D d) {
1898  return (size_t{1} << (Pow2(d) + 3)) >= sizeof(TFromD<D>);
1899 }
1900 
1901 } // namespace detail
1902 
1903 // If IsSupportedLMUL, just 'truncate' i.e. halve LMUL.
1904 template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
1905 HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
1906  return detail::Trunc(v);
1907 }
1908 
1909 // Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
1910 // the hardware may set "vill" if we attempt such an LMUL. However, the V
1911 // extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
1912 // still makes sense to have half of an SEW=64 vector. We instead just return
1913 // the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
1914 template <class DH, class V,
1915  hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
1916 HWY_API V LowerHalf(const DH /* tag */, const V v) {
1917  return v;
1918 }
1919 
1920 // Same, but without D arg
1921 template <class V>
1922 HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
1923  return LowerHalf(Half<DFromV<V>>(), v);
1924 }
1925 
1926 template <class DH>
1927 HWY_API VFromD<DH> UpperHalf(const DH d2, const VFromD<Twice<DH>> v) {
1928  return LowerHalf(d2, detail::SlideDown(v, v, Lanes(d2)));
1929 }
1930 
1931 // ================================================== SWIZZLE
1932 
1933 namespace detail {
1934 // Special instruction for 1 lane is presumably faster?
1935 #define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1936  MLEN, NAME, OP) \
1937  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1938  return v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
1939  }
1940 
1941 HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
1942 HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
1943 HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
1944 HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
1945 #undef HWY_RVV_SLIDE1
1946 } // namespace detail
1947 
1948 // ------------------------------ GetLane
1949 
1950 #define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1951  SHIFT, MLEN, NAME, OP) \
1952  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1953  return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \
1954  }
1955 
1956 HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL)
1957 HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL)
1958 #undef HWY_RVV_GET_LANE
1959 
1960 // ------------------------------ ExtractLane
1961 template <class V>
1962 HWY_API TFromV<V> ExtractLane(const V v, size_t i) {
1963  return GetLane(detail::SlideDown(v, v, i));
1964 }
1965 
1966 // ------------------------------ InsertLane
1967 
1968 template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 1)>
1969 HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
1970  const DFromV<V> d;
1971  const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
1972  using TU = TFromD<decltype(du)>;
1973  const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
1974  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
1975 }
1976 
1977 namespace detail {
1978 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof)
1979 } // namespace detail
1980 
1981 // For 8-bit lanes, Iota0 might overflow.
1982 template <class V, HWY_IF_LANE_SIZE_V(V, 1)>
1983 HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
1984  const DFromV<V> d;
1985  const auto zero = Zero(d);
1986  const auto one = Set(d, 1);
1987  const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
1988  const auto is_i = detail::SetOnlyFirst(ge_i);
1989  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
1990 }
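// Worked example of the SlideUp trick above, for i=2 and four lanes:
// SlideUp(zero, one, 2) = {0,0,1,1}, so ge_i = {0,0,1,1} marks lanes >= i and
// SetOnlyFirst clears all but the first set bit: is_i = {0,0,1,0}. This
// avoids Iota0, whose u8 lane indices would wrap at 256.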
1991 
1992 // ------------------------------ OddEven
1993 template <class V>
1994 HWY_API V OddEven(const V a, const V b) {
1995  const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
1996  const auto is_even = detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
1997  return IfThenElse(is_even, b, a);
1998 }
1999 
2000 // ------------------------------ DupEven (OddEven)
2001 template <class V>
2002 HWY_API V DupEven(const V v) {
2003  const V up = detail::Slide1Up(v);
2004  return OddEven(up, v);
2005 }
2006 
2007 // ------------------------------ DupOdd (OddEven)
2008 template <class V>
2009 HWY_API V DupOdd(const V v) {
2010  const V down = detail::Slide1Down(v);
2011  return OddEven(v, down);
2012 }
2013 
2014 // ------------------------------ OddEvenBlocks
2015 template <class V>
2016 HWY_API V OddEvenBlocks(const V a, const V b) {
2017  const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
2018  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
2019  const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
2020  const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
2021  return IfThenElse(is_even, b, a);
2022 }
2023 
2024 // ------------------------------ SwapAdjacentBlocks
2025 
2026 template <class V>
2027 HWY_API V SwapAdjacentBlocks(const V v) {
2028  const DFromV<V> d;
2029  const size_t lpb = detail::LanesPerBlock(d);
2030  const V down = detail::SlideDown(v, v, lpb);
2031  const V up = detail::SlideUp(v, v, lpb);
2032  return OddEvenBlocks(up, down);
2033 }
2034 
2035 // ------------------------------ TableLookupLanes
2036 
2037 template <class D, class VI>
2038 HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
2039  static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
2040  const RebindToUnsigned<decltype(d)> du; // instead of <D>: avoids unused d.
2041  const auto indices = BitCast(du, vec);
2042 #if HWY_IS_DEBUG_BUILD
2043  HWY_DASSERT(AllTrue(du, detail::LtS(indices, Lanes(d))));
2044 #endif
2045  return indices;
2046 }
2047 
2048 template <class D, typename TI>
2049 HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
2050  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
2051  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
2052 }
2053 
2054 // Lane sizes <32 bit are not part of the Highway API, but are used in
2055 // Broadcast. This limits VLMAX to 2048! We could instead use vrgatherei16.
2056 #define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2057  MLEN, NAME, OP) \
2058  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2059  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
2060  return v##OP##_vv_##CHAR##SEW##LMUL(v, idx, HWY_RVV_AVL(SEW, SHIFT)); \
2061  }
2062 
2063 HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
2064 #undef HWY_RVV_TABLE
2065 
2066 // ------------------------------ ConcatOdd (TableLookupLanes)
2067 template <class D, class V>
2068 HWY_API V ConcatOdd(D d, const V hi, const V lo) {
2069  const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
2070  const auto iota = detail::Iota0(du);
2071  const auto idx = detail::AddS(Add(iota, iota), 1);
2072  const auto lo_odd = TableLookupLanes(lo, idx);
2073  const auto hi_odd = TableLookupLanes(hi, idx);
2074  return detail::SlideUp(lo_odd, hi_odd, Lanes(d) / 2);
2075 }
2076 
2077 // ------------------------------ ConcatEven (TableLookupLanes)
2078 template <class D, class V>
2079 HWY_API V ConcatEven(D d, const V hi, const V lo) {
2080  const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
2081  const auto iota = detail::Iota0(du);
2082  const auto idx = Add(iota, iota);
2083  const auto lo_even = TableLookupLanes(lo, idx);
2084  const auto hi_even = TableLookupLanes(hi, idx);
2085  return detail::SlideUp(lo_even, hi_even, Lanes(d) / 2);
2086 }
2087 
2088 // ------------------------------ Reverse (TableLookupLanes)
2089 template <class D>
2090 HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
2091  const RebindToUnsigned<D> du;
2092  using TU = TFromD<decltype(du)>;
2093  const size_t N = Lanes(du);
2094  const auto idx =
2095  detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
2096  return TableLookupLanes(v, idx);
2097 }
2098 
2099 // ------------------------------ Reverse2 (RotateRight, OddEven)
2100 
2101 // Shifting and adding requires fewer instructions than blending, but casting to
2102 // u32 only works for LMUL in [1/2, 8].
2103 template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -1, 3)>
2104 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2105  const Repartition<uint32_t, D> du32;
2106  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2107 }
2108 // For LMUL < 1/2, we can extend and then truncate.
2109 template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -3, -2)>
2110 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2111  const Twice<decltype(d)> d2;
2112  const Twice<decltype(d2)> d4;
2113  const Repartition<uint32_t, decltype(d4)> du32;
2114  const auto vx = detail::Ext(d4, detail::Ext(d2, v));
2115  const auto rx = BitCast(d4, RotateRight<16>(BitCast(du32, vx)));
2116  return detail::Trunc(detail::Trunc(rx));
2117 }
2118 
2119 // Shifting and adding requires fewer instructions than blending, but casting to
2120 // u64 does not work for LMUL < 1.
2121 template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, 0, 3)>
2122 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2123  const Repartition<uint64_t, decltype(d)> du64;
2124  return BitCast(d, RotateRight<32>(BitCast(du64, v)));
2125 }
2126 
2127 // For fractions, we can extend and then truncate.
2128 template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, -2, -1)>
2129 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2130  const Twice<decltype(d)> d2;
2131  const Twice<decltype(d2)> d4;
2132  const Repartition<uint64_t, decltype(d4)> du64;
2133  const auto vx = detail::Ext(d4, detail::Ext(d2, v));
2134  const auto rx = BitCast(d4, RotateRight<32>(BitCast(du64, vx)));
2135  return detail::Trunc(detail::Trunc(rx));
2136 }
2137 
2138 template <class D, class V = VFromD<D>, HWY_IF_LANE_SIZE_D(D, 8)>
2139 HWY_API V Reverse2(D /* tag */, const V v) {
2140  const V up = detail::Slide1Up(v);
2141  const V down = detail::Slide1Down(v);
2142  return OddEven(up, down);
2143 }
2144 
2145 // ------------------------------ Reverse4 (TableLookupLanes)
2146 
2147 template <class D>
2148 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2149  const RebindToUnsigned<D> du;
2150  const auto idx = detail::XorS(detail::Iota0(du), 3);
2151  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2152 }
2153 
2154 // ------------------------------ Reverse8 (TableLookupLanes)
2155 
2156 template <class D>
2157 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2158  const RebindToUnsigned<D> du;
2159  const auto idx = detail::XorS(detail::Iota0(du), 7);
2160  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2161 }
2162 
2163 // ------------------------------ ReverseBlocks (Reverse, Shuffle01)
2164 template <class D, class V = VFromD<D>>
2165 HWY_API V ReverseBlocks(D d, V v) {
2166  const Repartition<uint64_t, D> du64;
2167  const size_t N = Lanes(du64);
2168  const auto rev =
2169  detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
2170  // Swap lo/hi u64 within each block
2171  const auto idx = detail::XorS(rev, 1);
2172  return BitCast(d, TableLookupLanes(BitCast(du64, v), idx));
2173 }
2174 
2175 // ------------------------------ Compress
2176 
2177 template <typename T>
2178 struct CompressIsPartition {
2179  enum { value = 0 };
2180 };
2181 
2182 #define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2183  SHIFT, MLEN, NAME, OP) \
2184  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2185  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
2186  return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v, HWY_RVV_AVL(SEW, SHIFT)); \
2187  }
2188 
2189 HWY_RVV_FOREACH_UI163264(HWY_RVV_COMPRESS, Compress, compress, _ALL)
2190 HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress, _ALL)
2191 #undef HWY_RVV_COMPRESS
2192 
2193 // ------------------------------ CompressNot
2194 template <class V, class M>
2195 HWY_API V CompressNot(V v, const M mask) {
2196  return Compress(v, Not(mask));
2197 }
2198 
2199 // ------------------------------ CompressBlocksNot
2200 template <class V, class M>
2201 HWY_API V CompressBlocksNot(V v, const M mask) {
2202  return CompressNot(v, mask);
2203 }
2204 
2205 // ------------------------------ CompressStore
2206 template <class V, class M, class D>
2207 HWY_API size_t CompressStore(const V v, const M mask, const D d,
2208  TFromD<D>* HWY_RESTRICT unaligned) {
2209  StoreU(Compress(v, mask), d, unaligned);
2210  return CountTrue(d, mask);
2211 }
2212 
2213 // ------------------------------ CompressBlendedStore
2214 template <class V, class M, class D>
2215 HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
2216  TFromD<D>* HWY_RESTRICT unaligned) {
2217  const size_t count = CountTrue(d, mask);
2218  detail::StoreN(count, Compress(v, mask), d, unaligned);
2219  return count;
2220 }
2221 
2222 // ================================================== BLOCKWISE
2223 
2224 // ------------------------------ CombineShiftRightBytes
2225 template <size_t kBytes, class D, class V = VFromD<D>>
2226 HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
2227  const Repartition<uint8_t, decltype(d)> d8;
2228  const auto hi8 = BitCast(d8, hi);
2229  const auto lo8 = BitCast(d8, lo);
2230  const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
2231  const auto lo_down = detail::SlideDown(lo8, lo8, kBytes);
2232  const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
2233  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
2234 }
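// Example: for kBytes=4, each 128-bit block of the result consists of bytes
// 4..15 of lo followed by bytes 0..3 of hi, i.e. the same behavior as x86
// palignr within every block.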
2235 
2236 // ------------------------------ CombineShiftRightLanes
2237 template <size_t kLanes, class D, class V = VFromD<D>>
2238 HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
2239  constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
2240  const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
2241  const auto lo_down = detail::SlideDown(lo, lo, kLanes);
2242  const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
2243  return IfThenElse(is_lo, lo_down, hi_up);
2244 }
2245 
2246 // ------------------------------ Shuffle2301 (ShiftLeft)
2247 template <class V>
2248 HWY_API V Shuffle2301(const V v) {
2249  const DFromV<V> d;
2250  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2251  const Repartition<uint64_t, decltype(d)> du64;
2252  const auto v64 = BitCast(du64, v);
2253  return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
2254 }
2255 
2256 // ------------------------------ Shuffle2103
2257 template <class V>
2258 HWY_API V Shuffle2103(const V v) {
2259  const DFromV<V> d;
2260  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2261  return CombineShiftRightLanes<3>(d, v, v);
2262 }
2263 
2264 // ------------------------------ Shuffle0321
2265 template <class V>
2266 HWY_API V Shuffle0321(const V v) {
2267  const DFromV<V> d;
2268  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2269  return CombineShiftRightLanes<1>(d, v, v);
2270 }
2271 
2272 // ------------------------------ Shuffle1032
2273 template <class V>
2274 HWY_API V Shuffle1032(const V v) {
2275  const DFromV<V> d;
2276  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2277  return CombineShiftRightLanes<2>(d, v, v);
2278 }
2279 
2280 // ------------------------------ Shuffle01
2281 template <class V>
2282 HWY_API V Shuffle01(const V v) {
2283  const DFromV<V> d;
2284  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
2285  return CombineShiftRightLanes<1>(d, v, v);
2286 }
2287 
2288 // ------------------------------ Shuffle0123
2289 template <class V>
2290 HWY_API V Shuffle0123(const V v) {
2291  return Shuffle2301(Shuffle1032(v));
2292 }
2293 
2294 // ------------------------------ TableLookupBytes
2295 
2296 // Extends or truncates a vector to match the given d.
2297 namespace detail {
2298 
2299 template <typename T, size_t N, int kPow2>
2300 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 3>> v)
2301  -> VFromD<decltype(d)> {
2302  const Simd<T, N, kPow2 - 1> dh;
2303  const Simd<T, N, kPow2 - 2> dhh;
2304  return Ext(d, Ext(dh, Ext(dhh, v)));
2305 }
2306 template <typename T, size_t N, int kPow2>
2307 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 2>> v)
2308  -> VFromD<decltype(d)> {
2309  const Simd<T, N, kPow2 - 1> dh;
2310  return Ext(d, Ext(dh, v));
2311 }
2312 template <typename T, size_t N, int kPow2>
2313 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 1>> v)
2314  -> VFromD<decltype(d)> {
2315  return Ext(d, v);
2316 }
2317 
2318 template <typename T, size_t N, int kPow2>
2319 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2>> v)
2320  -> VFromD<decltype(d)> {
2321  return v;
2322 }
2323 
2324 template <typename T, size_t N, int kPow2>
2325 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 1>> v)
2326  -> VFromD<decltype(d)> {
2327  return Trunc(v);
2328 }
2329 template <typename T, size_t N, int kPow2>
2330 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 2>> v)
2331  -> VFromD<decltype(d)> {
2332  return Trunc(Trunc(v));
2333 }
2334 template <typename T, size_t N, int kPow2>
2335 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 3>> v)
2336  -> VFromD<decltype(d)> {
2337  return Trunc(Trunc(Trunc(v)));
2338 }
2339 
2340 } // namespace detail
2341 
2342 template <class VT, class VI>
2343 HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
2344  const DFromV<VT> dt; // T=table, I=index.
2345  const DFromV<VI> di;
2346  const Repartition<uint8_t, decltype(dt)> dt8;
2347  const Repartition<uint8_t, decltype(di)> di8;
2348  // Required for producing half-vectors with table lookups from a full vector.
2349  // If we instead run at the LMUL of the index vector, lookups into the table
2350  // would be truncated. Thus we run at the larger of the two LMULs and truncate
2351  // the result vector to the original index LMUL.
2352  constexpr int kPow2T = Pow2(dt8);
2353  constexpr int kPow2I = Pow2(di8);
2354  const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8; // m=max
2355  const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt));
2356  const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi));
2357  auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
2358  // If the table is shorter, wrap around offsets so they do not reference
2359  // undefined lanes in the newly extended vmt.
2360  if (kPow2T < kPow2I) {
2361  offsets = detail::AndS(offsets, Lanes(dt8) - 1);
2362  }
2363  const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
2364  return BitCast(di, detail::ChangeLMUL(di8, out));
2365 }
2366 
2367 template <class VT, class VI>
2368 HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
2369  const DFromV<VI> di;
2370  const Repartition<int8_t, decltype(di)> di8;
2371  const auto idx8 = BitCast(di8, idx);
2372  const auto lookup = TableLookupBytes(vt, idx8);
2373  return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup));
2374 }
2375 
2376 // ------------------------------ Broadcast
2377 template <int kLane, class V>
2378 HWY_API V Broadcast(const V v) {
2379  const DFromV<V> d;
2380  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
2381  auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(d));
2382  if (kLane != 0) {
2383  idx = detail::AddS(idx, kLane);
2384  }
2385  return TableLookupLanes(v, idx);
2386 }
2387 
2388 // ------------------------------ ShiftLeftLanes
2389 
2390 template <size_t kLanes, class D, class V = VFromD<D>>
2391 HWY_API V ShiftLeftLanes(const D d, const V v) {
2392  const RebindToSigned<decltype(d)> di;
2393  using TI = TFromD<decltype(di)>;
2394  const auto shifted = detail::SlideUp(v, v, kLanes);
2395  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
2396  const auto idx_mod =
2397  detail::AndS(detail::Iota0(di), detail::LanesPerBlock(di) - 1);
2398  const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
2399  return IfThenZeroElse(clear, shifted);
2400 }
2401 
2402 template <size_t kLanes, class V>
2403 HWY_API V ShiftLeftLanes(const V v) {
2404  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
2405 }
2406 
2407 // ------------------------------ ShiftLeftBytes
2408 
2409 template <int kBytes, class D>
2410 HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) {
2411  const Repartition<uint8_t, decltype(d)> d8;
2412  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
2413 }
2414 
2415 template <int kBytes, class V>
2416 HWY_API V ShiftLeftBytes(const V v) {
2417  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
2418 }
2419 
2420 // ------------------------------ ShiftRightLanes
2421 template <size_t kLanes, typename T, size_t N, int kPow2,
2422  class V = VFromD<Simd<T, N, kPow2>>>
2423 HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
2424  const RebindToSigned<decltype(d)> di;
2425  using TI = TFromD<decltype(di)>;
2426  // For partial vectors, clear upper lanes so we shift in zeros.
2427  if (N <= 16 / sizeof(T)) {
2428  v = IfThenElseZero(FirstN(d, N), v);
2429  }
2430 
2431  const auto shifted = detail::SlideDown(v, v, kLanes);
2432  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
2433  const size_t lpb = detail::LanesPerBlock(di);
2434  const auto idx_mod = detail::AndS(detail::Iota0(di), lpb - 1);
2435  const auto keep =
2436  detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
2437  return IfThenElseZero(keep, shifted);
2438 }
2439 
2440 // ------------------------------ ShiftRightBytes
2441 template <int kBytes, class D, class V = VFromD<D>>
2442 HWY_API V ShiftRightBytes(const D d, const V v) {
2443  const Repartition<uint8_t, decltype(d)> d8;
2444  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
2445 }
2446 
2447 // ------------------------------ InterleaveLower
2448 
2449 template <class D, class V>
2450 HWY_API V InterleaveLower(D d, const V a, const V b) {
2451  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2452  const RebindToUnsigned<decltype(d)> du;
2453  const auto i = detail::Iota0(du);
2454  const auto idx_mod =
2455  ShiftRight<1>(detail::AndS(i, detail::LanesPerBlock(du) - 1));
2456  const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2457  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2458  return IfThenElse(is_even, TableLookupLanes(a, idx),
2459  TableLookupLanes(b, idx));
2460 }
2461 
2462 template <class V>
2463 HWY_API V InterleaveLower(const V a, const V b) {
2464  return InterleaveLower(DFromV<V>(), a, b);
2465 }
2466 
2467 // ------------------------------ InterleaveUpper
2468 
2469 template <class D, class V>
2470 HWY_API V InterleaveUpper(const D d, const V a, const V b) {
2471  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2472  const RebindToUnsigned<decltype(d)> du;
2473  const size_t lpb = detail::LanesPerBlock(du);
2474  const auto i = detail::Iota0(du);
2475  const auto idx_mod = ShiftRight<1>(detail::AndS(i, lpb - 1));
2476  const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2477  const auto idx = detail::AddS(idx_lower, lpb / 2);
2478  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2479  return IfThenElse(is_even, TableLookupLanes(a, idx),
2480  TableLookupLanes(b, idx));
2481 }
2482 
2483 // ------------------------------ ZipLower
2484 
2485 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2486 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2487  const RepartitionToNarrow<DW> dn;
2488  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2489  return BitCast(dw, InterleaveLower(dn, a, b));
2490 }
2491 
2492 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2493 HWY_API VFromD<DW> ZipLower(V a, V b) {
2494  return BitCast(DW(), InterleaveLower(a, b));
2495 }
2496 
2497 // ------------------------------ ZipUpper
2498 template <class DW, class V>
2499 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2500  const RepartitionToNarrow<DW> dn;
2501  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2502  return BitCast(dw, InterleaveUpper(dn, a, b));
2503 }
2504 
2505 // ================================================== REDUCE
2506 
2507 // vector = f(vector, zero_m1)
2508 #define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2509  MLEN, NAME, OP) \
2510  template <class D> \
2511  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2512  NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \
2513  return Set(d, GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
2514  v0, v, v0, Lanes(d)))); \
2515  }
2516 
2517 // ------------------------------ SumOfLanes
2518 
2519 namespace detail {
2520 HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL)
2521 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL)
2522 } // namespace detail
2523 
2524 template <class D>
2525 HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) {
2526  const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1
2527  return detail::RedSum(d, v, v0);
2528 }
2529 
2530 // ------------------------------ MinOfLanes
2531 namespace detail {
2532 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL)
2533 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL)
2534 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL)
2535 } // namespace detail
2536 
2537 template <class D>
2538 HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) {
2539  using T = TFromD<D>;
2540  const ScalableTag<T> d1; // always m1
2541  const auto neutral = Set(d1, HighestValue<T>());
2542  return detail::RedMin(d, v, neutral);
2543 }
2544 
2545 // ------------------------------ MaxOfLanes
2546 namespace detail {
2547 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL)
2548 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL)
2549 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL)
2550 } // namespace detail
2551 
2552 template <class D>
2553 HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
2554  using T = TFromD<D>;
2555  const ScalableTag<T> d1; // always m1
2556  const auto neutral = Set(d1, LowestValue<T>());
2557  return detail::RedMax(d, v, neutral);
2558 }
2559 
2560 #undef HWY_RVV_REDUCE
2561 
2562 // ================================================== Ops with dependencies
2563 
2564 // ------------------------------ PopulationCount (ShiftRight)
2565 
2566 // Handles LMUL >= 2 or capped vectors, which generic_ops-inl cannot.
2567 template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
2568  hwy::EnableIf<Pow2(D()) < 1 || MaxLanes(D()) < 16>* = nullptr>
2569 HWY_API V PopulationCount(V v) {
2570  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
2571  v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55));
2572  v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33));
2573  return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F);
2574 }
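// Worked example for one byte, v = 0xB1 = 0b1011'0001 (four bits set):
//   v - ((v >> 1) & 0x55)            -> 0x61 (2-bit counts 1,2,0,1)
//   ((v >> 2) & 0x33) + (v & 0x33)   -> 0x31 (4-bit counts 3,1)
//   (v + (v >> 4)) & 0x0F            -> 0x04, the population count.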
2575 
2576 // ------------------------------ LoadDup128
2577 
2578 template <class D>
2579 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
2580  const auto loaded = Load(d, p);
2581  // Broadcast the first block
2582  const auto idx = detail::AndS(detail::Iota0(d), detail::LanesPerBlock(d) - 1);
2583  return TableLookupLanes(loaded, idx);
2584 }
2585 
2586 // ------------------------------ LoadMaskBits
2587 
2588 // Support all combinations of T and SHIFT(LMUL) without explicit overloads for
2589 // each. First overload for MLEN=1..64.
2590 namespace detail {
2591 
2592 // Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN
2593 // increases with lane size and decreases for increasing LMUL. Cap at 64, the
2594 // largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL
2595 // e.g. vuint16mf8_t: (8*2 << 3) == 128.
2596 template <class D>
2597 using MaskTag = hwy::SizeTag<HWY_MIN(
2598  64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -Pow2(D())))>;
2599 
2600 #define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2601  HWY_INLINE HWY_RVV_M(MLEN) \
2602  NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \
2603  return OP##_v_b##MLEN(bits, N); \
2604  }
2605 HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, vlm)
2604  }
2606 #undef HWY_RVV_LOAD_MASK_BITS
2607 } // namespace detail
2608 
2609 template <class D, class MT = detail::MaskTag<D>>
2610 HWY_API auto LoadMaskBits(D d, const uint8_t* bits)
2611  -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) {
2612  return detail::LoadMaskBits(MT(), bits, Lanes(d));
2613 }
2614 
2615 // ------------------------------ StoreMaskBits
2616 #define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2617  template <class D> \
2618  HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \
2619  const size_t N = Lanes(d); \
2620  OP##_v_b##MLEN(bits, m, N); \
2621  /* Non-full byte, need to clear the undefined upper bits. */ \
2622  /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \
2623  constexpr bool kLessThan8 = \
2624  detail::ScaleByPower(16 / sizeof(TFromD<D>), Pow2(d)) < 8; \
2625  if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \
2626  const int mask = (1 << N) - 1; \
2627  bits[0] = static_cast<uint8_t>(bits[0] & mask); \
2628  } \
2629  return (N + 7) / 8; \
2630  }
2631 HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, vsm)
2632 #undef HWY_RVV_STORE_MASK_BITS
2633 
2634 // ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
2635 
2636 template <class V>
2637 HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2638  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
2639 }
2640 
2641 template <class D>
2642 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2643  D d, TFromD<D>* HWY_RESTRICT unaligned) {
2644  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2645 }
2646 
2647 // ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
2648 
2649 // Disallow for 8-bit because Iota is likely to overflow.
2650 template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2651 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2652  const RebindToSigned<D> di;
2653  using TI = TFromD<decltype(di)>;
2654  return RebindMask(
2655  d, detail::LtS(BitCast(di, detail::Iota0(d)), static_cast<TI>(n)));
2656 }
2657 
2658 template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
2659 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2660  const auto zero = Zero(d);
2661  const auto one = Set(d, 1);
2662  return Eq(detail::SlideUp(one, zero, n), one);
2663 }
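// Example for n=3 and four lanes: SlideUp(one, zero, 3) = {1,1,1,0}, and
// Eq(..., one) yields the mask {1,1,1,0} without materializing lane indices
// that could wrap in 8 bits.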
2664 
2665 // ------------------------------ Neg (Sub)
2666 
2667 template <class V, HWY_IF_SIGNED_V(V)>
2668 HWY_API V Neg(const V v) {
2669  return detail::ReverseSubS(v, 0);
2670 }
2671 
2672 // vector = f(vector), but argument is repeated
2673 #define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2674  SHIFT, MLEN, NAME, OP) \
2675  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2676  return v##OP##_vv_##CHAR##SEW##LMUL(v, v, HWY_RVV_AVL(SEW, SHIFT)); \
2677  }
2678 
2679 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
2680 
2681 // ------------------------------ Abs (Max, Neg)
2682 
2683 template <class V, HWY_IF_SIGNED_V(V)>
2684 HWY_API V Abs(const V v) {
2685  return Max(v, Neg(v));
2686 }
2687 
2688 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)
2689 
2690 #undef HWY_RVV_RETV_ARGV2
2691 
2692 // ------------------------------ AbsDiff (Abs, Sub)
2693 template <class V>
2694 HWY_API V AbsDiff(const V a, const V b) {
2695  return Abs(Sub(a, b));
2696 }
2697 
2698 // ------------------------------ Round (NearestInt, ConvertTo, CopySign)
2699 
2700 // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
2701 // a dedicated instruction for that. Rounding to integer and converting back to
2702 // float is correct except when the input magnitude is large, in which case the
2703 // input was already an integer (exponent >= mantissa bits; no fractional part).
2704 
2705 namespace detail {
2706 enum RoundingModes { kNear, kTrunc, kDown, kUp };
2707 
2708 template <class V>
2709 HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
2710  return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
2711 }
2712 
2713 } // namespace detail
2714 
2715 template <class V>
2716 HWY_API V Round(const V v) {
2717  const DFromV<V> df;
2718 
2719  const auto integer = NearestInt(v); // round using current mode
2720  const auto int_f = ConvertTo(df, integer);
2721 
2722  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
2723 }
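// Example: Round(2.5f) converts with NearestInt under the default RNE mode,
// yielding 2, then back to 2.0f (ties go to even). For inputs whose magnitude
// reaches MantissaEnd, e.g. 1e20f, UseInt is false and v is returned as-is;
// such values are already integral.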
2724 
2725 // ------------------------------ Trunc (ConvertTo)
2726 template <class V>
2727 HWY_API V Trunc(const V v) {
2728  const DFromV<V> df;
2729  const RebindToSigned<decltype(df)> di;
2730 
2731  const auto integer = ConvertTo(di, v); // round toward 0
2732  const auto int_f = ConvertTo(df, integer);
2733 
2734  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
2735 }
2736 
2737 // ------------------------------ Ceil
2738 template <class V>
2739 HWY_API V Ceil(const V v) {
2740  asm volatile("fsrm %0" ::"r"(detail::kUp));
2741  const auto ret = Round(v);
2742  asm volatile("fsrm %0" ::"r"(detail::kNear));
2743  return ret;
2744 }
2745 
2746 // ------------------------------ Floor
2747 template <class V>
2748 HWY_API V Floor(const V v) {
2749  asm volatile("fsrm %0" ::"r"(detail::kDown));
2750  const auto ret = Round(v);
2751  asm volatile("fsrm %0" ::"r"(detail::kNear));
2752  return ret;
2753 }
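// Note: Ceil and Floor swap the dynamic rounding mode via the fsrm
// instruction. The detail::RoundingModes enumerators match the RISC-V frm
// encodings (RNE=0, RTZ=1, RDN=2, RUP=3), so writing kUp/kDown selects
// round-up/round-down, and writing kNear restores round-to-nearest-even
// (assuming RNE was the prevailing mode).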
2754 
2755 // ------------------------------ Floating-point classification (Ne)
2756 
2757 // vfclass does not help because it would require 3 instructions (to AND and
2758 // then compare the bits), whereas these are just 1-3 integer instructions.
2759 
2760 template <class V>
2761 HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
2762  return Ne(v, v);
2763 }
2764 
2765 template <class V, class D = DFromV<V>>
2766 HWY_API MFromD<D> IsInf(const V v) {
2767  const D d;
2768  const RebindToSigned<decltype(d)> di;
2769  using T = TFromD<D>;
2770  const VFromD<decltype(di)> vi = BitCast(di, v);
2771  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
2772  return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
2773 }
2774 
2775 // Returns whether normal/subnormal/zero.
2776 template <class V, class D = DFromV<V>>
2777 HWY_API MFromD<D> IsFinite(const V v) {
2778  const D d;
2779  const RebindToUnsigned<decltype(d)> du;
2780  const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
2781  using T = TFromD<D>;
2782  const VFromD<decltype(du)> vu = BitCast(du, v);
2783  // 'Shift left' to clear the sign bit, then right so we can compare with the
2784  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
2785  // negative and non-negative floats would be greater).
2786  const VFromD<decltype(di)> exp =
2787  BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
2788  return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
2789 }
2790 
2791 // ------------------------------ Iota (ConvertTo)
2792 
2793 template <class D, HWY_IF_UNSIGNED_D(D)>
2794 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2795  return detail::AddS(detail::Iota0(d), first);
2796 }
2797 
2798 template <class D, HWY_IF_SIGNED_D(D)>
2799 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2800  const RebindToUnsigned<D> du;
2801  return detail::AddS(BitCast(d, detail::Iota0(du)), first);
2802 }
2803 
2804 template <class D, HWY_IF_FLOAT_D(D)>
2805 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2806  const RebindToUnsigned<D> du;
2807  const RebindToSigned<D> di;
2808  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
2809 }
2810 
2811 // ------------------------------ MulEven/Odd (Mul, OddEven)
2812 
2813 template <class V, HWY_IF_LANE_SIZE_V(V, 4), class D = DFromV<V>,
2814  class DW = RepartitionToWide<D>>
2815 HWY_API VFromD<DW> MulEven(const V a, const V b) {
2816  const auto lo = Mul(a, b);
2817  const auto hi = detail::MulHigh(a, b);
2818  return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
2819 }
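// Example: for u32 vectors a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}, lo holds
// the low 32 bits and hi the high 32 bits of each lane product. Slide1Up
// moves hi into the odd lanes, so OddEven produces {lo0,hi0,lo2,hi2}; BitCast
// then reads each adjacent pair as one little-endian u64 product a0*b0, a2*b2.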
2820 
2821 // There is no 64x64 vwmul.
2822 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2823 HWY_INLINE V MulEven(const V a, const V b) {
2824  const auto lo = detail::Mul(a, b);
2825  const auto hi = detail::MulHigh(a, b);
2826  return OddEven(detail::Slide1Up(hi), lo);
2827 }
2828 
2829 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2830 HWY_INLINE V MulOdd(const V a, const V b) {
2831  const auto lo = detail::Mul(a, b);
2832  const auto hi = detail::MulHigh(a, b);
2833  return OddEven(hi, detail::Slide1Down(lo));
2834 }
2835 
2836 // ------------------------------ ReorderDemote2To (OddEven)
2837 
2838 template <size_t N, int kPow2>
2839 HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
2840  Simd<bfloat16_t, N, kPow2> dbf16,
2841  VFromD<RepartitionToWide<decltype(dbf16)>> a,
2842  VFromD<RepartitionToWide<decltype(dbf16)>> b) {
2843  const RebindToUnsigned<decltype(dbf16)> du16;
2844  const RebindToUnsigned<DFromV<decltype(a)>> du32;
2845  const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
2846  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2847 }
2848 
2849 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2850 
2851 template <class DF>
2852 using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
2853 
2854 template <size_t N, int kPow2>
2855 HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
2856  VFromD<DU16FromDF<decltype(df32)>> a,
2857  VFromD<DU16FromDF<decltype(df32)>> b,
2858  const VFromD<decltype(df32)> sum0,
2859  VFromD<decltype(df32)>& sum1)
2860  -> VFromD<decltype(df32)> {
2861  const DU16FromDF<decltype(df32)> du16;
2862  const RebindToUnsigned<decltype(df32)> du32;
2863  using VU32 = VFromD<decltype(du32)>;
2864  const VFromD<decltype(du16)> zero = Zero(du16);
2865  const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
2866  const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
2867  const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
2868  const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
2869  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2870  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2871 }
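// The ZipLower/ZipUpper pairs above promote bf16 to f32 without a convert
// instruction: interleaving a zero u16 into the low half of each u32 places
// the bf16 bits in the upper 16 bits, which is exactly the binary32 bit
// pattern. The 'Reorder' in the name signals that products land in a permuted
// lane order, which is fine for dot products that reduce the sums afterwards.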
2872 
2873 // ------------------------------ Lt128
2874 template <class D>
2875 HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
2876  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2877  // Truth table of Eq and Compare for Hi and Lo u64.
2878  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
2879  // =H =L cH cL | out = cH | (=H & cL)
2880  // 0 0 0 0 | 0
2881  // 0 0 0 1 | 0
2882  // 0 0 1 0 | 1
2883  // 0 0 1 1 | 1
2884  // 0 1 0 0 | 0
2885  // 0 1 0 1 | 0
2886  // 0 1 1 0 | 1
2887  // 1 0 0 0 | 0
2888  // 1 0 0 1 | 1
2889  // 1 1 0 0 | 0
2890  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
2891  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
2892  // Shift leftward so L can influence H.
2893  const VFromD<D> ltLx = detail::Slide1Up(ltHL);
2894  const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
2895  // Replicate H to its neighbor.
2896  return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
2897 }
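// Worked example (lane 0 = low u64, lane 1 = high u64 of one 128-bit value):
// for a = (hi 5, lo 1) and b = (hi 5, lo 2), eqHL = {0,1} and ltHL = {1,0}.
// Slide1Up lifts cL into the H position: ltLx = {0,1}. Then
// vecHx = ltHL | (eqHL & ltLx) = {1,1} in the H lane, and OddEven with
// Slide1Down replicates H to both lanes, so Lt128 returns true (a < b).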
2898 
2899 // ------------------------------ Lt128Upper
2900 template <class D>
2901 HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
2902  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2903  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
2904  // Replicate H to its neighbor.
2905  return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
2906 }
2907 
2908 // ------------------------------ Min128, Max128 (Lt128)
2909 
2910 template <class D>
2911 HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
2912  const VFromD<D> aXH = detail::Slide1Down(a);
2913  const VFromD<D> bXH = detail::Slide1Down(b);
2914  const VFromD<D> minHL = Min(a, b);
2915  const MFromD<D> ltXH = Lt(aXH, bXH);
2916  const MFromD<D> eqXH = Eq(aXH, bXH);
2917  // If the upper lane is the decider, take lo from the same reg.
2918  const VFromD<D> lo = IfThenElse(ltXH, a, b);
2919  // The upper lane is just minHL; if they are equal, we also need to use the
2920  // actual min of the lower lanes.
2921  return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
2922 }
2923 
2924 template <class D>
2925 HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
2926  const VFromD<D> aXH = detail::Slide1Down(a);
2927  const VFromD<D> bXH = detail::Slide1Down(b);
2928  const VFromD<D> maxHL = Max(a, b);
2929  const MFromD<D> ltXH = Lt(aXH, bXH);
2930  const MFromD<D> eqXH = Eq(aXH, bXH);
2931  // If the upper lane is the decider, take lo from the same reg.
2932  const VFromD<D> lo = IfThenElse(ltXH, b, a);
2933  // The upper lane is just maxHL; if they are equal, we also need to use the
2934  // actual max of the lower lanes.
2935  return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
2936 }
2937 
2938 template <class D>
2939 HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
2940  return IfThenElse(Lt128Upper(d, a, b), a, b);
2941 }
2942 
2943 template <class D>
2944 HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
2945  return IfThenElse(Lt128Upper(d, b, a), a, b);
2946 }
2947 
2948 // ================================================== END MACROS
2949 namespace detail { // for code folding
2950 #undef HWY_RVV_AVL
2951 #undef HWY_RVV_D
2952 #undef HWY_RVV_FOREACH
2953 #undef HWY_RVV_FOREACH_08_ALL
2954 #undef HWY_RVV_FOREACH_08_ALL_VIRT
2955 #undef HWY_RVV_FOREACH_08_DEMOTE
2956 #undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
2957 #undef HWY_RVV_FOREACH_08_EXT
2958 #undef HWY_RVV_FOREACH_08_EXT_VIRT
2959 #undef HWY_RVV_FOREACH_08_TRUNC
2960 #undef HWY_RVV_FOREACH_08_VIRT
2961 #undef HWY_RVV_FOREACH_16_ALL
2962 #undef HWY_RVV_FOREACH_16_ALL_VIRT
2963 #undef HWY_RVV_FOREACH_16_DEMOTE
2964 #undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
2965 #undef HWY_RVV_FOREACH_16_EXT
2966 #undef HWY_RVV_FOREACH_16_EXT_VIRT
2967 #undef HWY_RVV_FOREACH_16_TRUNC
2968 #undef HWY_RVV_FOREACH_16_VIRT
2969 #undef HWY_RVV_FOREACH_32_ALL
2970 #undef HWY_RVV_FOREACH_32_ALL_VIRT
2971 #undef HWY_RVV_FOREACH_32_DEMOTE
2972 #undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
2973 #undef HWY_RVV_FOREACH_32_EXT
2974 #undef HWY_RVV_FOREACH_32_EXT_VIRT
2975 #undef HWY_RVV_FOREACH_32_TRUNC
2976 #undef HWY_RVV_FOREACH_32_VIRT
2977 #undef HWY_RVV_FOREACH_64_ALL
2978 #undef HWY_RVV_FOREACH_64_ALL_VIRT
2979 #undef HWY_RVV_FOREACH_64_DEMOTE
2980 #undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
2981 #undef HWY_RVV_FOREACH_64_EXT
2982 #undef HWY_RVV_FOREACH_64_EXT_VIRT
2983 #undef HWY_RVV_FOREACH_64_TRUNC
2984 #undef HWY_RVV_FOREACH_64_VIRT
2985 #undef HWY_RVV_FOREACH_B
2986 #undef HWY_RVV_FOREACH_F
2987 #undef HWY_RVV_FOREACH_F16
2988 #undef HWY_RVV_FOREACH_F32
2989 #undef HWY_RVV_FOREACH_F3264
2990 #undef HWY_RVV_FOREACH_F64
2991 #undef HWY_RVV_FOREACH_I
2992 #undef HWY_RVV_FOREACH_I08
2993 #undef HWY_RVV_FOREACH_I16
2994 #undef HWY_RVV_FOREACH_I163264
2995 #undef HWY_RVV_FOREACH_I32
2996 #undef HWY_RVV_FOREACH_I64
2997 #undef HWY_RVV_FOREACH_U
2998 #undef HWY_RVV_FOREACH_U08
2999 #undef HWY_RVV_FOREACH_U16
3000 #undef HWY_RVV_FOREACH_U163264
3001 #undef HWY_RVV_FOREACH_U32
3002 #undef HWY_RVV_FOREACH_U64
3003 #undef HWY_RVV_FOREACH_UI
3004 #undef HWY_RVV_FOREACH_UI08
3005 #undef HWY_RVV_FOREACH_UI16
3006 #undef HWY_RVV_FOREACH_UI163264
3007 #undef HWY_RVV_FOREACH_UI32
3008 #undef HWY_RVV_FOREACH_UI3264
3009 #undef HWY_RVV_FOREACH_UI64
3010 #undef HWY_RVV_M
3011 #undef HWY_RVV_RETM_ARGM
3012 #undef HWY_RVV_RETV_ARGV
3013 #undef HWY_RVV_RETV_ARGVS
3014 #undef HWY_RVV_RETV_ARGVV
3015 #undef HWY_RVV_T
3016 #undef HWY_RVV_V
3017 } // namespace detail
3018 // NOLINTNEXTLINE(google-readability-namespace-comments)
3019 } // namespace HWY_NAMESPACE
3020 } // namespace hwy