Grok  10.0.3
detect_targets.h
Go to the documentation of this file.
1 // Copyright 2021 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
17 #define HIGHWAY_HWY_DETECT_TARGETS_H_
18 
19 // Defines targets and chooses which to enable.
20 
22 
23 //------------------------------------------------------------------------------
24 // Optional configuration
25 
26 // See ../quick_reference.md for documentation of these macros.
27 
28 // Uncomment to override the default baseline determined from predefined macros:
29 // #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
30 
31 // Uncomment to override the default blocklist:
32 // #define HWY_BROKEN_TARGETS HWY_AVX3
33 
34 // Uncomment to definitely avoid generating those target(s):
35 // #define HWY_DISABLED_TARGETS HWY_SSE4
36 
37 // Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
38 // AVX2 target for VMs which support AVX2 but not the other instruction sets)
39 // #define HWY_DISABLE_BMI2_FMA
40 
41 // Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
42 // #define HWY_WANT_SSSE3
43 // #define HWY_WANT_SSE4
44 
45 //------------------------------------------------------------------------------
46 // Targets
47 
48 // Unique bit value for each target. A lower value is "better" (e.g. more lanes)
49 // than a higher value within the same group/platform - see HWY_STATIC_TARGET.
50 //
51 // All values are unconditionally defined so we can test HWY_TARGETS without
52 // first checking the HWY_ARCH_*.
53 //
54 // The C99 preprocessor evaluates #if expressions using intmax_t types, so we
55 // can use 32-bit literals.
56 
57 // 1,2,4: reserved
58 
59 // Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
60 // VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
61 // Tiger Lake? We do not yet have uses for GFNI.
62 #define HWY_AVX3_DL 8 // see HWY_WANT_AVX3_DL below
63 #define HWY_AVX3 16
64 #define HWY_AVX2 32
65 // 64: reserved for AVX
66 #define HWY_SSE4 128
67 #define HWY_SSSE3 256
68 // 512: reserved for SSE3 or SSE2
69 
70 // The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
71 // dynamic dispatch. All x86 target bits must be lower or equal to
72 // (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
73 // HWY_MAX_DYNAMIC_TARGETS in total.
74 #define HWY_HIGHEST_TARGET_BIT_X86 9 // 1 << 9 == 512, the reserved slot above
75 
76 // 0x400, 0x800: reserved
77 #define HWY_SVE2_128 0x1000 // specialized target (e.g. Arm N2)
78 #define HWY_SVE_256 0x2000 // specialized target (e.g. Arm V1)
79 #define HWY_SVE2 0x4000
80 #define HWY_SVE 0x8000
81 // 0x10000 reserved for Helium
82 #define HWY_NEON 0x20000
83 
84 #define HWY_HIGHEST_TARGET_BIT_ARM 17 // 1 << 17 == 0x20000 (HWY_NEON)
85 
86 // 0x40000 reserved
87 #define HWY_PPC8 0x80000 // v2.07 or 3
88 // 0x100000 reserved for prior VSX/AltiVec
89 
90 #define HWY_HIGHEST_TARGET_BIT_PPC 20 // 1 << 20 == 0x100000 (reserved slot)
91 
92 // 0x200000, 0x400000 reserved
93 #define HWY_WASM_EMU256 0x800000 // Experimental
94 #define HWY_WASM 0x1000000
95 
96 #define HWY_HIGHEST_TARGET_BIT_WASM 24 // 1 << 24 == 0x1000000 (HWY_WASM)
97 
98 // 0x2000000, 0x4000000, 0x8000000 reserved
99 #define HWY_RVV 0x10000000
100 
101 #define HWY_HIGHEST_TARGET_BIT_RVV 28 // 1 << 28 == 0x10000000 (HWY_RVV)
102 
103 #define HWY_EMU128 0x20000000
104 #define HWY_SCALAR 0x40000000
105 
106 #define HWY_HIGHEST_TARGET_BIT_SCALAR 30 // 1 << 30 == 0x40000000 (HWY_SCALAR)
107 
108 // Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
109 
110 //------------------------------------------------------------------------------
111 // Set default blocklists
112 
113 // "Disabled" targets are those excluded at the user's request. Keeping this in
114 // its own macro lets users disable targets without replacing the blocklist below.
115 #if !defined(HWY_DISABLED_TARGETS)
116 #define HWY_DISABLED_TARGETS 0
117 #endif // !defined(HWY_DISABLED_TARGETS)
118 
119 // Broken means excluded from enabled due to known compiler issues. Allow the
120 // user to override this blocklist without any guarantee of success.
121 #ifndef HWY_BROKEN_TARGETS
122 // Only the first matching branch of the #if/#elif chain below takes effect.
123 // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
124 // SSE4 codegen (possibly only for msan), so disable all those targets.
125 #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
126 #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
127 // This entails a major speed reduction, so warn unless the user explicitly
128 // opts in to scalar-only.
129 #if !defined(HWY_COMPILE_ONLY_SCALAR)
130 #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
131 #endif
132 
133 // 32-bit may fail to compile AVX2/3.
134 #elif HWY_ARCH_X86_32
135 #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
136 
137 // MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
138 #elif HWY_COMPILER_MSVC != 0
139 #define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
140 
141 // armv7be has not been tested and is not yet supported.
142 #elif HWY_ARCH_ARM_V7 && \
143  (defined(__ARM_BIG_ENDIAN) || \
144  (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
145 #define HWY_BROKEN_TARGETS (HWY_NEON)
146 
147 // SVE[2] require recent clang or gcc versions. This branch tests only the
148 #elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
149 (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
150 #define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
151 // compiler version (not the architecture) and is reached only if none above matched.
152 #else
153 #define HWY_BROKEN_TARGETS 0 // no known compiler issue matched
154 #endif
155 
156 #endif // HWY_BROKEN_TARGETS
157 
158 // A target is enabled when it is neither user-disabled nor blocklisted.
159 #define HWY_ENABLED(targets) \
160  ((targets) & ~(HWY_DISABLED_TARGETS) & ~(HWY_BROKEN_TARGETS))
161 
162 //------------------------------------------------------------------------------
163 // Detect baseline targets using predefined macros
164 
165 // Baseline means the targets for which the compiler is allowed to generate
166 // instructions, implying the target CPU would have to support them. This does
167 // not take the blocklist into account.
168 
169 #if !defined(HWY_COMPILE_ONLY_SCALAR)
170 #define HWY_BASELINE_SCALAR HWY_EMU128
171 #else // scalar-only build requested by the user
172 #define HWY_BASELINE_SCALAR HWY_SCALAR
173 #endif
174 
175 // Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
176 // HWY_TARGET == HWY_BASELINE_SCALAR.
177 
178 // WASM is baseline only if simd128 is available; HWY_WANT_WASM2 then selects
179 // HWY_WASM_EMU256 instead of HWY_WASM.
180 #if !(HWY_ARCH_WASM && defined(__wasm_simd128__))
181 #define HWY_BASELINE_WASM 0
182 #elif defined(HWY_WANT_WASM2)
183 #define HWY_BASELINE_WASM HWY_WASM_EMU256
184 #else
185 #define HWY_BASELINE_WASM HWY_WASM
186 #endif
187 
188 // No PPC implementation exists yet, hence the trailing "&& 0" in the condition.
189 #define HWY_BASELINE_PPC8 0
190 #if HWY_ARCH_PPC && defined(__VSX__) && 0
191 #undef HWY_BASELINE_PPC8
192 #define HWY_BASELINE_PPC8 HWY_PPC8
193 #endif
194 
195 #if !HWY_ARCH_ARM || !defined(__ARM_FEATURE_SVE2)
196 #define HWY_BASELINE_SVE2 0
197 #else
198 #define HWY_BASELINE_SVE2 HWY_SVE2
199 #endif
200 
201 // Baseline targets can be used unconditionally. That does not hold for
202 // HWY_SVE_256, which requires a vector size of 256 bits; putting it in the
203 // baseline would also disable all 'worse' targets (including SVE and SVE2) in
204 // non-test builds. HWY_SVE_256 is therefore added to HWY_ATTAINABLE_TARGETS
205 // below instead of being handled here.
206 #if !HWY_ARCH_ARM || !defined(__ARM_FEATURE_SVE)
207 #define HWY_BASELINE_SVE 0
208 #else
209 #define HWY_BASELINE_SVE HWY_SVE
210 #endif
211 
212 // Accept either spelling: GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
213 #define HWY_BASELINE_NEON 0
214 #if HWY_ARCH_ARM && (defined(__ARM_NEON) || defined(__ARM_NEON__))
215 #undef HWY_BASELINE_NEON
216 #define HWY_BASELINE_NEON HWY_NEON
217 #endif
218 
219 // MSVC (when not clang-cl) has fewer predefined macros and is handled separately:
220 #if !HWY_COMPILER_MSVC || HWY_COMPILER_CLANG
221 
222 #ifdef __SSSE3__
223 #define HWY_CHECK_SSSE3 1
224 #else
225 #define HWY_CHECK_SSSE3 0
226 #endif
227 
228 #if !defined(__SSE4_1__) || !defined(__SSE4_2__)
229 #define HWY_CHECK_SSE4 0
230 #else
231 #define HWY_CHECK_SSE4 1
232 #endif
233 
234 // A feature the user disabled must not gate the availability of SSE4/AVX2.
235 #if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__AES__) && defined(__PCLMUL__))
236 #define HWY_CHECK_PCLMUL_AES 1
237 #else
238 #define HWY_CHECK_PCLMUL_AES 0
239 #endif
240 
241 #if defined(HWY_DISABLE_BMI2_FMA) || (defined(__FMA__) && defined(__BMI2__))
242 #define HWY_CHECK_BMI2_FMA 1
243 #else
244 #define HWY_CHECK_BMI2_FMA 0
245 #endif
246 
247 #if defined(__F16C__) || defined(HWY_DISABLE_F16C)
248 #define HWY_CHECK_F16C 1
249 #else
250 #define HWY_CHECK_F16C 0
251 #endif
252 
253 #else // MSVC
254 
255 // 1) SSSE3/SSE4 can only be assumed enabled if AVX is:
256 // https://stackoverflow.com/questions/18563978/.
257 #ifdef __AVX__
258 #define HWY_CHECK_SSSE3 1
259 #define HWY_CHECK_SSE4 1
260 #else
261 #define HWY_CHECK_SSSE3 0
262 #define HWY_CHECK_SSE4 0
263 #endif
264 
265 // 2) PCLMUL/AES and BMI2/FMA/F16C cannot be checked individually; assume
266 // PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
267 #define HWY_CHECK_PCLMUL_AES 1
268 #define HWY_CHECK_BMI2_FMA 1
269 #define HWY_CHECK_F16C 1
270 
271 #endif // MSVC
272 // HWY_WANT_SSSE3/SSE4 are opt-ins that may be defined without a value, so use defined().
273 #if HWY_ARCH_X86 && (defined(HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
274 #define HWY_BASELINE_SSSE3 HWY_SSSE3
275 #else
276 #define HWY_BASELINE_SSSE3 0
277 #endif
278 // Without the opt-in, SSE4 is baseline only if PCLMUL/AES also pass their check.
279 #if HWY_ARCH_X86 && \
280  (defined(HWY_WANT_SSE4) || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
281 #define HWY_BASELINE_SSE4 HWY_SSE4
282 #else
283 #define HWY_BASELINE_SSE4 0
284 #endif
285 #if HWY_BASELINE_SSE4 == 0 || !HWY_CHECK_BMI2_FMA || !HWY_CHECK_F16C || \
286  !defined(__AVX2__)
287 #define HWY_BASELINE_AVX2 0
288 #else // SSE4 baseline, BMI2/FMA and F16C checks pass, and __AVX2__ is set
289 #define HWY_BASELINE_AVX2 HWY_AVX2
290 #endif
291 
292 // AVX3 needs everything in AVX2 plus these AVX-512 flags (also set by MSVC).
293 #define HWY_BASELINE_AVX3 0
294 #if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
295  defined(__AVX512DQ__) && defined(__AVX512VL__)
296 #undef HWY_BASELINE_AVX3
297 #define HWY_BASELINE_AVX3 HWY_AVX3
298 #endif
299 // Ice Lake provides AVX-512 VNNI (__AVX512VNNI__), not VEX-encoded __AVXVNNI__.
300 // TODO(janwas): not yet known whether these will be set by MSVC
301 #if HWY_BASELINE_AVX3 != 0 && defined(__AVX512VNNI__) && \
302  defined(__VAES__) && defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
303  defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
304  defined(__AVX512BITALG__)
305 #define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
306 #else
307 #define HWY_BASELINE_AVX3_DL 0
308 #endif
309 
310 #if !(HWY_ARCH_RVV && defined(__riscv_vector))
311 #define HWY_BASELINE_RVV 0
312 #else
313 #define HWY_BASELINE_RVV HWY_RVV
314 #endif
315 
316 // Users may predefine this to override detection, without guarantee of success.
317 #if !defined(HWY_BASELINE_TARGETS)
318 #define HWY_BASELINE_TARGETS \
319  (HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3 | HWY_BASELINE_AVX2 | \
320  HWY_BASELINE_SSE4 | HWY_BASELINE_SSSE3 | HWY_BASELINE_SVE2 | \
321  HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_PPC8 | \
322  HWY_BASELINE_WASM | HWY_BASELINE_RVV | HWY_BASELINE_SCALAR)
323 #endif // !defined(HWY_BASELINE_TARGETS)
324 
325 //------------------------------------------------------------------------------
326 // Choose target for static dispatch
327 
328 #define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
329 #if !(HWY_ENABLED_BASELINE) // nothing left after disabling/blocklisting
330 #error "At least one baseline target must be defined and enabled"
331 #endif // !(HWY_ENABLED_BASELINE)
332 
333 // Best baseline, used for static dispatch. This is the least-significant 1-bit
334 // within HWY_ENABLED_BASELINE (x & -x isolates it); lower bit values imply "better".
335 #define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
336 
337 // Start by assuming static dispatch. If we later use dynamic dispatch, this
338 // will be defined to other targets during the multiple-inclusion, and finally
339 // return to the initial value. Defining this outside begin/end_target ensures
340 // inl headers successfully compile by themselves (required by Bazel).
341 #define HWY_TARGET HWY_STATIC_TARGET // expands lazily to the current HWY_STATIC_TARGET
342 
343 //------------------------------------------------------------------------------
344 // Choose targets for dynamic dispatch according to one of four policies
345 
346 #if defined(HWY_COMPILE_ONLY_SCALAR) && defined(HWY_COMPILE_ONLY_STATIC)
347 #error "Defined both HWY_COMPILE_ONLY_{SCALAR|STATIC} - bug?"
348 #endif // the two HWY_COMPILE_ONLY_* policies are mutually exclusive
349 // Defining either HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
350 
351 // AVX3_DL is not widely available yet. To reduce code size and compile time,
352 // only include it in the set of attainable targets (for dynamic dispatch) if
353 // the user opts in, OR it is in the baseline (we check whether enabled below).
354 #if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE & HWY_AVX3_DL)
355 #define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
356 #else
357 #define HWY_ATTAINABLE_AVX3_DL 0
358 #endif
359 
360 #if !HWY_ARCH_ARM_A64 || !(HWY_ENABLED_BASELINE & HWY_SVE)
361 #define HWY_ATTAINABLE_SVE_256 0
362 #else // A64 with SVE in the enabled baseline
363 #define HWY_ATTAINABLE_SVE_256 HWY_ENABLED(HWY_SVE_256)
364 #endif
365 
366 #define HWY_ATTAINABLE_SVE2_128 0
367 #if HWY_ARCH_ARM_A64 && (HWY_ENABLED_BASELINE & HWY_SVE2)
368 #undef HWY_ATTAINABLE_SVE2_128
369 #define HWY_ATTAINABLE_SVE2_128 HWY_ENABLED(HWY_SVE2_128)
370 #endif
371 
372 // Attainable: enabled targets for which the compiler allows intrinsics (even
373 // when not allowed to autovectorize). Used by policies 3 and 4.
374 #if !HWY_ARCH_X86
375 #define HWY_ATTAINABLE_TARGETS \
376  (HWY_ATTAINABLE_SVE2_128 | HWY_ATTAINABLE_SVE_256 | HWY_ENABLED_BASELINE)
377 #else
378 #define HWY_ATTAINABLE_TARGETS \
379  HWY_ENABLED(HWY_ATTAINABLE_AVX3_DL | HWY_AVX3 | HWY_AVX2 | HWY_SSE4 | \
380  HWY_SSSE3 | HWY_BASELINE_SCALAR)
381 #endif
382 
383 // 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
384 // to ~HWY_SCALAR, but this is more explicit).
385 #if defined(HWY_COMPILE_ONLY_SCALAR)
386 #undef HWY_STATIC_TARGET // HWY_TARGET expands lazily, so it follows this override
387 #define HWY_STATIC_TARGET HWY_SCALAR // override baseline
388 #define HWY_TARGETS HWY_SCALAR
389 
390 // 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
391 #elif defined(HWY_COMPILE_ONLY_STATIC)
392 #define HWY_TARGETS HWY_STATIC_TARGET
393 
394 // 3) For tests: include all attainable targets (in particular: scalar)
395 #elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
396 #define HWY_TARGETS HWY_ATTAINABLE_TARGETS
397 
398 // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
399 // excluding superseded targets, in particular scalar.
400 #else
401 #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
402 // (2 * bit - 1) keeps the static target's bit and every lower ("better") bit.
403 #endif // target policy
404 
405 // The static target must be one of the dynamic targets: HWY_ONCE and the
406 // multiple-inclusion mechanism rely on it. This also implies HWY_TARGETS != 0
407 // and (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
408 #if !(HWY_TARGETS & HWY_STATIC_TARGET)
409 #error "Logic error: best baseline should be included in dynamic targets"
410 #endif
411 
412 #endif // HIGHWAY_HWY_DETECT_TARGETS_H_