Grok  10.0.3
cache_control.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
17 #define HIGHWAY_HWY_CACHE_CONTROL_H_
18 
19 #include <stddef.h>
20 #include <stdint.h>
21 
22 #include "hwy/base.h"
23 
24 // Requires SSE2; fails to compile on 32-bit Clang 7 (see
25 // https://github.com/gperftools/gperftools/issues/946).
26 #if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
27 #undef HWY_DISABLE_CACHE_CONTROL
28 #define HWY_DISABLE_CACHE_CONTROL
29 #endif
30 
31 // intrin.h is sufficient on MSVC and already included by base.h.
32 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
33 #include <emmintrin.h> // SSE2
34 #endif
35 
36 // Windows.h #defines LoadFence, which causes infinite recursion. Temporarily
37 // undefine it in this header; the function is anyway deprecated.
38 // TODO(janwas): remove when these functions are removed.
39 #pragma push_macro("LoadFence")
40 #undef LoadFence
41 
42 namespace hwy {
43 
// Granularity, in bytes, of `Stream` writes: even if N*sizeof(T) is smaller,
// Stream may write a multiple of this size.
#define HWY_STREAM_MULTIPLE 16
46 
// The following functions may also require an attribute. On x86 with a
// non-MSVC compiler, and unless cache control is disabled, HWY_ATTR_CACHE
// expands to a function-level SSE2 target attribute; in every other
// configuration it expands to nothing.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#else
#define HWY_ATTR_CACHE
#endif
53 
54 // Delays subsequent loads until prior loads are visible. On Intel CPUs, also
55 // serves as a full fence (waits for all prior instructions to complete).
56 // No effect on non-x86.
57 // DEPRECATED due to differing behavior across architectures AND vendors.
59 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
60  _mm_lfence();
61 #endif
62 }
63 
64 // Ensures values written by previous `Stream` calls are visible on the current
65 // core. This is NOT sufficient for synchronizing across cores; when `Stream`
66 // outputs are to be consumed by other core(s), the producer must publish
67 // availability (e.g. via mutex or atomic_flag) after `FlushStream`.
69 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
70  _mm_sfence();
71 #endif
72 }
73 
74 // Optionally begins loading the cache line containing "p" to reduce latency of
75 // subsequent actual loads.
76 template <typename T>
78 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
79  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
80 #elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
81  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
82  // desirable, so use the default 3 (keep in caches).
83  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
84 #else
85  (void)p;
86 #endif
87 }
88 
89 // Invalidates and flushes the cache line containing "p", if possible.
91 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
92  _mm_clflush(p);
93 #else
94  (void)p;
95 #endif
96 }
97 
98 // When called inside a spin-loop, may reduce power consumption.
100 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
101  _mm_pause();
102 #endif
103 }
104 
105 } // namespace hwy
106 
107 // TODO(janwas): remove when these functions are removed. (See above.)
108 #pragma pop_macro("LoadFence")
109 
110 #endif // HIGHWAY_HWY_CACHE_CONTROL_H_
#define HWY_INLINE
Definition: base.h:62
#define HWY_ATTR_CACHE
Definition: cache_control.h:51
Definition: aligned_allocator.h:27
HWY_INLINE HWY_ATTR_CACHE void FlushStream()
Definition: cache_control.h:68
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T *p)
Definition: cache_control.h:77
HWY_INLINE HWY_ATTR_CACHE void Pause()
Definition: cache_control.h:99
HWY_INLINE HWY_ATTR_CACHE void LoadFence()
Definition: cache_control.h:58
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void *p)
Definition: cache_control.h:90