Difference between revisions of "cpp/thread/hardware destructive interference size"
From cppreference.com
Andreas Krug (Talk | contribs) m (fmt) |
|||
(2 intermediate revisions by 2 users not shown) | |||
Line 94: | Line 94: | ||
void oneCacheLinerThread() | void oneCacheLinerThread() | ||
{ | { | ||
− | const auto start { now() }; | + | const auto start{now()}; |
for (uint64_t count{}; count != max_write_iterations; ++count) | for (uint64_t count{}; count != max_write_iterations; ++count) | ||
Line 102: | Line 102: | ||
oneCacheLiner.y.fetch_add(1, std::memory_order_relaxed); | oneCacheLiner.y.fetch_add(1, std::memory_order_relaxed); | ||
− | const std::chrono::duration<double, std::milli> elapsed { now() - start }; | + | const std::chrono::duration<double, std::milli> elapsed{now() - start}; |
std::lock_guard lk{cout_mutex}; | std::lock_guard lk{cout_mutex}; | ||
std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n"; | std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n"; | ||
Line 114: | Line 114: | ||
void twoCacheLinerThread() | void twoCacheLinerThread() | ||
{ | { | ||
− | const auto start { now() }; | + | const auto start{now()}; |
for (uint64_t count{}; count != max_write_iterations; ++count) | for (uint64_t count{}; count != max_write_iterations; ++count) | ||
Line 122: | Line 122: | ||
twoCacheLiner.y.fetch_add(1, std::memory_order_relaxed); | twoCacheLiner.y.fetch_add(1, std::memory_order_relaxed); | ||
− | const std::chrono::duration<double, std::milli> elapsed { now() - start }; | + | const std::chrono::duration<double, std::milli> elapsed{now() - start}; |
std::lock_guard lk{cout_mutex}; | std::lock_guard lk{cout_mutex}; | ||
std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n"; | std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n"; | ||
Line 146: | Line 146: | ||
<< hardware_constructive_interference_size << "\n\n" | << hardware_constructive_interference_size << "\n\n" | ||
<< std::fixed << std::setprecision(2) | << std::fixed << std::setprecision(2) | ||
− | << "sizeof( OneCacheLiner ) == " << sizeof( OneCacheLiner ) << '\n' | + | << "sizeof( OneCacheLiner ) == " << sizeof(OneCacheLiner) << '\n' |
− | << "sizeof( TwoCacheLiner ) == " << sizeof( TwoCacheLiner ) << "\n\n"; | + | << "sizeof( TwoCacheLiner ) == " << sizeof(TwoCacheLiner) << "\n\n"; |
constexpr int max_runs{4}; | constexpr int max_runs{4}; | ||
Line 156: | Line 156: | ||
std::thread th1{oneCacheLinerThread<0>}; | std::thread th1{oneCacheLinerThread<0>}; | ||
std::thread th2{oneCacheLinerThread<1>}; | std::thread th2{oneCacheLinerThread<1>}; | ||
− | th1.join(); th2.join(); | + | th1.join(); |
+ | th2.join(); | ||
oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y; | oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y; | ||
} | } | ||
Line 167: | Line 168: | ||
std::thread th1{twoCacheLinerThread<0>}; | std::thread th1{twoCacheLinerThread<0>}; | ||
std::thread th2{twoCacheLinerThread<1>}; | std::thread th2{twoCacheLinerThread<1>}; | ||
− | th1.join(); th2.join(); | + | th1.join(); |
+ | th2.join(); | ||
twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y; | twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y; | ||
} | } |
Latest revision as of 09:54, 22 October 2023
Defined in header <new>
|
||
inline constexpr std::size_t hardware_destructive_interference_size = /*implementation-defined*/; |
(1) | (since C++17) |
inline constexpr std::size_t hardware_constructive_interference_size = /*implementation-defined*/; |
(2) | (since C++17) |
1) Minimum offset between two objects to avoid false sharing. Guaranteed to be at least alignof(std::max_align_t)
struct keep_apart { alignas(std::hardware_destructive_interference_size) std::atomic<int> cat; alignas(std::hardware_destructive_interference_size) std::atomic<int> dog; };
2) Maximum size of contiguous memory to promote true sharing. Guaranteed to be at least alignof(std::max_align_t)
struct together { std::atomic<int> dog; int puppy; }; struct kennel { // Other data members... alignas(sizeof(together)) together pack; // Other data members... }; static_assert(sizeof(together) <= std::hardware_constructive_interference_size);
[edit] Notes
These constants provide a portable way to access the L1 data cache line size.
Feature-test macro | Value | Std | Feature |
---|---|---|---|
__cpp_lib_hardware_interference_size |
201703L | (C++17) | constexpr std::hardware_constructive_interference_size and
|
[edit] Example
The program uses two threads that atomically write to the data members of the given global objects. The first object fits in one cache line, which results in "hardware interference". The second object keeps its data members on separate cache lines, so possible "cache synchronization" after thread writes is avoided.
Run this code
#include <atomic> #include <chrono> #include <cstddef> #include <iomanip> #include <iostream> #include <mutex> #include <new> #include <thread> #ifdef __cpp_lib_hardware_interference_size using std::hardware_constructive_interference_size; using std::hardware_destructive_interference_size; #else // 64 bytes on x86-64 │ L1_CACHE_BYTES │ L1_CACHE_SHIFT │ __cacheline_aligned │ ... constexpr std::size_t hardware_constructive_interference_size = 64; constexpr std::size_t hardware_destructive_interference_size = 64; #endif std::mutex cout_mutex; constexpr int max_write_iterations{10'000'000}; // the benchmark time tuning struct alignas(hardware_constructive_interference_size) OneCacheLiner // occupies one cache line { std::atomic_uint64_t x{}; std::atomic_uint64_t y{}; } oneCacheLiner; struct TwoCacheLiner // occupies two cache lines { alignas(hardware_destructive_interference_size) std::atomic_uint64_t x{}; alignas(hardware_destructive_interference_size) std::atomic_uint64_t y{}; } twoCacheLiner; inline auto now() noexcept { return std::chrono::high_resolution_clock::now(); } template<bool xy> void oneCacheLinerThread() { const auto start{now()}; for (uint64_t count{}; count != max_write_iterations; ++count) if constexpr (xy) oneCacheLiner.x.fetch_add(1, std::memory_order_relaxed); else oneCacheLiner.y.fetch_add(1, std::memory_order_relaxed); const std::chrono::duration<double, std::milli> elapsed{now() - start}; std::lock_guard lk{cout_mutex}; std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n"; if constexpr (xy) oneCacheLiner.x = elapsed.count(); else oneCacheLiner.y = elapsed.count(); } template<bool xy> void twoCacheLinerThread() { const auto start{now()}; for (uint64_t count{}; count != max_write_iterations; ++count) if constexpr (xy) twoCacheLiner.x.fetch_add(1, std::memory_order_relaxed); else twoCacheLiner.y.fetch_add(1, std::memory_order_relaxed); const std::chrono::duration<double, std::milli> elapsed{now() - start}; std::lock_guard lk{cout_mutex}; std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n"; if constexpr (xy) twoCacheLiner.x = elapsed.count(); else twoCacheLiner.y = elapsed.count(); } int main() { std::cout << "__cpp_lib_hardware_interference_size " # ifdef __cpp_lib_hardware_interference_size "= " << __cpp_lib_hardware_interference_size << '\n'; # else "is not defined, use " << hardware_destructive_interference_size << " as fallback\n"; # endif std::cout << "hardware_destructive_interference_size == " << hardware_destructive_interference_size << '\n' << "hardware_constructive_interference_size == " << hardware_constructive_interference_size << "\n\n" << std::fixed << std::setprecision(2) << "sizeof( OneCacheLiner ) == " << sizeof(OneCacheLiner) << '\n' << "sizeof( TwoCacheLiner ) == " << sizeof(TwoCacheLiner) << "\n\n"; constexpr int max_runs{4}; int oneCacheLiner_average{0}; for (auto i{0}; i != max_runs; ++i) { std::thread th1{oneCacheLinerThread<0>}; std::thread th2{oneCacheLinerThread<1>}; th1.join(); th2.join(); oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y; } std::cout << "Average T1 time: " << (oneCacheLiner_average / max_runs / 2) << " ms\n\n"; int twoCacheLiner_average{0}; for (auto i{0}; i != max_runs; ++i) { std::thread th1{twoCacheLinerThread<0>}; std::thread th2{twoCacheLinerThread<1>}; th1.join(); th2.join(); twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y; } std::cout << "Average T2 time: " << (twoCacheLiner_average / max_runs / 2) << " ms\n\n" << "Ratio T1/T2:~ " << 1.0 * oneCacheLiner_average / twoCacheLiner_average << '\n'; }
Possible output:
__cpp_lib_hardware_interference_size = 201703 hardware_destructive_interference_size == 64 hardware_constructive_interference_size == 64 sizeof( OneCacheLiner ) == 64 sizeof( TwoCacheLiner ) == 128 oneCacheLinerThread() spent 517.83 ms oneCacheLinerThread() spent 533.43 ms oneCacheLinerThread() spent 527.36 ms oneCacheLinerThread() spent 555.69 ms oneCacheLinerThread() spent 574.74 ms oneCacheLinerThread() spent 591.66 ms oneCacheLinerThread() spent 555.63 ms oneCacheLinerThread() spent 555.76 ms Average T1 time: 550 ms twoCacheLinerThread() spent 89.79 ms twoCacheLinerThread() spent 89.94 ms twoCacheLinerThread() spent 89.46 ms twoCacheLinerThread() spent 90.28 ms twoCacheLinerThread() spent 89.73 ms twoCacheLinerThread() spent 91.11 ms twoCacheLinerThread() spent 89.17 ms twoCacheLinerThread() spent 90.09 ms Average T2 time: 89 ms Ratio T1/T2:~ 6.16
[edit] See also
[static] |
returns the number of concurrent threads supported by the implementation (public static member function of std::thread )
|
[static] |
returns the number of concurrent threads supported by the implementation (public static member function of std::jthread )
|