Namespaces
Variants
Views
Actions

Talk:cpp/thread/hardware destructive interference size

From cppreference.com

[edit] Clarification

I think more wording could help, but I want to be sure I'm correct. Is the idea that one would place multiple objects together in storage that is aligned to std::hardware_constructive_interference_size and no larger than that size, in order to ensure they are on the same cache line? The idea being that you can write higher-performance code when you know you'll want to access dog and puppy (in the example) in quick succession?

Likewise for std::hardware_destructive_interference_size, having that alignment and not more than that size means that two structs will be on separate cache lines and so two threads can hammer on two instances all they want without the cache line bouncing between cores?

I feel like some examples showing expected multi-threaded use would help clarify where one would use each of these. BenFrantzDale (talk) 08:35, 27 March 2021 (PDT)

The real effect of std::hardware_destructive_interference_size could be demonstrated by the following (deliberately verbose) code. This is a work in progress.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <new>
#include <thread>
 
#ifdef __cpp_lib_hardware_interference_size
    using std::hardware_constructive_interference_size;
    using std::hardware_destructive_interference_size;
#else
    // Lucky guess │ __cacheline_aligned │ L1_CACHE_BYTES │ L1_CACHE_SHIFT │ ...
    constexpr std::size_t hardware_constructive_interference_size
        = 2 * sizeof(std::max_align_t);
    constexpr std::size_t hardware_destructive_interference_size
        = 2 * sizeof(std::max_align_t);
#endif
 
 
std::mutex cout_mutex;
 
constexpr int max_iterations{10'000'000};
 
inline auto now() noexcept { return std::chrono::high_resolution_clock::now(); }
 
 
struct
alignas(hardware_constructive_interference_size)
OneCacheLiner {
    std::atomic_uint64_t x{};
    std::atomic_uint64_t y{};
} oneCacheLiner;
 
struct TwoCacheLiner {
    alignas(hardware_destructive_interference_size) std::atomic_uint64_t x{};
    alignas(hardware_destructive_interference_size) std::atomic_uint64_t y{};
} twoCacheLiner;
 
 
template<bool xy>
void oneCacheLiner_writer() {
    const auto start = now();
 
    for (uint64_t count{}; count != max_iterations; ++count)
        if constexpr (xy) oneCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
        else oneCacheLiner.y.fetch_add(1, std::memory_order_relaxed);
 
    std::chrono::duration<double, std::milli> elapsed {now() - start};
    std::lock_guard lk{cout_mutex};
    std::cout << "oneCacheLiner_writer() spent " << elapsed.count() << " ms\n";
    if constexpr (xy) oneCacheLiner.x = elapsed.count();
    else oneCacheLiner.y = elapsed.count();
}
 
template<bool xy>
void twoCacheLiner_writer() {
    const auto start = now();
 
    for (uint64_t count{}; count != max_iterations; ++count)
        if constexpr (xy) twoCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
        else twoCacheLiner.y.fetch_add(1, std::memory_order_relaxed);
 
    std::chrono::duration<double, std::milli> elapsed {now() - start};
    std::lock_guard lk{cout_mutex};
    std::cout << "twoCacheLiner_writer() spent " << elapsed.count() << " ms\n";
    if constexpr (xy) twoCacheLiner.x = elapsed.count();
    else twoCacheLiner.y = elapsed.count();
}
 
int main()
{
    std::cout
        << std::fixed << std::setprecision(2)
        << "sizeof(OneCacheLiner) : " << sizeof(OneCacheLiner) << '\n'
        << "sizeof(TwoCacheLiner) : " << sizeof(TwoCacheLiner) << "\n\n";
 
    constexpr int max_runs{4};
 
    int oneCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
    {
        std::thread th1(oneCacheLiner_writer<0>);
        std::thread th2(oneCacheLiner_writer<1>);
        th1.join(); th2.join();
        oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y;
    }
    std::cout << "Average time: " << (oneCacheLiner_average / max_runs / 2) << " ms\n\n";
 
    int twoCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
    {
        std::thread th3(twoCacheLiner_writer<0>);
        std::thread th4(twoCacheLiner_writer<1>);
        th3.join(); th4.join();
        twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y;
    }
    std::cout << "Average time: " << (twoCacheLiner_average / max_runs / 2) << " ms\n\n";
 
    std::cout << "__cpp_lib_hardware_interference_size "
#   ifdef __cpp_lib_hardware_interference_size
        " = " << __cpp_lib_hardware_interference_size << "\n";
#   else
        "is not defined\n";
#   endif
 
    std::cout
        << "hardware_destructive_interference_size == "
        << hardware_destructive_interference_size << '\n'
        << "hardware_constructive_interference_size == "
        << hardware_constructive_interference_size << '\n'
        << "sizeof( std::max_align_t ) == " << sizeof(std::max_align_t) << "\n\n";
}

Possible output:

sizeof(OneCacheLiner) : 64
sizeof(TwoCacheLiner) : 128
 
oneCacheLiner_writer() spent 270.02 ms
oneCacheLiner_writer() spent 327.32 ms
oneCacheLiner_writer() spent 193.18 ms
oneCacheLiner_writer() spent 283.19 ms
oneCacheLiner_writer() spent 505.74 ms
oneCacheLiner_writer() spent 521.13 ms
oneCacheLiner_writer() spent 380.60 ms
oneCacheLiner_writer() spent 441.62 ms
Average time: 365 ms
 
twoCacheLiner_writer() spent 113.94 ms
twoCacheLiner_writer() spent 131.01 ms
twoCacheLiner_writer() spent 115.32 ms
twoCacheLiner_writer() spent 127.05 ms
twoCacheLiner_writer() spent 116.03 ms
twoCacheLiner_writer() spent 148.28 ms
twoCacheLiner_writer() spent 116.60 ms
twoCacheLiner_writer() spent 133.39 ms
Average time: 124 ms
 
__cpp_lib_hardware_interference_size is not defined
hardware_destructive_interference_size == 64
hardware_constructive_interference_size == 64
sizeof( std::max_align_t ) == 32