Namespaces
Variants
Views
Actions

Difference between revisions of "cpp/thread/hardware destructive interference size"

From cppreference.com
< cpp‎ | thread
m (fmt)
 
(One intermediate revision by one user not shown)
Line 1: Line 1:
{{cpp/title | hardware_destructive_interference_size|hardware_constructive_interference_size}}
+
{{cpp/title|hardware_destructive_interference_size|hardware_constructive_interference_size}}
 
{{cpp/thread/navbar}}
 
{{cpp/thread/navbar}}
 
{{dcl begin}}
 
{{dcl begin}}
 
{{dcl header|new}}
 
{{dcl header|new}}
{{dcl | since=c++17 |num=1|1=
+
{{dcl|since=c++17|num=1|1=
 
inline constexpr std::size_t
 
inline constexpr std::size_t
 
     hardware_destructive_interference_size = /*implementation-defined*/;
 
     hardware_destructive_interference_size = /*implementation-defined*/;
 
}}
 
}}
{{dcl | since=c++17 |num=2|1=
+
{{dcl|since=c++17|num=2|1=
 
inline constexpr std::size_t
 
inline constexpr std::size_t
 
     hardware_constructive_interference_size = /*implementation-defined*/;
 
     hardware_constructive_interference_size = /*implementation-defined*/;
Line 15: Line 15:
 
@1@ Minimum offset between two objects to avoid false sharing. Guaranteed to be at least {{c|alignof(std::max_align_t)}}
 
@1@ Minimum offset between two objects to avoid false sharing. Guaranteed to be at least {{c|alignof(std::max_align_t)}}
 
{{source|1=
 
{{source|1=
struct keep_apart {
+
struct keep_apart
  alignas(std::hardware_destructive_interference_size) std::atomic<int> cat;
+
{
  alignas(std::hardware_destructive_interference_size) std::atomic<int> dog;
+
    alignas(std::hardware_destructive_interference_size) std::atomic<int> cat;
 +
    alignas(std::hardware_destructive_interference_size) std::atomic<int> dog;
 
};
 
};
 
}}
 
}}
 +
 
@2@ Maximum size of contiguous memory to promote true sharing. Guaranteed to be at least {{c|alignof(std::max_align_t)}}
 
@2@ Maximum size of contiguous memory to promote true sharing. Guaranteed to be at least {{c|alignof(std::max_align_t)}}
 
{{source|1=
 
{{source|1=
struct together {
+
struct together
  std::atomic<int> dog;
+
{
  int puppy;
+
    std::atomic<int> dog;
 +
    int puppy;
 
};
 
};
struct kennel {
+
 
  // Other data members...
+
struct kennel
  alignas(sizeof(together)) together pack;
+
{
  // Other data members...
+
    // Other data members...
 +
 
 +
    alignas(sizeof(together)) together pack;
 +
 
 +
    // Other data members...
 
};
 
};
 +
 
static_assert(sizeof(together) <= std::hardware_constructive_interference_size);
 
static_assert(sizeof(together) <= std::hardware_constructive_interference_size);
 
}}
 
}}
Line 36: Line 44:
 
===Notes===
 
===Notes===
 
These constants provide a portable way to access the L1 data cache line size.
 
These constants provide a portable way to access the L1 data cache line size.
 +
{{feature test macro|__cpp_lib_hardware_interference_size|std=C++17|value=201703L|
 +
{{tt|constexpr std::hardware_constructive_interference_size}} and<br>
 +
{{tt|constexpr std::hardware_destructive_interference_size}}}}
  
 
===Example===
 
===Example===
 
{{example
 
{{example
|The program uses two threads that (atomically) write to the data members of the given global objects. The first object fits in one cache line, which results in "hardware interference". The second object keeps its data members on separate cache lines, so possible "cache synchronization" after thread writes is avoided.
+
|The program uses two threads that atomically write to the data members of the given global objects. The first object fits in one cache line, which results in "hardware interference". The second object keeps its data members on separate cache lines, so possible "cache synchronization" after thread writes is avoided.
 
|code=
 
|code=
 
#include <atomic>
 
#include <atomic>
Line 61: Line 72:
 
std::mutex cout_mutex;
 
std::mutex cout_mutex;
  
constexpr int max_write_iterations{10'000'000}; // benchmark time tuning
+
constexpr int max_write_iterations{10'000'000}; // the benchmark time tuning
  
 
struct alignas(hardware_constructive_interference_size)
 
struct alignas(hardware_constructive_interference_size)
OneCacheLiner { // occupies one cache line
+
OneCacheLiner // occupies one cache line
 +
{
 
     std::atomic_uint64_t x{};
 
     std::atomic_uint64_t x{};
 
     std::atomic_uint64_t y{};
 
     std::atomic_uint64_t y{};
} oneCacheLiner;
+
}
 +
oneCacheLiner;
  
struct TwoCacheLiner { // occupies two cache lines
+
struct TwoCacheLiner // occupies two cache lines
 +
{
 
     alignas(hardware_destructive_interference_size) std::atomic_uint64_t x{};
 
     alignas(hardware_destructive_interference_size) std::atomic_uint64_t x{};
 
     alignas(hardware_destructive_interference_size) std::atomic_uint64_t y{};
 
     alignas(hardware_destructive_interference_size) std::atomic_uint64_t y{};
} twoCacheLiner;
+
}
 +
twoCacheLiner;
  
 
inline auto now() noexcept { return std::chrono::high_resolution_clock::now(); }
 
inline auto now() noexcept { return std::chrono::high_resolution_clock::now(); }
  
 
template<bool xy>
 
template<bool xy>
void oneCacheLinerThread() {
+
void oneCacheLinerThread()
     const auto start { now() };
+
{
 +
     const auto start{now()};
  
 
     for (uint64_t count{}; count != max_write_iterations; ++count)
 
     for (uint64_t count{}; count != max_write_iterations; ++count)
 
         if constexpr (xy)
 
         if constexpr (xy)
            oneCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
+
            oneCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
         else oneCacheLiner.y.fetch_add(1, std::memory_order_relaxed);
+
         else
 +
            oneCacheLiner.y.fetch_add(1, std::memory_order_relaxed);
  
     const std::chrono::duration<double, std::milli> elapsed { now() - start };
+
     const std::chrono::duration<double, std::milli> elapsed{now() - start};
 
     std::lock_guard lk{cout_mutex};
 
     std::lock_guard lk{cout_mutex};
 
     std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n";
 
     std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n";
 
     if constexpr (xy)
 
     if constexpr (xy)
        oneCacheLiner.x = elapsed.count();
+
        oneCacheLiner.x = elapsed.count();
     else oneCacheLiner.y = elapsed.count();
+
     else
 +
        oneCacheLiner.y = elapsed.count();
 
}
 
}
  
 
template<bool xy>
 
template<bool xy>
void twoCacheLinerThread() {
+
void twoCacheLinerThread()
     const auto start { now() };
+
{
 +
     const auto start{now()};
  
 
     for (uint64_t count{}; count != max_write_iterations; ++count)
 
     for (uint64_t count{}; count != max_write_iterations; ++count)
 
         if constexpr (xy)
 
         if constexpr (xy)
            twoCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
+
            twoCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
         else twoCacheLiner.y.fetch_add(1, std::memory_order_relaxed);
+
         else
 +
            twoCacheLiner.y.fetch_add(1, std::memory_order_relaxed);
  
     const std::chrono::duration<double, std::milli> elapsed { now() - start };
+
     const std::chrono::duration<double, std::milli> elapsed{now() - start};
 
     std::lock_guard lk{cout_mutex};
 
     std::lock_guard lk{cout_mutex};
 
     std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n";
 
     std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n";
 
     if constexpr (xy)
 
     if constexpr (xy)
        twoCacheLiner.x = elapsed.count();
+
        twoCacheLiner.x = elapsed.count();
     else twoCacheLiner.y = elapsed.count();
+
     else
 +
        twoCacheLiner.y = elapsed.count();
 
}
 
}
  
int main() {
+
int main()
 +
{
 
     std::cout << "__cpp_lib_hardware_interference_size "
 
     std::cout << "__cpp_lib_hardware_interference_size "
 
#  ifdef __cpp_lib_hardware_interference_size
 
#  ifdef __cpp_lib_hardware_interference_size
         " = " << __cpp_lib_hardware_interference_size << "\n";
+
         "= " << __cpp_lib_hardware_interference_size << '\n';
 
#  else
 
#  else
         "is not defined, use 64 as fallback\n";
+
         "is not defined, use " << hardware_destructive_interference_size
 +
                              << " as fallback\n";
 
#  endif
 
#  endif
  
     std::cout
+
     std::cout << "hardware_destructive_interference_size == "
        << "hardware_destructive_interference_size == "
+
              << hardware_destructive_interference_size << '\n'
        << hardware_destructive_interference_size << '\n'
+
              << "hardware_constructive_interference_size == "
        << "hardware_constructive_interference_size == "
+
              << hardware_constructive_interference_size << "\n\n"
        << hardware_constructive_interference_size << "\n\n";
+
              << std::fixed << std::setprecision(2)
 
+
              << "sizeof( OneCacheLiner ) == " << sizeof(OneCacheLiner) << '\n'
    std::cout
+
              << "sizeof( TwoCacheLiner ) == " << sizeof(TwoCacheLiner) << "\n\n";
        << std::fixed << std::setprecision(2)
+
        << "sizeof( OneCacheLiner ) == " << sizeof( OneCacheLiner ) << '\n'
+
        << "sizeof( TwoCacheLiner ) == " << sizeof( TwoCacheLiner ) << "\n\n";
+
  
 
     constexpr int max_runs{4};
 
     constexpr int max_runs{4};
  
 
     int oneCacheLiner_average{0};
 
     int oneCacheLiner_average{0};
     for (auto i{0}; i != max_runs; ++i) {
+
     for (auto i{0}; i != max_runs; ++i)
 +
    {
 
         std::thread th1{oneCacheLinerThread<0>};
 
         std::thread th1{oneCacheLinerThread<0>};
 
         std::thread th2{oneCacheLinerThread<1>};
 
         std::thread th2{oneCacheLinerThread<1>};
         th1.join(); th2.join();
+
         th1.join();
 +
        th2.join();
 
         oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y;
 
         oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y;
 
     }
 
     }
     std::cout << "Average time: " << (oneCacheLiner_average / max_runs / 2) << " ms\n\n";
+
     std::cout << "Average T1 time: "
 +
              << (oneCacheLiner_average / max_runs / 2) << " ms\n\n";
  
 
     int twoCacheLiner_average{0};
 
     int twoCacheLiner_average{0};
     for (auto i{0}; i != max_runs; ++i) {
+
     for (auto i{0}; i != max_runs; ++i)
 +
    {
 
         std::thread th1{twoCacheLinerThread<0>};
 
         std::thread th1{twoCacheLinerThread<0>};
 
         std::thread th2{twoCacheLinerThread<1>};
 
         std::thread th2{twoCacheLinerThread<1>};
         th1.join(); th2.join();
+
         th1.join();
 +
        th2.join();
 
         twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y;
 
         twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y;
 
     }
 
     }
     std::cout << "Average time: " << (twoCacheLiner_average / max_runs / 2) << " ms\n\n";
+
     std::cout << "Average T2 time: "
 +
              << (twoCacheLiner_average / max_runs / 2) << " ms\n\n"
 +
              << "Ratio T1/T2:~ "
 +
              << 1.0 * oneCacheLiner_average / twoCacheLiner_average << '\n';
 
}
 
}
| p=true
+
|p=true
| output=
+
|output=
__cpp_lib_hardware_interference_size is not defined, use 64 as fallback
+
__cpp_lib_hardware_interference_size = 201703
 
hardware_destructive_interference_size == 64
 
hardware_destructive_interference_size == 64
 
hardware_constructive_interference_size == 64
 
hardware_constructive_interference_size == 64
Line 158: Line 186:
 
sizeof( TwoCacheLiner ) == 128
 
sizeof( TwoCacheLiner ) == 128
  
oneCacheLinerThread() spent 634.25 ms
+
oneCacheLinerThread() spent 517.83 ms
oneCacheLinerThread() spent 651.55 ms
+
oneCacheLinerThread() spent 533.43 ms
oneCacheLinerThread() spent 990.23 ms
+
oneCacheLinerThread() spent 527.36 ms
oneCacheLinerThread() spent 1033.94 ms
+
oneCacheLinerThread() spent 555.69 ms
oneCacheLinerThread() spent 838.14 ms
+
oneCacheLinerThread() spent 574.74 ms
oneCacheLinerThread() spent 883.25 ms
+
oneCacheLinerThread() spent 591.66 ms
oneCacheLinerThread() spent 873.02 ms
+
oneCacheLinerThread() spent 555.63 ms
oneCacheLinerThread() spent 914.26 ms
+
oneCacheLinerThread() spent 555.76 ms
Average time: 852 ms
+
Average T1 time: 550 ms
 +
 
 +
twoCacheLinerThread() spent 89.79 ms
 +
twoCacheLinerThread() spent 89.94 ms
 +
twoCacheLinerThread() spent 89.46 ms
 +
twoCacheLinerThread() spent 90.28 ms
 +
twoCacheLinerThread() spent 89.73 ms
 +
twoCacheLinerThread() spent 91.11 ms
 +
twoCacheLinerThread() spent 89.17 ms
 +
twoCacheLinerThread() spent 90.09 ms
 +
Average T2 time: 89 ms
  
twoCacheLinerThread() spent 119.22 ms
+
Ratio T1/T2:~ 6.16
twoCacheLinerThread() spent 127.91 ms
+
twoCacheLinerThread() spent 114.17 ms
+
twoCacheLinerThread() spent 126.41 ms
+
twoCacheLinerThread() spent 125.17 ms
+
twoCacheLinerThread() spent 126.06 ms
+
twoCacheLinerThread() spent 117.94 ms
+
twoCacheLinerThread() spent 129.03 ms
+
Average time: 122 ms
+
 
}}
 
}}
  
 
===See also===
 
===See also===
 
{{dsc begin}}
 
{{dsc begin}}
{{dsc inc | cpp/thread/thread/dsc hardware_concurrency | thread}}
+
{{dsc inc|cpp/thread/thread/dsc hardware_concurrency|thread}}
{{dsc inc | cpp/thread/thread/dsc hardware_concurrency | jthread}}
+
{{dsc inc|cpp/thread/thread/dsc hardware_concurrency|jthread}}
 
{{dsc end}}
 
{{dsc end}}
  
 
{{langlinks|ar|de|es|fr|it|ja|ko|pt|ru|zh}}
 
{{langlinks|ar|de|es|fr|it|ja|ko|pt|ru|zh}}

Latest revision as of 09:54, 22 October 2023

 
 
Concurrency support library
Threads
(C++11)
(C++20)
hardware_destructive_interference_size (C++17)
hardware_constructive_interference_size (C++17)
this_thread namespace
(C++11)
(C++11)
(C++11)
Cooperative cancellation
Mutual exclusion
(C++11)
Generic lock management
(C++11)
(C++11)
(C++11)
(C++11)
(C++11)
Condition variables
(C++11)
Semaphores
Latches and Barriers
(C++20)
(C++20)
Futures
(C++11)
(C++11)
(C++11)
(C++11)
Safe Reclamation
(C++26)
Hazard Pointers
Atomic types
(C++11)
(C++20)
Initialization of atomic types
(C++11)(deprecated in C++20)
(C++11)(deprecated in C++20)
Memory ordering
Free functions for atomic operations
Free functions for atomic flags
 
Defined in header <new>
inline constexpr std::size_t
    hardware_destructive_interference_size = /*implementation-defined*/;
(1) (since C++17)
inline constexpr std::size_t
    hardware_constructive_interference_size = /*implementation-defined*/;
(2) (since C++17)
1) Minimum offset between two objects to avoid false sharing. Guaranteed to be at least alignof(std::max_align_t).
// Both members are individually aligned to
// std::hardware_destructive_interference_size, so cat and dog begin at
// offsets that differ by at least that many bytes.
struct keep_apart
{
    alignas(std::hardware_destructive_interference_size) std::atomic<int> cat;
    alignas(std::hardware_destructive_interference_size) std::atomic<int> dog;
};
2) Maximum size of contiguous memory to promote true sharing. Guaranteed to be at least alignof(std::max_align_t).
// Members that are meant to be used together: neither carries its own
// alignas, so they are laid out next to each other (subject only to ordinary
// padding rules).
struct together
{
    std::atomic<int> dog;
    int puppy;
};
 
// Embeds a `together` object among other members.
struct kennel
{
    // Other data members...
 
    // Aligns pack to its own size.
    // NOTE(review): alignas needs a valid alignment (a power of two), so this
    // presumes sizeof(together) is one — confirm for the target platform.
    alignas(sizeof(together)) together pack;
 
    // Other data members...
};
 
// Compile-time check that a `together` object is no larger than
// std::hardware_constructive_interference_size.
static_assert(sizeof(together) <= std::hardware_constructive_interference_size);

[edit] Notes

These constants provide a portable way to access the L1 data cache line size.

| Feature-test macro | Value | Std | Feature |
|---|---|---|---|
| __cpp_lib_hardware_interference_size | 201703L | (C++17) | constexpr std::hardware_constructive_interference_size and constexpr std::hardware_destructive_interference_size |

[edit] Example

The program uses two threads that atomically write to the data members of the given global objects. The first object fits in one cache line, so both threads write to the same line, which results in "hardware interference". The second object keeps its data members on separate cache lines, so the "cache synchronization" that could otherwise follow each thread's writes is avoided.

#include <atomic>
#include <chrono>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <new>
#include <thread>
 
// Use the standard constants when the library announces them through the
// feature-test macro; otherwise define same-named file-local substitutes so
// the rest of the program can refer to them unqualified either way.
#ifdef __cpp_lib_hardware_interference_size
    using std::hardware_constructive_interference_size;
    using std::hardware_destructive_interference_size;
#else
    // Fallback value: 64 bytes, as on x86-64 (compare L1_CACHE_BYTES,
    // L1_CACHE_SHIFT, __cacheline_aligned, ...).
    constexpr std::size_t hardware_constructive_interference_size = 64;
    constexpr std::size_t hardware_destructive_interference_size = 64;
#endif
 
// Guards std::cout: each worker thread locks it before printing its timing.
std::mutex cout_mutex;
 
// Number of fetch_add calls each worker performs; adjust to tune how long the
// benchmark runs.
constexpr int max_write_iterations{10'000'000};
 
struct alignas(hardware_constructive_interference_size)
OneCacheLiner // occupies one cache line
{
    std::atomic_uint64_t x{};
    std::atomic_uint64_t y{};
}
oneCacheLiner;
 
// Two atomic counters, each aligned to hardware_destructive_interference_size
// on its own, so x and y are intended to land on two different cache lines.
struct TwoCacheLiner
{
    alignas(hardware_destructive_interference_size)
        std::atomic_uint64_t x{}; // zero-initialized counter
    alignas(hardware_destructive_interference_size)
        std::atomic_uint64_t y{}; // zero-initialized counter, kept apart from x
};

// The shared global instance written by the twoCacheLinerThread workers.
TwoCacheLiner twoCacheLiner;
 
// Returns the current time point used for benchmarking.
//
// std::chrono::steady_clock is used because it is guaranteed to be monotonic:
// a wall-clock adjustment during a run cannot distort (or make negative) the
// interval obtained by subtracting two results. high_resolution_clock gives no
// such guarantee, as it may be an alias of system_clock.
inline auto now() noexcept { return std::chrono::steady_clock::now(); }
 
// Worker for the single-cache-line layout.
//
// Template parameter xy selects the counter this instantiation works on:
// oneCacheLiner.x when true, oneCacheLiner.y when false. The worker performs
// max_write_iterations relaxed atomic increments on it, prints the time that
// took, and finally overwrites the same counter with the elapsed milliseconds
// (converted to an integer by the assignment) so that main() can read the
// timing back from the global object.
template<bool xy>
void oneCacheLinerThread()
{
    auto& counter = xy ? oneCacheLiner.x : oneCacheLiner.y;

    const auto begin = now();
    for (uint64_t iteration = 0; iteration != max_write_iterations; ++iteration)
        counter.fetch_add(1, std::memory_order_relaxed);
    const std::chrono::duration<double, std::milli> spent(now() - begin);

    // Hold the mutex for both the print and the write-back, as a single unit.
    const std::lock_guard guard{cout_mutex};
    std::cout << "oneCacheLinerThread() spent " << spent.count() << " ms\n";
    counter = spent.count();
}
 
// Worker for the two-cache-line layout.
//
// Template parameter xy selects the counter this instantiation works on:
// twoCacheLiner.x when true, twoCacheLiner.y when false. The worker performs
// max_write_iterations relaxed atomic increments on it, prints the time that
// took, and finally overwrites the same counter with the elapsed milliseconds
// (converted to an integer by the assignment) so that main() can read the
// timing back from the global object.
template<bool xy>
void twoCacheLinerThread()
{
    auto& counter = xy ? twoCacheLiner.x : twoCacheLiner.y;

    const auto begin = now();
    for (uint64_t iteration = 0; iteration != max_write_iterations; ++iteration)
        counter.fetch_add(1, std::memory_order_relaxed);
    const std::chrono::duration<double, std::milli> spent(now() - begin);

    // Hold the mutex for both the print and the write-back, as a single unit.
    const std::lock_guard guard{cout_mutex};
    std::cout << "twoCacheLinerThread() spent " << spent.count() << " ms\n";
    counter = spent.count();
}
 
int main()
{
    std::cout << "__cpp_lib_hardware_interference_size "
#   ifdef __cpp_lib_hardware_interference_size
        "= " << __cpp_lib_hardware_interference_size << '\n';
#   else
        "is not defined, use " << hardware_destructive_interference_size
                               << " as fallback\n";
#   endif
 
    std::cout << "hardware_destructive_interference_size == "
              << hardware_destructive_interference_size << '\n'
              << "hardware_constructive_interference_size == "
              << hardware_constructive_interference_size << "\n\n"
              << std::fixed << std::setprecision(2)
              << "sizeof( OneCacheLiner ) == " << sizeof(OneCacheLiner) << '\n'
              << "sizeof( TwoCacheLiner ) == " << sizeof(TwoCacheLiner) << "\n\n";
 
    constexpr int max_runs{4};
 
    int oneCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
    {
        std::thread th1{oneCacheLinerThread<0>};
        std::thread th2{oneCacheLinerThread<1>};
        th1.join();
        th2.join();
        oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y;
    }
    std::cout << "Average T1 time: "
              << (oneCacheLiner_average / max_runs / 2) << " ms\n\n";
 
    int twoCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
    {
        std::thread th1{twoCacheLinerThread<0>};
        std::thread th2{twoCacheLinerThread<1>};
        th1.join();
        th2.join();
        twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y;
    }
    std::cout << "Average T2 time: "
              << (twoCacheLiner_average / max_runs / 2) << " ms\n\n"
              << "Ratio T1/T2:~ "
              << 1.0 * oneCacheLiner_average / twoCacheLiner_average << '\n';
}

Possible output:

__cpp_lib_hardware_interference_size = 201703
hardware_destructive_interference_size == 64
hardware_constructive_interference_size == 64
 
sizeof( OneCacheLiner ) == 64
sizeof( TwoCacheLiner ) == 128
 
oneCacheLinerThread() spent 517.83 ms
oneCacheLinerThread() spent 533.43 ms
oneCacheLinerThread() spent 527.36 ms
oneCacheLinerThread() spent 555.69 ms
oneCacheLinerThread() spent 574.74 ms
oneCacheLinerThread() spent 591.66 ms
oneCacheLinerThread() spent 555.63 ms
oneCacheLinerThread() spent 555.76 ms
Average T1 time: 550 ms
 
twoCacheLinerThread() spent 89.79 ms
twoCacheLinerThread() spent 89.94 ms
twoCacheLinerThread() spent 89.46 ms
twoCacheLinerThread() spent 90.28 ms
twoCacheLinerThread() spent 89.73 ms
twoCacheLinerThread() spent 91.11 ms
twoCacheLinerThread() spent 89.17 ms
twoCacheLinerThread() spent 90.09 ms
Average T2 time: 89 ms
 
Ratio T1/T2:~ 6.16

[edit] See also

hardware_concurrency — returns the number of concurrent threads supported by the implementation
(public static member function of std::thread) [edit]
hardware_concurrency — returns the number of concurrent threads supported by the implementation
(public static member function of std::jthread) [edit]