- Futures, coroutines - 1 Update
- new benchmark for my read/write algorithm... - 18 Updates
- Please critique my atomic queue implementation - 1 Update
- is there any UB in this integer math... - 4 Updates
- an experimental read/write mutex... - 1 Update
woodbrian77@gmail.com: Feb 20 11:58AM -0800

Shalom

I've been working on this program for a number of years:
https://github.com/Ebenezer-group/onwards/blob/master/src/cmw/tiers/cmwA.cc

I haven't watched this talk:
https://www.reddit.com/r/cpp/comments/asmpj3/continuable_asynchronous_programming_with/
but I wonder if there's anything there that could help me make my program
better. I've managed to reduce the size of the event loop over the years with
traditional refactoring. I don't think its size is a big problem, but...

Thanks.

Brian
Ebenezer Enterprises - Enjoying programming again.
http://webEbenezer.net
"Chris M. Thomasson" <invalid_chris_thomasson_invalid@invalid.com>: Feb 19 10:49PM -0800 Fwiw, I wrote a crude new benchmark that measures how many reads and writes can be performed in a given amount of time. My algorithm vs std::shared_mutex. So, we are _primarily_ looking for how many reads can be performed in this test at 60 seconds. The number of threads is variable and is determined by std::hardware_concurrency * THREADS, set at 8 in the test. So on my system the setup is: ___________________________________ cpu_threads_n = 4 threads_n = 32 writers = 16 readers = 16 test duration = 60 seconds ___________________________________ This is going to be different for each system. Readers iterate a shared linked list with 1000000 nodes in this test. Writers pop and push items, and never use new or delete. Well, so far, the timings I am getting on my end using MSVC 2017 is: Testing: Chris M. Thomasson's Experimental Read/Write Mutex ___________________________________ cpu_threads_n = 4 threads_n = 32 writers = 16 readers = 16 test duration = 60 seconds ___________________________________ ___________________________________ Raw Reads: 18046464 Raw Writes: 347498 reads_per_tick = 300693 writes_per_tick = 5790 Ticks = 60.0161 ___________________________________ Testing: std::shared_mutex ___________________________________ cpu_threads_n = 4 threads_n = 32 writers = 16 readers = 16 test duration = 60 seconds ___________________________________ ___________________________________ Raw Reads: 650006 Raw Writes: 168741 reads_per_tick = 10829 writes_per_tick = 2811 Ticks = 60.0193 ___________________________________ As you can see, my algorithm is performing many more reads than std::shared_mutex. Anyway, here is my code: When you get some time, can you please give it a go? I want to see timings on other systems, the good, bad and ugly. ;^) Thanks. https://pastebin.com/raw/1QtPCGhV ___________________________________ /* Crude Read/Write Mutex Benchmark This tests my algorithm ct_rwmutex vs. std::shared_mutex. It shows how many reads and writes can be performed within a fixed amount of time. by: Chris M. 
Anyway, here is my code. When you get some time, can you please give it a go?
I want to see timings on other systems, the good, bad and ugly. ;^) Thanks.

https://pastebin.com/raw/1QtPCGhV
___________________________________
/* Crude Read/Write Mutex Benchmark

   This tests my algorithm ct_rwmutex vs. std::shared_mutex.
   It shows how many reads and writes can be performed within
   a fixed amount of time.

   by: Chris M. Thomasson
__________________________________________*/


#include <thread>
#include <atomic>
#include <shared_mutex>
#include <condition_variable>
#include <iostream>
#include <functional>
#include <chrono>
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <climits>
#include <cstdint>
#include <vector>


// undefine to test std::shared_mutex
#define CT_TEST_FAST_MUTEX 1

#define THREADS 8            // multiplied by std::thread::hardware_concurrency()
#define NODE_PRIME 1000000   // number of nodes
#define CRUDE_CACHE_PAD 256  // crude cache pad
#define TEST_DURATION_SEC 60 // number of seconds for the test


// bare-bones mutex/condvar based semaphore
struct ct_slow_semaphore
{
    unsigned long m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_slow_semaphore(unsigned long state) : m_state(state) {}

    void inc()
    {
        m_mutex.lock();
        ++m_state;
        m_mutex.unlock();
        m_cond.notify_one();
    }

    void add(unsigned long addend)
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            m_state += addend;
        }
        m_cond.notify_all();
    }

    void dec()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == 0) m_cond.wait(lock);
        --m_state;
    }
};


// bin-sema
struct ct_auto_reset_event
{
    bool m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_auto_reset_event() : m_state(false) {}

    void signal()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_state = true;
        m_cond.notify_one();
    }

    void wait()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == false) m_cond.wait(lock);
        m_state = false; // auto-reset
    }
};


// just a layer over an auto-reset event
struct ct_fast_mutex
{
    std::atomic<unsigned int> m_state;
    ct_auto_reset_event m_waitset;

    ct_fast_mutex() : m_state(0) {}

    void lock()
    {
        if (m_state.exchange(1, std::memory_order_acquire))
        {
            while (m_state.exchange(2, std::memory_order_acquire))
            {
                m_waitset.wait();
            }
        }
    }

    void unlock()
    {
        if (m_state.exchange(0, std::memory_order_release) == 2)
        {
            m_waitset.signal();
        }
    }
};


// Chris M. Thomasson's Experimental Read/Write Mutex
// Yeah, it is pretty damn fat wrt the state, however
// it has some interesting properties...
// The state can be compressed a bit...
// btw, it has no loops...
// Take a look at the lock_shared and unlock_shared functions

// The number of readers _must_ never exceed LONG_MAX!

#define RWMUTEX_COUNT_MAX LONG_MAX

struct ct_rwmutex
{
    unsigned char m_crude_cache_pad_0[CRUDE_CACHE_PAD];

    // shared state
    std::atomic<long> m_count;
    unsigned char m_crude_cache_pad_1[CRUDE_CACHE_PAD];

    std::atomic<long> m_rdwake;
    unsigned char m_crude_cache_pad_2[CRUDE_CACHE_PAD];

    ct_slow_semaphore m_rdwset;
    unsigned char m_crude_cache_pad_3[CRUDE_CACHE_PAD];

    ct_slow_semaphore m_wrwset;
    unsigned char m_crude_cache_pad_4[CRUDE_CACHE_PAD];

    ct_fast_mutex m_wrlock;
    unsigned char m_crude_cache_pad_5[CRUDE_CACHE_PAD];

    ct_rwmutex() :
        m_count(RWMUTEX_COUNT_MAX),
        m_rdwake(0),
        m_rdwset(0),
        m_wrwset(0)
    {
    }


    // READ, pretty slim...
    void lock_shared()
    {
        if (m_count.fetch_add(-1, std::memory_order_acquire) < 1)
        {
            // We need to wait for a writer.
            m_rdwset.dec();
        }
    }

    void unlock_shared()
    {
        if (m_count.fetch_add(1, std::memory_order_release) < 0)
        {
            // There is a writer
            if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1)
            {
                // We need to wake the writer up
                m_wrwset.inc();
            }
        }
    }


    // WRITE, more hefty
    void lock()
    {
        // Acquire exclusive access
        m_wrlock.lock();

        // we are the only thread in here now.

        // Gain write access wrt m_count
        long count = m_count.fetch_add(-RWMUTEX_COUNT_MAX, std::memory_order_acquire);

        // count can never be negative.
        if (count < RWMUTEX_COUNT_MAX)
        {
            // We detected readers.
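            // Accounting note: m_count starts at RWMUTEX_COUNT_MAX and every
            // reader subtracts 1, so after subtracting RWMUTEX_COUNT_MAX here,
            // RWMUTEX_COUNT_MAX - count is the number of readers currently
            // holding shared access. Credit m_rdwake with that many pending
            // reader exits; each unlock_shared() decrements it, and the reader
            // that brings it to zero signals m_wrwset, so we only sleep below
            // if some readers are still inside.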
            long rdwake = m_rdwake.fetch_add(RWMUTEX_COUNT_MAX - count,
                                             std::memory_order_acquire);

            if (rdwake + RWMUTEX_COUNT_MAX - count)
            {
                // Okay, we need to wait for all of the readers
                // The number of readers is actually
                // RWMUTEX_COUNT_MAX - count
                m_wrwset.dec();
            }
        }
    }

    // write unlock
    void unlock()
    {
        // Release write access wrt m_count
        long count = m_count.fetch_add(RWMUTEX_COUNT_MAX, std::memory_order_release);

        if (count < 0)
        {
            // We need to wake -count readers.
            m_rdwset.add(-count);
        }

        // Release exclusive access
        m_wrlock.unlock();
    }
};


struct ct_simple_stack
{
    struct node
    {
        node* m_next;
        unsigned int m_tid;

        node(unsigned int tid) : m_tid(tid) {}
    };

    node* m_head;

    ct_simple_stack() : m_head(nullptr) {}

    ~ct_simple_stack()
    {
        ct_simple_stack::node* n = flush();

        while (n)
        {
            ct_simple_stack::node* next = n->m_next;
            delete n;
            n = next;
        }
    }

    void push(node* n)
    {
        n->m_next = m_head;
        m_head = n;
    }

    node* pop()
    {
        node* n = m_head;

        if (n)
        {
            m_head = n->m_next;
        }

        return n;
    }

    node* flush()
    {
        node* n = m_head;
        m_head = nullptr;
        return n;
    }
};


struct ct_shared
{
    std::atomic<bool> m_run;

    // protected by m_std_rwmutex
    std::uint64_t m_reads;
    std::uint64_t m_writes;

    ct_simple_stack m_stack;
    ct_simple_stack m_stack_dtor;

#if defined (CT_TEST_FAST_MUTEX)
    ct_rwmutex m_std_rwmutex;
#else
    std::shared_mutex m_std_rwmutex;
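(The listing is truncated here in the digest; the complete program is at the
pastebin link above.) As a rough illustration of the read/write pattern the
benchmark exercises, here is a minimal sketch, not the pastebin code: it
reuses ct_rwmutex and ct_simple_stack from the listing, while the g_* and
ct_sketch_* names, the 1000-node list, and the 3-second run are illustrative
assumptions. A reader iterates the shared list under lock_shared(); a writer
pops and pushes one node under lock(), never calling new or delete inside the
loop.
___________________________________
static ct_rwmutex g_rwmutex;
static ct_simple_stack g_stack;
static std::atomic<bool> g_run{ true };
static std::atomic<std::uint64_t> g_reads{ 0 };
static std::atomic<std::uint64_t> g_writes{ 0 };

// reader: walk the shared list under shared (read) access
void ct_sketch_reader()
{
    while (g_run.load(std::memory_order_relaxed))
    {
        g_rwmutex.lock_shared();

        for (ct_simple_stack::node* n = g_stack.m_head; n; n = n->m_next)
        {
            // read-only pass over the nodes...
        }

        g_rwmutex.unlock_shared();

        g_reads.fetch_add(1, std::memory_order_relaxed);
    }
}

// writer: pop one node and push it right back under exclusive (write) access,
// so there is no new or delete inside the loop
void ct_sketch_writer()
{
    while (g_run.load(std::memory_order_relaxed))
    {
        g_rwmutex.lock();

        ct_simple_stack::node* n = g_stack.pop();
        if (n) g_stack.push(n);

        g_rwmutex.unlock();

        g_writes.fetch_add(1, std::memory_order_relaxed);
    }
}

int main()
{
    // pre-populate the shared list (NODE_PRIME nodes in the real test)
    for (unsigned int i = 0; i < 1000; ++i)
    {
        g_stack.push(new ct_simple_stack::node(i));
    }

    std::thread r(ct_sketch_reader);
    std::thread w(ct_sketch_writer);

    std::this_thread::sleep_for(std::chrono::seconds(3));
    g_run.store(false, std::memory_order_relaxed);

    r.join();
    w.join();

    std::cout << "reads:  " << g_reads.load() << "\n";
    std::cout << "writes: " << g_writes.load() << "\n";

    return 0;
}
___________________________________

Swapping ct_rwmutex for std::shared_mutex in a sketch like this is the same
comparison the CT_TEST_FAST_MUTEX define toggles in the full benchmark.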