Wednesday, February 20, 2019

Digest for comp.lang.c++@googlegroups.com - 25 updates in 5 topics

woodbrian77@gmail.com: Feb 20 11:58AM -0800

Shalom
 
I've been working on this program:
https://github.com/Ebenezer-group/onwards/blob/master/src/cmw/tiers/cmwA.cc
 
for a number of years. I haven't watched this talk:
https://www.reddit.com/r/cpp/comments/asmpj3/continuable_asynchronous_programming_with/
 
but I wonder if there's anything there that could help me
make my program better. I've managed to reduce the size of
the event loop over the years with traditional refactoring.
I don't think the size of it is a big problem, but... Thanks.
 
 
Brian
Ebenezer Enterprises - Enjoying programming again.
http://webEbenezer.net
"Chris M. Thomasson" <invalid_chris_thomasson_invalid@invalid.com>: Feb 19 10:49PM -0800

Fwiw, I wrote a crude new benchmark that measures how many reads and
writes can be performed in a given amount of time: my algorithm vs.
std::shared_mutex. We are _primarily_ looking at how many reads can be
performed in this test within 60 seconds. The number of threads is
variable and is determined by std::thread::hardware_concurrency() *
THREADS, with THREADS set to 8 in the test. So on my system the setup is:
___________________________________
cpu_threads_n = 4
threads_n = 32
writers = 16
readers = 16
test duration = 60 seconds
___________________________________
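 
Those numbers come straight from the knobs at the top of the listing.
Roughly speaking, the setup boils down to something like this (the exact
code is in the full program; this is just the arithmetic):
 
    unsigned int cpu_threads_n = std::thread::hardware_concurrency(); // 4 here
    unsigned int threads_n = cpu_threads_n * THREADS; // 4 * 8 = 32
    unsigned int writers = threads_n / 2; // 16
    unsigned int readers = threads_n / 2; // 16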
 
This is going to be different for each system. Readers iterate a shared
linked list with 1000000 nodes in this test. Writers pop and push items,
and never use new or delete; there is a rough sketch of the reader and
writer loops just before the full listing below. So far, the timings I am
getting on my end using MSVC 2017 are:
 
 
Testing: Chris M. Thomasson's Experimental Read/Write Mutex
 
___________________________________
cpu_threads_n = 4
threads_n = 32
writers = 16
readers = 16
test duration = 60 seconds
___________________________________
 
 
___________________________________
Raw Reads: 18046464
Raw Writes: 347498
reads_per_tick = 300693
writes_per_tick = 5790
Ticks = 60.0161
___________________________________
 
 
 
Testing: std::shared_mutex
 
___________________________________
cpu_threads_n = 4
threads_n = 32
writers = 16
readers = 16
test duration = 60 seconds
___________________________________
 
 
___________________________________
Raw Reads: 650006
Raw Writes: 168741
reads_per_tick = 10829
writes_per_tick = 2811
Ticks = 60.0193
___________________________________
 
 
 
As you can see, my algorithm is performing many more reads than
std::shared_mutex. Anyway, here is my code:
 
When you get some time, can you please give it a go? I want to see
timings on other systems, the good, bad and ugly. ;^) Thanks.
 
https://pastebin.com/raw/1QtPCGhV
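 
Before the listing, here is a stripped-down sketch of what the reader and
writer threads do. The function names are just for illustration and the
real loops also track the timing and the read/write counters, but the
locking pattern is the same: readers walk the shared stack under a shared
lock, and writers pop a node and push it right back under the exclusive
lock, never calling new or delete.
 
    void reader_loop(ct_shared& shared)
    {
        while (shared.m_run.load(std::memory_order_relaxed))
        {
            shared.m_std_rwmutex.lock_shared();
 
            // iterate the shared list...
            for (ct_simple_stack::node* n = shared.m_stack.m_head;
                 n != nullptr; n = n->m_next)
            {
                // read n->m_tid, count one read, ...
            }
 
            shared.m_std_rwmutex.unlock_shared();
        }
    }
 
    void writer_loop(ct_shared& shared)
    {
        while (shared.m_run.load(std::memory_order_relaxed))
        {
            shared.m_std_rwmutex.lock();
 
            // pop a node and push it right back: no new, no delete
            ct_simple_stack::node* n = shared.m_stack.pop();
            if (n) shared.m_stack.push(n);
 
            shared.m_std_rwmutex.unlock();
        }
    }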
___________________________________
/* Crude Read/Write Mutex Benchmark
 
This tests my algorithm ct_rwmutex vs. std::shared_mutex.
 
It shows how many reads and writes can be performed within
a fixed amount of time.
 
by: Chris M. Thomasson
__________________________________________*/
 
 
 
#include <thread>
#include <atomic>
#include <shared_mutex>
#include <condition_variable>
#include <iostream>
#include <functional>
#include <chrono>
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <climits>
#include <cstdint>
#include <vector>
 
 
// undefine to test std::shared_mutex
#define CT_TEST_FAST_MUTEX 1
 
 
#define THREADS 8 // multiplied by std::hardware_concurrency
#define NODE_PRIME 1000000 // number of nodes
#define CRUDE_CACHE_PAD 256 // crude cache pad
#define TEST_DURATION_SEC 60 // number of seconds for the test
 
 
// bare bones mutex/condvar based semaphore
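// inc() releases a single permit, add() releases a whole batch in one
// shot, and dec() blocks until a permit is available. The write-unlock
// path uses add() to wake every waiting reader at once (see
// ct_rwmutex::unlock below).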
struct ct_slow_semaphore
{
    unsigned long m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_slow_semaphore(unsigned long state) : m_state(state) {}

    void inc()
    {
        m_mutex.lock();
        ++m_state;
        m_mutex.unlock();
        m_cond.notify_one();
    }

    void add(unsigned long addend)
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            m_state += addend;
        }

        m_cond.notify_all();
    }

    void dec()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == 0) m_cond.wait(lock);
        --m_state;
    }
};
 
 
 
 
// bin-sema
struct ct_auto_reset_event
{
    bool m_state;
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_auto_reset_event() : m_state(false) {}

    void signal()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_state = true;
        m_cond.notify_one();
    }

    void wait()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == false) m_cond.wait(lock);
        m_state = false; // auto-reset
    }
};
 
 
 
 
// just a layer over an auto-reset event
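// m_state encodes three cases: 0 = unlocked, 1 = locked with no waiters,
// 2 = locked with possible waiters. unlock() only signals the waitset
// when it observes a 2, so an uncontended lock/unlock pair is just one
// exchange in each direction.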
struct ct_fast_mutex
{
    std::atomic<unsigned int> m_state;
    ct_auto_reset_event m_waitset;

    ct_fast_mutex() : m_state(0) {}

    void lock()
    {
        if (m_state.exchange(1, std::memory_order_acquire))
        {
            while (m_state.exchange(2, std::memory_order_acquire))
            {
                m_waitset.wait();
            }
        }
    }

    void unlock()
    {
        if (m_state.exchange(0, std::memory_order_release) == 2)
        {
            m_waitset.signal();
        }
    }
};
 
 
 
// Chris M. Thomasson's Experimental Read/Write Mutex
// Yeah, it is pretty damn fat wrt the state, however
// it has some interesting properties...
// The state can be compressed a bit...
// btw, it has no loops...
// Take a look at the lock_shared and unlock_shared functions
 
// The number of readers _must_ never exceed LONG_MAX!
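// m_count starts at RWMUTEX_COUNT_MAX. Each reader subtracts 1 and a
// writer subtracts RWMUTEX_COUNT_MAX, so a reader that sees a
// non-positive result knows a writer is active, and the writer can
// recover the number of in-flight readers from how far below
// RWMUTEX_COUNT_MAX the count has fallen.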
 
#define RWMUTEX_COUNT_MAX LONG_MAX
 
struct ct_rwmutex
{
    unsigned char m_crude_cache_pad_0[CRUDE_CACHE_PAD];

    // shared state
    std::atomic<long> m_count;
    unsigned char m_crude_cache_pad_1[CRUDE_CACHE_PAD];

    std::atomic<long> m_rdwake;
    unsigned char m_crude_cache_pad_2[CRUDE_CACHE_PAD];

    ct_slow_semaphore m_rdwset;
    unsigned char m_crude_cache_pad_3[CRUDE_CACHE_PAD];

    ct_slow_semaphore m_wrwset;
    unsigned char m_crude_cache_pad_4[CRUDE_CACHE_PAD];

    ct_fast_mutex m_wrlock;
    unsigned char m_crude_cache_pad_5[CRUDE_CACHE_PAD];


    ct_rwmutex() :
        m_count(RWMUTEX_COUNT_MAX),
        m_rdwake(0),
        m_rdwset(0),
        m_wrwset(0) {
    }


    // READ, pretty slim...
    void lock_shared()
    {
        if (m_count.fetch_add(-1, std::memory_order_acquire) < 1)
        {
            // We need to wait for a writer.
            m_rdwset.dec();
        }
    }

    void unlock_shared()
    {
        if (m_count.fetch_add(1, std::memory_order_release) < 0)
        {
            // There is a writer
            if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1)
            {
                // We need to wake the writer up
                m_wrwset.inc();
            }
        }
    }


    // WRITE, more hefty
    void lock()
    {
        // Acquire exclusive access
        m_wrlock.lock();

        // we are the only thread in here now.

        // Gain write access wrt m_count
        long count = m_count.fetch_add(-RWMUTEX_COUNT_MAX,
            std::memory_order_acquire);

        // count can never be negative.
        if (count < RWMUTEX_COUNT_MAX)
        {
            // We detected readers.
            long rdwake = m_rdwake.fetch_add(RWMUTEX_COUNT_MAX - count,
                std::memory_order_acquire);

            if (rdwake + RWMUTEX_COUNT_MAX - count)
            {
                // Okay, we need to wait for all of the readers
                // The number of readers is actually
                // RWMUTEX_COUNT_MAX - count
                m_wrwset.dec();
            }
        }
    }

    // write unlock
    void unlock()
    {
        // Release write access wrt m_count
        long count = m_count.fetch_add(RWMUTEX_COUNT_MAX,
            std::memory_order_release);

        if (count < 0)
        {
            // We need to wake -count readers.
            m_rdwset.add(-count);
        }

        // Release exclusive access
        m_wrlock.unlock();
    }
};
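 
// Usage sketch, not part of the benchmark itself: ct_rwmutex exposes
// lock()/unlock() and lock_shared()/unlock_shared(), so the standard
// guards can drive it just like std::shared_mutex, e.g.:
//
//     ct_rwmutex m;
//
//     { std::unique_lock<ct_rwmutex> w(m); /* exclusive writer section */ }
//     { std::shared_lock<ct_rwmutex> r(m); /* shared reader section */ }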
 
 
struct ct_simple_stack
{
    struct node
    {
        node* m_next;
        unsigned int m_tid;

        node(unsigned int tid) : m_tid(tid) {}
    };

    node* m_head;

    ct_simple_stack() : m_head(nullptr) {}

    ~ct_simple_stack()
    {
        ct_simple_stack::node* n = flush();

        while (n)
        {
            ct_simple_stack::node* next = n->m_next;
            delete n;
            n = next;
        }
    }

    void push(node* n)
    {
        n->m_next = m_head;
        m_head = n;
    }

    node* pop()
    {
        node* n = m_head;

        if (n)
        {
            m_head = n->m_next;
        }

        return n;
    }

    node* flush()
    {
        node* n = m_head;
        m_head = nullptr;
        return n;
    }
};
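 
// Note: ct_simple_stack is a plain single-threaded LIFO. During the run it
// is only touched while m_std_rwmutex (below) is held, and writers recycle
// the popped nodes, so the hot path never calls new or delete.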
 
 
struct ct_shared
{
    std::atomic<bool> m_run;

    // protected by m_std_rwmutex
    std::uint64_t m_reads;
    std::uint64_t m_writes;

    ct_simple_stack m_stack;
    ct_simple_stack m_stack_dtor;

#if defined (CT_TEST_FAST_MUTEX)
    ct_rwmutex m_std_rwmutex;
#else
    std::shared_mutex m_std_rwmutex;
