Monday, March 30, 2020

Digest for comp.lang.c++@googlegroups.com - 21 updates in 2 topics

Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 08:32AM +0200

>     for( size_t i = 0; i < n; i+=2 )
>         p[i] = ~p[i];
> }
 
If I didn't make a mistake, this should be faster:
 
void invertSecondD( uint8_t *p, size_t n )
{
if( !n-- )
return;
++p;
size_t head = (((intptr_t)p + 7) & (intptr_t)-8) - (intptr_t)p;
head = head <= n ? head : n;
for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
*w = ~*w;
if( head == n )
return;
int64_t odd = (int64_t)p & 1;
n -= head;
p += head;
if( n / 8 )
{
union
{
uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00 };
uint64_t mask;
};
uint64_t *w = (uint64_t *)p,
*end = w + n / 8;
mask ^= -odd;
do
*w ^= mask;
while( ++w != end );
p = (uint8_t *)w;
n = n % 8;
}
if( n <= (size_t)odd )
return;
p += (size_t)odd;
n -= (size_t)odd;
for( uint8_t *end = p + n; p < n; p += 2 )
p = ~*p;
}
Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 08:38AM +0200

>     for( uint8_t *end = p + n; p < n; p += 2 )
>         p = ~*p;
 
for( uint8_t *end = p + n; p < end; p += 2 )
*p = ~*p;
James Kuyper <jameskuyper@alumni.caltech.edu>: Mar 29 11:54AM -0400

On 3/29/20 11:40 AM, Frederick Gotham wrote:
...
>I use "bind2nd" with "bit_xor" and "0xFF" in this snippet because I can't find the unary "bit_not" class. Is there such a thing?
 
The bit_not class is described in 23.14.9.4, immediately after
23.14.9.3, which describes the bit_xor class.
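
For reference, a minimal sketch of using it: std::bit_not lives in <functional> (since C++14) and works directly with std::transform. The function name invertAll and its parameters are placeholders, not from the original snippet:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>

// Invert every byte of p[0..n) in place with the unary functor.
void invertAll( uint8_t *p, size_t n )
{
    std::transform( p, p + n, p, std::bit_not<uint8_t>() );
}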
Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 06:00PM +0200

>>      for( int i = 0; i < g_LEN; ++i ) { if( i%2 ) { data[i] = ~data[i]; }
 
> Assuming we have a 2s-complement this would be faster:
 
Or maybe not, because branch prediction is able
to predict the regular pattern of your branches.
Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 05:56PM +0200

>     for( int i = 0; i < g_LEN; ++i ) { if( i%2 ) { data[i] = ~data[i]; }
 
Assuming two's complement, -(int8_t)(i & 1) is 0x00 for even i and 0xFF for odd i, so the XOR flips only the odd-indexed bytes; this should be faster:
 
void invertSecond( uint8_t *p, size_t n )
{
for( size_t i = 0; i != n; ++i )
p[i] ^= -(int8_t)(i & 1);
}
Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 06:26PM +0200

> Or maybe not because the branch-prediction is able
> to predict the regular pattern of your branches.
 
I tested it; my variant is faster even though the loop
body is much larger:
 
#include <iostream>
#include <chrono>
#include <cstdint>
#include <vector>
#include <algorithm>
 
using namespace std;
using namespace chrono;
 
void invertSecondA( uint8_t *p, size_t n )
{
for( size_t i = 0; i != n; ++i )
p[i] ^= -(int8_t)(i & 1);
}
 
void invertSecondB( uint8_t *p, size_t n )
{
for( size_t i = 0; i != n; ++i )
if( i % 2 )
p[i] = ~p[i];
}
 
int main()
{
size_t const SIZE = 1024, // fits in L1
ROUNDS = 1'000'000;
vector<uint8_t> v( SIZE, 0 );
time_point<high_resolution_clock> start = high_resolution_clock::now();
for( size_t round = ROUNDS; round; --round )
invertSecondA( &v[0], SIZE );
double sA = (double)duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count() / 1.0E9;
start = high_resolution_clock::now();
for( size_t round = ROUNDS; round; --round )
invertSecondB( &v[0], SIZE );
double sB = (double)duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count() / 1.0E9;
cout << sA << endl << sB << endl;
}
 
These are the MSVC 2019 times:
0.533812
1.1952
And these are the g++ 6.3.0 times:
0.190201
0.401627
Christian Gollwitzer <auriocus@gmx.de>: Mar 29 06:39PM +0200

On 29.03.20 at 18:26, Bonita Montero wrote:
>         if( i % 2 )
>             p[i] = ~p[i];
> }
 
How about:
for( size_t i = 0; i < n; i+=2 )
p[i] = ~p[i];
}
 
?
 
Christian
Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 06:49PM +0200

>       for( size_t i = 0; i < n; i+=2 )
>               p[i] = ~p[i];
>       }
 
That's too simple. ;-)
Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 11:10AM +0200

This is a blocked version that adapts to 32- and 64-bit platforms:
 
void invertSecondBlocked( uint8_t *p, size_t n )
{
if( !n-- )
return;
++p;
size_t head = (((intptr_t)p + 7) & (intptr_t)-8) - (intptr_t)p;
head = head <= n ? head : n;
for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
*w = ~*w;
if( n == head )
return;
size_t odd = (size_t)p & 1; // assume size_t or ptrdiff_t is our register width
n -= head;
p += head;
if constexpr( sizeof(size_t) == 8 )
{
if( n / 8 )
{
union
{
uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF,
0x00, 0xFF, 0x00 };
size_t mask;
};
size_t *w = (size_t *)p,
*end = w + n / 8;
mask ^= -(ptrdiff_t)odd;
do
*w ^= mask;
while( ++w != end );
p = (uint8_t *)w;
n = n % 8;
}
}
else if constexpr( sizeof(size_t) == 4 )
if( n / 4 )
{
union
{
uint8_t u8Mask[4] = { 0xFF, 0x00, 0xFF, 0x00 };
size_t mask;
};
size_t *w = (size_t *)p,
*end = w + n / 4;
mask ^= -(ptrdiff_t)odd;
do
*w ^= mask;
while( ++w != end );
p = (uint8_t *)w;
n = n % 4;
}
if( n <= odd )
return;
p += odd;
n -= odd;
uint8_t *end = p + n;
do
*p = ~*p;
while( (p += 2) < end );
}
Melzzzzz <Melzzzzz@zzzzz.com>: Mar 30 09:39AM

> *p = ~*p;
> while( (p += 2) < end );
> }
What does this do?
 
Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 11:55AM +0200

On 30.03.2020 at 11:39, Melzzzzz wrote:
>> while( (p += 2) < end );
>> }
> What does this do?
 
It inverts every second byte: first the head up to the first aligned
uint64_t block, then the uint64_t blocks, and then the tail. It's more
than 10 times faster than the fastest algorithm so far.
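
For what it's worth, a minimal sketch of a correctness check against the naive loop (invertSecondNaive and checkBlocked are hypothetical names; invertSecondBlocked is the function posted above):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Reference implementation: flip the odd-indexed bytes, as in the original loop.
void invertSecondNaive( uint8_t *p, size_t n )
{
    for( size_t i = 1; i < n; i += 2 )
        p[i] = ~p[i];
}

// Compare the blocked version against the reference for a range of lengths.
void checkBlocked()
{
    for( size_t n = 0; n < 64; ++n )
    {
        std::vector<uint8_t> a( n ), b( n );
        for( size_t i = 0; i < n; ++i )
            a[i] = b[i] = (uint8_t)(i * 37);
        invertSecondNaive( a.data(), n );
        invertSecondBlocked( b.data(), n );
        assert( a == b );
    }
}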
Melzzzzz <Melzzzzz@zzzzz.com>: Mar 30 09:59AM


> It inverts every second byte. First the head until the first aligned
> uint64_t-block, then the uint64_t-blocks and then the tail. It's more
> than 10 times faster than the fastest algorithm so far.
 
Have you measured? It looks pretty complicated for the task at hand.
Also, I really doubt it is faster than an SSE2 solution...
 
Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 12:03PM +0200

> Have you measured? It looks pretty complicated for task at hand?
 
There's no way to make it simpler.
 
> Also I really doubt it is faster then SSE2 solution...
 
An SSE solution would be a bit more complex and it wouldn't
be portable.
Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 12:37PM +0200

Here's an SSE-enabled version (SSE_BLOCKS is assumed to be a compile-time switch for SSE availability; the __m128 type and _mm_xor_ps come from <xmmintrin.h>):
 
void invertSecondBlocked( uint8_t *p, size_t n )
{
size_t const BLOCKSIZE = SSE_BLOCKS ? 16 : sizeof(size_t);
if( !n-- )
return;
++p;
size_t head = (((intptr_t)p + (BLOCKSIZE - 1)) &
-(intptr_t)BLOCKSIZE) - (intptr_t)p;
head = head <= n ? head : n;
for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
*w = ~*w;
if( n == head )
return;
size_t odd = (size_t)p & 1; // assume size_t or ptrdiff_t is our register width
n -= head;
p += head;
if constexpr( SSE_BLOCKS )
{
if( n / 16 )
{
union
{
uint8_t u8Mask[16] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF,
0x00, 0xFF, 0x00,
0xFF, 0x00, 0xFF, 0x00, 0xFF,
0x00, 0xFF, 0x00 };
__m128 mask;
};
static const union
{
uint8_t u8Invert[16] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF };
__m128 invert;
};
__m128 *w = (__m128 *)p,
*end = w + n / 16;
if( odd )
mask = _mm_xor_ps( mask, invert );
do
*w = _mm_xor_ps( mask, *w );
while( ++w != end );
p = (uint8_t *)w;
n = n % 16;
}
}
else if constexpr( sizeof(size_t) == 8 )
{
if( n / 8 )
{
union
{
uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF,
0x00, 0xFF, 0x00 };
size_t mask;
};
size_t *w = (size_t *)p,
*end = w + n / 8;
mask ^= -(ptrdiff_t)odd;
do
*w ^= mask;
while( ++w != end );
p = (uint8_t *)w;
n = n % 8;
}
}
else if constexpr( sizeof(size_t) == 4 )
{
if( n / 4 )
{
union
{
uint8_t u8Mask[4] = { 0xFF, 0x00, 0xFF, 0x00 };
size_t mask;
};
size_t *w = (size_t *)p,
*end = w + n / 4;
mask ^= -(ptrdiff_t)odd;
do
*w ^= mask;
while( ++w != end );
p = (uint8_t *)w;
n = n % 4;
}
}
if( n <= odd )
return;
p += odd;
n -= odd;
uint8_t *end = p + n;
do
*p = ~*p;
while( (p += 2) < end );
}
 
It's almost twice as fast on my CPU.
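
As an aside, the same inner block could also be written with SSE2's integer intrinsics instead of the float-typed __m128 / _mm_xor_ps pair; a sketch under that assumption (xorBlocksSSE2 is a hypothetical helper, not part of the posted code):

#include <cstddef>
#include <cstdint>
#include <emmintrin.h>

// XOR 'blocks' aligned 16-byte chunks starting at p with 'mask'.
// Precondition: blocks >= 1 and p is 16-byte aligned (as in the post,
// where this path is only entered when n / 16 != 0 after alignment).
void xorBlocksSSE2( uint8_t *p, size_t blocks, __m128i mask )
{
    __m128i *w = (__m128i *)p, *end = w + blocks;
    do
        _mm_store_si128( w, _mm_xor_si128( _mm_load_si128( w ), mask ) );
    while( ++w != end );
}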
Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 01:57PM +0200

This version is SSE- as well as AVX-optimized (the __m256 type and
_mm256_xor_ps come from <immintrin.h>).
Unfortunately it runs only slightly faster on my old Ryzen 1800X
because the first two generations of Ryzen CPUs split a 256-bit
AVX operation into two 128-bit operations.
 
void invertSecondBlocked( uint8_t *p, size_t n )
{
size_t const BLOCKSIZE = AVX_BLOCKS ? 32 : SSE_BLOCKS ? 16 :
sizeof(size_t);
if( !n-- )
return;
++p;
size_t head = (((intptr_t)p + BLOCKSIZE - 1) &
-(intptr_t)BLOCKSIZE) - (intptr_t)p;
head = head <= n ? head : n;
for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
*w = ~*w;
if( n == head )
return;
size_t odd = (size_t)p & 1; // assume size_t or ptrdiff_t is our register width
n -= head;
p += head;
#if AVX_BLOCKS != 0
if( n / 32 )
{
union
{
uint8_t u8Mask[32] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00,
0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00,
0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00,
0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00 };
__m256 mask;
};
static const union
{
uint8_t u8Invert[32] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF };
__m256 invert;
};
__m256 *w = (__m256 *)p,
*end = w + n / 32;
if( odd )
mask = _mm256_xor_ps( mask, invert );
do
*w = _mm256_xor_ps( *w, mask );
while( ++w != end );
p = (uint8_t *)w;
n = n % 32;
}
#elif SSE_BLOCKS != 0
if( n / 16 )
{
union
{
uint8_t u8Mask[16] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00,
0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
0xFF, 0x00 };
__m128 mask;
};
static const union
{
uint8_t u8Invert[16] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF };
__m128 invert;
};
__m128 *w = (__m128 *)p,
*end = w + n / 16;
if( odd )
mask = _mm_xor_ps( mask, invert );
do
*w = _mm_xor_ps( *w, mask );
while( ++w != end );
p = (uint8_t *)w;
n = n % 16;
}
#else
if constexpr( sizeof(size_t) == 8 )
{
if( n / 8 )
{
union
{
uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF,
0x00, 0xFF, 0x00 };
size_t mask;
};
size_t *w = (size_t *)p,
*end = w + n / 8;
mask ^= -(ptrdiff_t)odd;
do
*w ^= mask;
while( ++w != end );
p = (uint8_t *)w;
n = n % 8;
}
}
else if constexpr( sizeof(size_t) == 4 )
{
if( n / 4 )
{
union
{
uint8_t u8Mask[4] = { 0xFF, 0x00, 0xFF, 0x00 };
size_t mask;
};
size_t *w = (size_t *)p,
*end = w + n / 4;
mask ^= -(ptrdiff_t)odd;
do
*w ^= mask;
while( ++w != end );
p = (uint8_t *)w;
n = n % 4;
}
}
#endif
if( n <= odd )
return;
p += odd;
n -= odd;
uint8_t *end = p + n;
do
*p = ~*p;
while( (p += 2) < end );
}
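
The AVX_BLOCKS and SSE_BLOCKS switches aren't shown in the post; one plausible way to define them (an assumption, not the original code) is from the usual predefined macros, placed before the function together with <immintrin.h> for the __m128/__m256 intrinsics:

#include <immintrin.h>   // __m128, __m256, _mm_xor_ps, _mm256_xor_ps

// __AVX__ is defined by GCC/Clang with -mavx and by MSVC with /arch:AVX;
// __SSE2__ is defined by GCC/Clang, while 64-bit MSVC (_M_X64) always has SSE2.
#if defined(__AVX__)
#define AVX_BLOCKS 1
#define SSE_BLOCKS 1
#elif defined(__SSE2__) || defined(_M_X64)
#define AVX_BLOCKS 0
#define SSE_BLOCKS 1
#else
#define AVX_BLOCKS 0
#define SSE_BLOCKS 0
#endif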
