Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 08:32AM +0200

> for( size_t i = 0; i < n; i+=2 )
>     p[i] = ~p[i];
> }

If I didn't make a mistake this should be faster:

void invertSecondD( uint8_t *p, size_t n )
{
    if( !n-- )
        return;
    ++p;
    size_t head = (((intptr_t)p + 7) & (intptr_t)-8) - (intptr_t)p;
    head = head >= n ? head : n;
    for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
        *w = ~*w;
    if( head <= n )
        return;
    int64_t odd = (int64_t)p & 1;
    n -= head;
    p += head;
    if( n / 8 )
    {
        union
        {
            uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 };
            uint64_t mask;
        };
        uint64_t *w = (uint64_t *)p, *end = w + n / 8;
        mask ^= -odd;
        do
            *w ^= mask;
        while( w != end );
        p = (uint8_t *)w;
        n = n % 8;
    }
    if( n <= (size_t)odd )
        return;
    p += (size_t)odd;
    n -= (size_t)odd;
    for( uint8_t *end = p + n; p < n; p += 2 )
        p = ~*p;
}

Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 08:38AM +0200

>     for( uint8_t *end = p + n; p < n; p += 2 )
>         p = ~*p;

    for( uint8_t *end = p + n; p < end; p += 2 )
        *p = ~*p;

James Kuyper <jameskuyper@alumni.caltech.edu>: Mar 29 11:54AM -0400

On 3/29/20 11:40 AM, Frederick Gotham wrote:
...
> I use "bind2nd" with "bit_xor" and "0xFF" in this snippet because I
> can't find the unary "bit_not" class. Is there such a thing?

The bit_not class is described in 23.14.9.4, immediately after
23.14.9.3, which describes the bit_xor class.

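[Aside: a minimal sketch of std::bit_not standing in for the
bind2nd/bit_xor/0xFF combination; the buffer name and size below are
placeholders, not taken from Gotham's original snippet:]

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

int main()
{
    std::vector<uint8_t> data( 1024, 0x55 );   // placeholder buffer
    // bit_not is the unary counterpart of bit_xor, in <functional> since C++14
    std::bit_not<uint8_t> inv;
    for( std::size_t i = 1; i < data.size(); i += 2 )
        data[i] = inv( data[i] );   // same effect as data[i] = ~data[i]
}
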
Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 06:00PM +0200

>> for( int i = 0; i < g_LEN; ++i ) { if( i%2 ) { data[i] = ~data[i]; }
> Assuming we have a 2s-complement this would be faster:

Or maybe not, because the branch-prediction is able
to predict the regular pattern of your branches.

Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 05:56PM +0200

> for( int i = 0; i < g_LEN; ++i ) { if( i%2 ) { data[i] = ~data[i]; }

Assuming we have a 2s-complement this would be faster:

void invertSecond( uint8_t *p, size_t n )
{
    for( size_t i = 0; i != n; ++i )
        p[i] ^= -(int8_t)(i & 1);
}

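[Aside: why the XOR trick works — under two's complement,
-(int8_t)(i & 1) evaluates to 0x00 for even i and 0xFF for odd i, so
the XOR is a no-op on even bytes and a full bitwise NOT on odd ones.
A small self-check, illustrative only and not from the original post:]

#include <cassert>
#include <cstdint>

int main()
{
    for( unsigned i = 0; i < 4; ++i )
    {
        // two's complement: -(int8_t)0 == 0x00, -(int8_t)1 == 0xFF as a byte
        uint8_t mask = (uint8_t)-(int8_t)(i & 1);
        assert( mask == ( i % 2 ? 0xFF : 0x00 ) );
    }
    uint8_t x = 0x5A;
    assert( (uint8_t)(x ^ 0xFF) == (uint8_t)~x );   // XOR with all-ones == NOT
}
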
Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 06:26PM +0200

> Or maybe not, because the branch-prediction is able
> to predict the regular pattern of your branches.

I tested it; my variant is faster even though the loop is much larger:

#include <iostream>
#include <chrono>
#include <cstdint>
#include <vector>
#include <algorithm>

using namespace std;
using namespace chrono;

void invertSecondA( uint8_t *p, size_t n )
{
    for( size_t i = 0; i != n; ++i )
        p[i] ^= -(int8_t)(i & 1);
}

void invertSecondB( uint8_t *p, size_t n )
{
    for( size_t i = 0; i != n; ++i )
        if( i % 2 )
            p[i] = ~p[i];
}

int main()
{
    size_t const SIZE   = 1024,       // fits in L1
                 ROUNDS = 1'000'000;
    vector<uint8_t> v( SIZE, 0 );
    time_point<high_resolution_clock> start = high_resolution_clock::now();
    for( size_t round = ROUNDS; round; --round )
        invertSecondA( &v[0], SIZE );
    double sA = (double)duration_cast<nanoseconds>(
        high_resolution_clock::now() - start ).count() / 1.0E9;
    start = high_resolution_clock::now();
    for( size_t round = ROUNDS; round; --round )
        invertSecondB( &v[0], SIZE );
    double sB = (double)duration_cast<nanoseconds>(
        high_resolution_clock::now() - start ).count() / 1.0E9;
    cout << sA << endl << sB << endl;
}

These are the MSVC 2019 times:
0.533812
1.1952

And these are the g++ 6.3.0 times:
0.190201
0.401627

Christian Gollwitzer <auriocus@gmx.de>: Mar 29 06:39PM +0200

On 29.03.20 at 18:26, Bonita Montero wrote:
>     if( i % 2 )
>         p[i] = ~p[i];
> }

How about:

for( size_t i = 0; i < n; i+=2 )
    p[i] = ~p[i];
}

?

Christian

Bonita Montero <Bonita.Montero@gmail.com>: Mar 29 06:49PM +0200

> for( size_t i = 0; i < n; i+=2 )
>     p[i] = ~p[i];
> }

That's too simple. ;-)

Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 11:10AM +0200

This is a blocked version that adapts to 32- and 64-bitness:

void invertSecondBlocked( uint8_t *p, size_t n )
{
    if( !n-- )
        return;
    ++p;
    size_t head = (((intptr_t)p + 7) & (intptr_t)-8) - (intptr_t)p;
    head = head <= n ? head : n;
    for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
        *w = ~*w;
    if( n == head )
        return;
    size_t odd = (size_t)p & 1; // assume size_t or ptrdiff_t is our register width
    n -= head;
    p += head;
    if constexpr( sizeof(size_t) == 8 )
    {
        if( n / 8 )
        {
            union
            {
                uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 };
                size_t mask;
            };
            size_t *w = (size_t *)p, *end = w + n / 8;
            mask ^= -(ptrdiff_t)odd;
            do
                *w ^= mask;
            while( ++w != end );
            p = (uint8_t *)w;
            n = n % 8;
        }
    }
    else if constexpr( sizeof(size_t) == 4 )
        if( n / 4 )
        {
            union
            {
                uint8_t u8Mask[4] = { 0xFF, 0x00, 0xFF, 0x00 };
                size_t mask;
            };
            size_t *w = (size_t *)p, *end = w + n / 4;
            mask ^= -(ptrdiff_t)odd;
            do
                *w ^= mask;
            while( ++w != end );
            p = (uint8_t *)w;
            n = n % 4;
        }
    if( n <= odd )
        return;
    p += odd;
    n -= odd;
    uint8_t *end = p + n;
    do
        *p = ~*p;
    while( (p += 2) < end );
}

Melzzzzz <Melzzzzz@zzzzz.com>: Mar 30 09:39AM

>         *p = ~*p;
>     while( (p += 2) < end );
> }

What does this do?

Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 11:55AM +0200

On 30.03.2020 at 11:39, Melzzzzz wrote:
>>     while( (p += 2) < end );
>> }
> What does this do?

It inverts every second byte. First the head until the first aligned
uint64_t-block, then the uint64_t-blocks and then the tail. It's more
than 10 times faster than the fastest algorithm so far.

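[Aside: a quick way to sanity-check the blocked version against the
naive odd-index loop from earlier in the thread. The harness below is
illustrative only; it assumes the invertSecondBlocked definition
posted above is compiled and linked alongside it:]

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

void invertSecondBlocked( uint8_t *p, size_t n );   // definition as posted above

// naive reference: invert every byte at an odd index
static void invertSecondNaive( uint8_t *p, size_t n )
{
    for( size_t i = 1; i < n; i += 2 )
        p[i] = ~p[i];
}

int main()
{
    for( size_t n = 1; n <= 64; ++n )   // crosses the head/block/tail boundaries
    {
        std::vector<uint8_t> a( n ), b( n );
        for( size_t i = 0; i < n; ++i )
            a[i] = b[i] = (uint8_t)( i * 37 + 11 );
        invertSecondNaive( a.data(), n );
        invertSecondBlocked( b.data(), n );
        if( std::memcmp( a.data(), b.data(), n ) != 0 )
            std::printf( "mismatch at n=%zu\n", n );
    }
}
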
Melzzzzz <Melzzzzz@zzzzz.com>: Mar 30 09:59AM

> It inverts every second byte. First the head until the first aligned
> uint64_t-block, then the uint64_t-blocks and then the tail. It's more
> than 10 times faster than the fastest algorithm so far.

Have you measured? It looks pretty complicated for the task at hand.
Also I really doubt it is faster than an SSE2 solution...

Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 12:03PM +0200

> Have you measured? It looks pretty complicated for the task at hand.

There's no way to make it simpler.

> Also I really doubt it is faster than an SSE2 solution...

An SSE solution would be a bit more complex, and it wouldn't be
portable.

Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 12:37PM +0200

Here's an SSE-enabled version:

void invertSecondBlocked( uint8_t *p, size_t n )
{
    size_t const BLOCKSIZE = SSE_BLOCKS ? 16 : sizeof(size_t);
    if( !n-- )
        return;
    ++p;
    size_t head = (((intptr_t)p + (BLOCKSIZE - 1)) & -(intptr_t)BLOCKSIZE)
                  - (intptr_t)p;
    head = head <= n ? head : n;
    for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
        *w = ~*w;
    if( n == head )
        return;
    size_t odd = (size_t)p & 1; // assume size_t or ptrdiff_t is our register width
    n -= head;
    p += head;
    if constexpr( SSE_BLOCKS )
    {
        if( n / 16 )
        {
            union
            {
                uint8_t u8Mask[16] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                       0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 };
                __m128 mask;
            };
            static const union
            {
                uint8_t u8Invert[16] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
                __m128 invert;
            };
            __m128 *w = (__m128 *)p, *end = w + n / 16;
            if( odd )
                mask = _mm_xor_ps( mask, invert );
            do
                *w = _mm_xor_ps( mask, *w );
            while( ++w != end );
            p = (uint8_t *)w;
            n = n % 16;
        }
    }
    else if constexpr( sizeof(size_t) == 8 )
    {
        if( n / 8 )
        {
            union
            {
                uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 };
                size_t mask;
            };
            size_t *w = (size_t *)p, *end = w + n / 8;
            mask ^= -(ptrdiff_t)odd;
            do
                *w ^= mask;
            while( ++w != end );
            p = (uint8_t *)w;
            n = n % 8;
        }
    }
    else if constexpr( sizeof(size_t) == 4 )
    {
        if( n / 4 )
        {
            union
            {
                uint8_t u8Mask[4] = { 0xFF, 0x00, 0xFF, 0x00 };
                size_t mask;
            };
            size_t *w = (size_t *)p, *end = w + n / 4;
            mask ^= -(ptrdiff_t)odd;
            do
                *w ^= mask;
            while( ++w != end );
            p = (uint8_t *)w;
            n = n % 4;
        }
    }
    if( n <= odd )
        return;
    p += odd;
    n -= odd;
    uint8_t *end = p + n;
    do
        *p = ~*p;
    while( (p += 2) < end );
}

It's almost twice as fast on my CPU.

Bonita Montero <Bonita.Montero@gmail.com>: Mar 30 01:57PM +0200

This is SSE- as well as AVX-optimized. Unfortunately it runs only
slightly faster on my old Ryzen 1800X because the first two generations
of Ryzen CPUs split a 256-bit AVX operation into two 128-bit operations.

void invertSecondBlocked( uint8_t *p, size_t n )
{
    size_t const BLOCKSIZE = AVX_BLOCKS ? 32 : SSE_BLOCKS ? 16 : sizeof(size_t);
    if( !n-- )
        return;
    ++p;
    size_t head = (((intptr_t)p + BLOCKSIZE - 1) & -(intptr_t)BLOCKSIZE)
                  - (intptr_t)p;
    head = head <= n ? head : n;
    for( uint8_t *w = p, *end = p + head; w < end; w += 2 )
        *w = ~*w;
    if( n == head )
        return;
    size_t odd = (size_t)p & 1; // assume size_t or ptrdiff_t is our register width
    n -= head;
    p += head;
#if AVX_BLOCKS != 0
    if( n / 32 )
    {
        union
        {
            uint8_t u8Mask[32] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                   0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                   0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                   0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 };
            __m256 mask;
        };
        static const union
        {
            uint8_t u8Invert[32] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
            __m256 invert;
        };
        __m256 *w = (__m256 *)p, *end = w + n / 32;
        if( odd )
            mask = _mm256_xor_ps( mask, invert );
        do
            *w = _mm256_xor_ps( *w, mask );
        while( ++w != end );
        p = (uint8_t *)w;
        n = n % 32;
    }
#elif SSE_BLOCKS != 0
    if( n / 16 )
    {
        union
        {
            uint8_t u8Mask[16] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                                   0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 };
            __m128 mask;
        };
        static const union
        {
            uint8_t u8Invert[16] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
            __m128 invert;
        };
        __m128 *w = (__m128 *)p, *end = w + n / 16;
        if( odd )
            mask = _mm_xor_ps( mask, invert );
        do
            *w = _mm_xor_ps( *w, mask );
        while( ++w != end );
        p = (uint8_t *)w;
        n = n % 16;
    }
#else
    if constexpr( sizeof(size_t) == 8 )
    {
        if( n / 8 )
        {
            union
            {
                uint8_t u8Mask[8] = { 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 };
                size_t mask;
            };
            size_t *w = (size_t *)p, *end = w + n / 8;
            mask ^= -(ptrdiff_t)odd;
            do
                *w ^= mask;
            while( ++w != end );
            p = (uint8_t *)w;
            n = n % 8;
        }
    }
    else if constexpr( sizeof(size_t) == 4 )
    {
        if( n / 4 )
        {
            union
            {
                uint8_t u8Mask[4] = { 0xFF, 0x00, 0xFF, 0x00 };
                size_t mask;
            };
            size_t *w = (size_t *)p, *end = w + n / 4;
            mask ^= -(ptrdiff_t)odd;
            do
                *w ^= mask;
            while( ++w != end );
            p = (uint8_t *)w;
            n = n % 4;
        }
    }
#endif
    if( n <= odd )
        return;
    p += odd;
    n -= odd;
    uint8_t *end = p + n;
    do
        *p = ~*p;
    while( (p += 2) < end );
}