Incorrect AVX2 Adler implementation #1250
-
I have a working Sse3 implementation but I've done something silly porting it to Avx2. I assumed it would be a case of bumping the size + offsets but I'm obviously wrong. If someone can point me in the right direction that would be great. @saucecontrol @antonfirsov @tannergooding // Apologies for handle spamming.

Avx2

private static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Process the data in blocks.
    const int BLOCK_SIZE = 64;
    uint length = (uint)buffer.Length;
    uint blocks = length / BLOCK_SIZE;
    length -= blocks * BLOCK_SIZE;
    int index = 0;

    fixed (byte* bufferPtr = buffer)
    fixed (byte* tapPtr = Tap1Tap2)
    {
        index += (int)blocks * BLOCK_SIZE;
        var localBufferPtr = bufferPtr;

        // _mm_setr_epi8 on x86
        Vector256<sbyte> tap1 = Avx.LoadVector256((sbyte*)tapPtr);
        Vector256<sbyte> tap2 = Avx.LoadVector256((sbyte*)(tapPtr + 0x20));
        Vector256<byte> zero = Vector256<byte>.Zero;
        var ones = Vector256.Create((short)1);

        while (blocks > 0)
        {
            uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
            if (n > blocks)
            {
                n = blocks;
            }

            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector256<uint> v_ps = Vector256.CreateScalar(s1 * n);
            Vector256<uint> v_s2 = Vector256.CreateScalar(s2);
            Vector256<uint> v_s1 = Vector256<uint>.Zero;

            do
            {
                // Load 64 input bytes.
                Vector256<byte> bytes1 = Avx.LoadDquVector256(localBufferPtr);
                Vector256<byte> bytes2 = Avx.LoadDquVector256(localBufferPtr + 0x20);

                // Add previous block byte sum to v_ps.
                v_ps = Avx2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                v_s1 = Avx2.Add(v_s1, Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                Vector256<short> mad1 = Avx2.MultiplyAddAdjacent(bytes1, tap1);
                v_s2 = Avx2.Add(v_s2, Avx2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
                v_s1 = Avx2.Add(v_s1, Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                Vector256<short> mad2 = Avx2.MultiplyAddAdjacent(bytes2, tap2);
                v_s2 = Avx2.Add(v_s2, Avx2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                localBufferPtr += BLOCK_SIZE;
            }
            while (--n > 0);

            v_s2 = Avx2.Add(v_s2, Avx2.ShiftLeftLogical(v_ps, 5));

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            const byte S2301 = 0b1011_0001; // A B C D -> B A D C
            const byte S1032 = 0b0100_1110; // A B C D -> C D A B

            v_s1 = Avx2.Add(v_s1, Avx2.Shuffle(v_s1, S1032));
            s1 += v_s1.ToScalar();
            v_s2 = Avx2.Add(v_s2, Avx2.Shuffle(v_s2, S2301));
            v_s2 = Avx2.Add(v_s2, Avx2.Shuffle(v_s2, S1032));
            s2 = v_s2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (length > 0)
        {
            if (length >= 16)
            {
                s2 += s1 += localBufferPtr[0];
                s2 += s1 += localBufferPtr[1];
                s2 += s1 += localBufferPtr[2];
                s2 += s1 += localBufferPtr[3];
                s2 += s1 += localBufferPtr[4];
                s2 += s1 += localBufferPtr[5];
                s2 += s1 += localBufferPtr[6];
                s2 += s1 += localBufferPtr[7];
                s2 += s1 += localBufferPtr[8];
                s2 += s1 += localBufferPtr[9];
                s2 += s1 += localBufferPtr[10];
                s2 += s1 += localBufferPtr[11];
                s2 += s1 += localBufferPtr[12];
                s2 += s1 += localBufferPtr[13];
                s2 += s1 += localBufferPtr[14];
                s2 += s1 += localBufferPtr[15];

                localBufferPtr += 16;
                length -= 16;
            }

            while (length-- > 0)
            {
                s2 += s1 += *localBufferPtr++;
            }

            if (s1 >= BASE)
            {
                s1 -= BASE;
            }

            s2 %= BASE;
        }

        return s1 | (s2 << 16);
    }
}

Sse3

private static unsafe uint CalculateSse3(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Process the data in blocks.
    const int BLOCK_SIZE = 32;
    uint length = (uint)buffer.Length;
    uint blocks = length / BLOCK_SIZE;
    length -= blocks * BLOCK_SIZE;
    int index = 0;

    fixed (byte* bufferPtr = buffer)
    fixed (byte* tapPtr = Tap1Tap2)
    {
        index += (int)blocks * BLOCK_SIZE;
        var localBufferPtr = bufferPtr;

        // _mm_setr_epi8 on x86
        Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
        Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
        Vector128<byte> zero = Vector128<byte>.Zero;
        var ones = Vector128.Create((short)1);

        while (blocks > 0)
        {
            uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
            if (n > blocks)
            {
                n = blocks;
            }

            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
            Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
            Vector128<uint> v_s1 = Vector128<uint>.Zero;

            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                localBufferPtr += BLOCK_SIZE;
            }
            while (--n > 0);

            v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            const byte S2301 = 0b1011_0001; // A B C D -> B A D C
            const byte S1032 = 0b0100_1110; // A B C D -> C D A B

            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
            s1 += v_s1.ToScalar();
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
            s2 = v_s2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (length > 0)
        {
            if (length >= 16)
            {
                s2 += s1 += localBufferPtr[0];
                s2 += s1 += localBufferPtr[1];
                s2 += s1 += localBufferPtr[2];
                s2 += s1 += localBufferPtr[3];
                s2 += s1 += localBufferPtr[4];
                s2 += s1 += localBufferPtr[5];
                s2 += s1 += localBufferPtr[6];
                s2 += s1 += localBufferPtr[7];
                s2 += s1 += localBufferPtr[8];
                s2 += s1 += localBufferPtr[9];
                s2 += s1 += localBufferPtr[10];
                s2 += s1 += localBufferPtr[11];
                s2 += s1 += localBufferPtr[12];
                s2 += s1 += localBufferPtr[13];
                s2 += s1 += localBufferPtr[14];
                s2 += s1 += localBufferPtr[15];

                localBufferPtr += 16;
                length -= 16;
            }

            while (length-- > 0)
            {
                s2 += s1 += *localBufferPtr++;
            }

            if (s1 >= BASE)
            {
                s1 -= BASE;
            }

            s2 %= BASE;
        }

        return s1 | (s2 << 16);
    }
}
-
I've not looked more in depth yet (won't have time until later tonight), but the typical issue is that the 256-bit versions don't operate as if it is a contiguous 256-bit vector. Instead, each operates as 2x128-bit vectors. For something like SumAbsoluteDifferences:

Vector128<ushort> SumAbsoluteDifferences(Vector128<byte> left, Vector128<byte> right)
{
    var result = Vector128<ushort>.Zero;
    for (int i = 0; i < 16; i++)
    {
        // Accumulate the absolute differences (simplified: the real instruction
        // splits the 16 bytes into two groups of 8).
        ushort tmp = result.GetElement(0);
        tmp += (ushort)Math.Abs(left.GetElement(i) - right.GetElement(i));
        result = result.WithElement(0, tmp);
    }
    return result;
}

Vector256<ushort> SumAbsoluteDifferences(Vector256<byte> left, Vector256<byte> right)
{
    // The 256-bit form is effectively two independent 128-bit operations,
    // one per lane.
    var lower = SumAbsoluteDifferences(left.GetLower(), right.GetLower());
    var upper = SumAbsoluteDifferences(left.GetUpper(), right.GetUpper());
    return Vector256.Create(lower, upper);
}
-
s1 += v_s1.ToScalar();

This will be incorrect in the AVX2 implementation, because it only takes the low element of the low 128-bit lane. You'll want to sum the lanes or get the first element of the upper lane and sum the scalars. Either

s1 += Sse2.Add(v_s1.GetLower(), v_s1.GetUpper()).ToScalar();

or

s1 += v_s1.ToScalar() + v_s1.GetUpper().ToScalar();

would be the equivalent. The rest of the code looks to stay safely in-lane, so it should be good assuming your

Oh, and I seem to recall that both @tannergooding and I recommended you change your original SSSE3 code from using

s1 += Sse2.ConvertToInt32(v_s1);

to

s1 += v_s1.ToScalar();

for readability, but I didn't realize the integer versions of
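Putting the two pieces together, one option is a small helper that folds the upper lane into the lower one and then finishes the within-lane sum. This is only a sketch and untested; the name ReduceLanes is made up, not part of the posted code:

private static uint ReduceLanes(Vector256<uint> v)
{
    // Fold the upper 128-bit lane onto the lower one, then horizontally add the
    // four remaining uints with the same shuffle/add pattern as the SSE version.
    Vector128<uint> sum = Sse2.Add(v.GetLower(), v.GetUpper());
    sum = Sse2.Add(sum, Sse2.Shuffle(sum, 0b0100_1110)); // A B C D -> C D A B
    sum = Sse2.Add(sum, Sse2.Shuffle(sum, 0b1011_0001)); // A B C D -> B A D C
    return sum.ToScalar();
}

The accumulation lines would then read s1 += ReduceLanes(v_s1); and s2 = ReduceLanes(v_s2); (the extra shuffle is harmless for v_s1, which only holds non-zero values in two elements per lane).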
-
Thanks both, the lightbulb is starting to dimly glow above my head... I'm assuming

Errmmm.... I think I have in my local version.
-
Yep, as @tannergooding said, you can think of the AVX2 version as being 2 instances of the SSSE3 version running in parallel. When you accumulate at the end, the shuffle/add pairs will total everything into the low element, but you have 2 of them to deal with. |
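For what it's worth, a quick way to catch this kind of lane bug is to compare the vectorized result against the plain scalar definition on a buffer of random bytes. A minimal sketch, assuming the two methods above are reachable from a test; Adler32Scalar below is just the textbook definition with BASE = 65521, and the buffer size and seed are arbitrary:

static uint Adler32Scalar(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;
    foreach (byte b in buffer)
    {
        s1 = (s1 + b) % 65521;
        s2 = (s2 + s1) % 65521;
    }
    return s1 | (s2 << 16);
}

var data = new byte[8192];
new Random(42).NextBytes(data);
Console.WriteLine(Adler32Scalar(1u, data) == CalculateSse3(1u, data)); // expect True
Console.WriteLine(Adler32Scalar(1u, data) == CalculateAvx2(1u, data)); // expect False with the port as posted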