diff --git a/CHANGELOG.md b/CHANGELOG.md index a5f63a3..753bc55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +# 2022-07-17 +- Improve performance for XXH32 & XXH64 +- Improve performance for XXH128 & XXH3 # 2022-06-13 - Added xxHash3 # 2022-06-05 diff --git a/README.md b/README.md index f5e66e1..b55ddab 100644 --- a/README.md +++ b/README.md @@ -55,10 +55,10 @@ Runtime=.NET 6.0 | Hash32 | x64 | C | 140.2 ns | 129.6 us | 150.3 ms | 6.65 GB/s | | Hash64 | x64 | C# | 73.9 ns | 64.6 us | 81.4 ms | 12.28 GB/s | | Hash64 | x64 | C | 75.5 ns | 65.2 us | 84.5 ms | 11.83 GB/s | -| Hash128 (SSE2/AVX2)| x64 | C# | 151.6 ns | 64.5 us | 80.5 ms | 12.04 GB/s | -| Hash128 (SSE2/AVX2)| x64 | C | 84.4 ns | 38.3 us | 57.4 ms | 17.42 GB/s | -| Hash3 (SSE2/AVX2)| x64 | C# | 77.6 ns | 62.1 us | 78.5 ms | 12.08 GB/s | -| Hash3 (SSE2/AVX2)| x64 | C | 73.7 ns | 42.2 us | 59.8 ms | 16.72 GB/s | +| Hash128 (SSE2/AVX2)| x64 | C# | 84.95 ns | 56.9 us | 73.2 ms | 13.66 GB/s | +| Hash128 (SSE2/AVX2)| x64 | C | 84.35 ns | 38.1 us | 57.2 ms | 17.48 GB/s | +| Hash3 (SSE2/AVX2)| x64 | C# | 75.8 ns | 56.6 us | 74.6 ms | 13.40 GB/s | +| Hash3 (SSE2/AVX2)| x64 | C | 74.1 ns | 42.1 us | 59.5 ms | 16.80 GB/s | ## Api diff --git a/nuget.props b/nuget.props index 3b8485b..816d197 100644 --- a/nuget.props +++ b/nuget.props @@ -3,7 +3,7 @@ net6.0 Standart.Hash.xxHash - 4.0.3 + 4.0.4 Standart.Hash.xxHash Standart.Hash.xxHash Oleksandr Melnyk diff --git a/src/Standart.Hash.xxHash/__inline__xxHash128.cs b/src/Standart.Hash.xxHash/__inline__xxHash128.cs deleted file mode 100644 index 46ddbf4..0000000 --- a/src/Standart.Hash.xxHash/__inline__xxHash128.cs +++ /dev/null @@ -1,1834 +0,0 @@ -/* -* This is the auto generated code. -* All function calls are inlined in XXH3_128bits_internal -* Please don't try to analyze it. 
-*/ - -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace Standart.Hash.xxHash; - -public static partial class xxHash128 -{ - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe uint128 __inline__XXH3_128bits_internal(byte* input, int len, ulong seed, byte* secret, int secretLen) - { - if (len <= 16) - { - if (len > 8) - { - byte* ptr = secret + 32; - byte* ptr1 = secret + 40; - ulong bitflipl1 = (*(ulong*) ptr ^ *(ulong*) ptr1) - seed; - byte* ptr2 = secret + 48; - byte* ptr3 = secret + 56; - ulong bitfliph1 = (*(ulong*) ptr2 ^ *(ulong*) ptr3) + seed; - ulong input_lo = *(ulong*) input; - byte* ptr4 = input + len - 8; - ulong input_hi = *(ulong*) ptr4; - ulong lhs = input_lo ^ input_hi ^ bitflipl1; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, XXH_PRIME64_1, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong lo_lo = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(XXH_PRIME64_1 & 0xFFFFFFFF); - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(XXH_PRIME64_1 & 0xFFFFFFFF); - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(XXH_PRIME64_1 >> 32); - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(XXH_PRIME64_1 >> 32); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 m128 = ret; - - m128.low64 += (ulong) (len - 1) << 54; - input_hi ^= bitfliph1; - - m128.high64 += input_hi + (ulong)(uint)(uint) input_hi * (ulong)(uint)(XXH_PRIME32_2 - 1); - m128.low64 ^= ((m128.high64 << 56) & 0xff00000000000000UL) | - ((m128.high64 << 40) & 0x00ff000000000000UL) | - ((m128.high64 << 24) & 0x0000ff0000000000UL) 
| - ((m128.high64 << 8) & 0x000000ff00000000UL) | - ((m128.high64 >> 8) & 0x00000000ff000000UL) | - ((m128.high64 >> 24) & 0x0000000000ff0000UL) | - ((m128.high64 >> 40) & 0x000000000000ff00UL) | - ((m128.high64 >> 56) & 0x00000000000000ffUL); - - uint128 ret1; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(m128.low64, XXH_PRIME64_2, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret1 = r128; - } - else - { - ulong lo_lo = (ulong)(uint)(m128.low64 & 0xFFFFFFFF) * (ulong)(uint)(XXH_PRIME64_2 & 0xFFFFFFFF); - ulong hi_lo = (ulong)(uint)(m128.low64 >> 32) * (ulong)(uint)(XXH_PRIME64_2 & 0xFFFFFFFF); - ulong lo_hi = (ulong)(uint)(m128.low64 & 0xFFFFFFFF) * (ulong)(uint)(XXH_PRIME64_2 >> 32); - ulong hi_hi = (ulong)(uint)(m128.low64 >> 32) * (ulong)(uint)(XXH_PRIME64_2 >> 32); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret1 = r128; - } - - uint128 h129 = ret1; - h129.high64 += m128.high64 * XXH_PRIME64_2; - - ulong h64 = h129.low64; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - h129.low64 = h64; - ulong h65 = h129.high64; - h65 = h65 ^ (h65 >> 37); - h65 *= 0x165667919E3779F9UL; - h65 = h65 ^ (h65 >> 32); - h129.high64 = h65; - return h129; - } - - if (len >= 4) - { - ulong seed1 = seed; - - uint x = (uint) seed1; - seed1 ^= (ulong) (((x << 24) & 0xff000000 ) | - ((x << 8) & 0x00ff0000 ) | - ((x >> 8) & 0x0000ff00 ) | - ((x >> 24) & 0x000000ff )) << 32; - - uint input_lo = *(uint*) input; - byte* ptr2 = input + len - 4; - uint input_hi = *(uint*) ptr2; - ulong input_64 = input_lo + ((ulong) input_hi << 32); - byte* ptr = secret + 16; - byte* ptr1 = secret + 24; - ulong bitflip = (*(ulong*) ptr ^ *(ulong*) ptr1) + seed1; - ulong keyed = input_64 ^ 
bitflip; - - ulong rhs = XXH_PRIME64_1 + ((ulong) len << 2); - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(keyed, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(keyed & 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(keyed >> 32) * (ulong)(uint)(y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong)(uint)(keyed & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(keyed >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 m128 = ret; - - m128.high64 += (m128.low64 << 1); - m128.low64 ^= (m128.high64 >> 3); - - m128.low64 = m128.low64 ^ (m128.low64 >> 35); - m128.low64 *= 0x9FB21C651E98DF25UL; - m128.low64 = m128.low64 ^ (m128.low64 >> 28); - ulong h64 = m128.high64; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - m128.high64 = h64; - - return m128; - } - - if (len != 0) - { - byte c1 = input[0]; - byte c2 = input[len >> 1]; - byte c3 = input[len - 1]; - - uint combinedl = ((uint) c1 << 16) | - ((uint) c2 << 24) | - ((uint) c3 << 0) | - ((uint) len << 8); - uint x = ((combinedl << 24) & 0xff000000 ) | - ((combinedl << 8) & 0x00ff0000 ) | - ((combinedl >> 8) & 0x0000ff00 ) | - ((combinedl >> 24) & 0x000000ff ); - uint combinedh = (x << 13) | (x >> (32 - 13)); - - byte* ptr = secret + 4; - ulong bitflipl1 = (*(uint*) secret ^ *(uint*) ptr) + seed; - byte* ptr1 = secret + 8; - byte* ptr2 = secret + 12; - ulong bitfliph1 = (*(uint*) ptr1 ^ *(uint*) ptr2) - seed; - ulong keyed_lo = (ulong) combinedl ^ bitflipl1; - ulong 
keyed_hi = (ulong) combinedh ^ bitfliph1; - - uint128 h129; - ulong hash = keyed_lo; - hash ^= hash >> 33; - hash *= XXH_PRIME64_2; - hash ^= hash >> 29; - hash *= XXH_PRIME64_3; - hash ^= hash >> 32; - h129.low64 = hash; - ulong hash1 = keyed_hi; - hash1 ^= hash1 >> 33; - hash1 *= XXH_PRIME64_2; - hash1 ^= hash1 >> 29; - hash1 *= XXH_PRIME64_3; - hash1 ^= hash1 >> 32; - h129.high64 = hash1; - - return h129; - } - - uint128 h128; - byte* ptr5 = secret + 64; - byte* ptr6 = secret + 72; - ulong bitflipl = *(ulong*) ptr5 ^ *(ulong*) ptr6; - byte* ptr7 = secret + 80; - byte* ptr8 = secret + 88; - ulong bitfliph = *(ulong*) ptr7 ^ *(ulong*) ptr8; - ulong hash2 = seed ^ bitflipl; - hash2 ^= hash2 >> 33; - hash2 *= XXH_PRIME64_2; - hash2 ^= hash2 >> 29; - hash2 *= XXH_PRIME64_3; - hash2 ^= hash2 >> 32; - h128.low64 = hash2; - ulong hash3 = seed ^ bitfliph; - hash3 ^= hash3 >> 33; - hash3 *= XXH_PRIME64_2; - hash3 ^= hash3 >> 29; - hash3 *= XXH_PRIME64_3; - hash3 ^= hash3 >> 32; - h128.high64 = hash3; - return h128; - } - - if (len <= 128) - { - uint128 acc; - acc.low64 = (ulong) len * XXH_PRIME64_1; - acc.high64 = 0; - - if (len > 32) { - if (len > 64) { - if (len > 96) - { - uint128 acc1 = acc; - byte* input1 = input+48; - byte* input2 = input+len-64; - byte* secret1 = secret+96; - byte* secret4 = secret1 + 0; - ulong input_lo = *(ulong*) input1; - byte* ptr8 = input1 + 8; - ulong input_hi = *(ulong*) ptr8; - - byte* ptr9 = secret4 + 8; - ulong lhs = input_lo ^ (*(ulong*) secret4 + seed); - ulong rhs = input_hi ^ (*(ulong*) ptr9 - seed); - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y1); - ulong y2 
= rhs >> 32; - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - acc1.low64 += product.low64 ^ product.high64; - byte* ptr = input2 + 8; - acc1.low64 ^= *(ulong*) input2 + *(ulong*) ptr; - byte* secret5 = secret1 + 16; - ulong inputLo = *(ulong*) input2; - byte* ptr10 = input2 + 8; - ulong inputHi = *(ulong*) ptr10; - - byte* ptr11 = secret5 + 8; - ulong lhs1 = inputLo ^ (*(ulong*) secret5 + seed); - ulong rhs1 = inputHi ^ (*(ulong*) ptr11 - seed); - uint128 ret1; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs1, rhs1, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret1 = r128; - } - else - { - ulong y4 = rhs1 & 0xFFFFFFFF; - ulong loLo = (ulong)(uint)(lhs1 & 0xFFFFFFFF) * (ulong)(uint)(y4); - ulong y5 = rhs1 & 0xFFFFFFFF; - ulong hiLo = (ulong)(uint)(lhs1 >> 32) * (ulong)(uint)(y5); - ulong y6 = rhs1 >> 32; - ulong loHi = (ulong)(uint)(lhs1 & 0xFFFFFFFF) * (ulong)(uint)(y6); - ulong y7 = rhs1 >> 32; - ulong hiHi = (ulong)(uint)(lhs1 >> 32) * (ulong)(uint)(y7); - - ulong cross1 = (loLo >> 32) + (hiLo & 0xFFFFFFFF) + loHi; - ulong upper1 = (hiLo >> 32) + (cross1 >> 32) + hiHi; - ulong lower1 = (cross1 << 32) | (loLo & 0xFFFFFFFF); - - uint128 r129; - r129.low64 = lower1; - r129.high64 = upper1; - ret1 = r129; - } - - uint128 product1 = ret1; - acc1.high64 += product1.low64 ^ product1.high64; - byte* ptr1 = input1 + 8; - acc1.high64 ^= *(ulong*) input1 + *(ulong*) ptr1; - acc = acc1; - } - - uint128 acc2 = acc; - byte* input3 = input+32; - byte* input4 = input+len-48; - byte* secret2 = 
secret+64; - byte* secret6 = secret2 + 0; - ulong inputLo1 = *(ulong*) input3; - byte* ptr12 = input3 + 8; - ulong inputHi1 = *(ulong*) ptr12; - - byte* ptr13 = secret6 + 8; - ulong lhs2 = inputLo1 ^ (*(ulong*) secret6 + seed); - ulong rhs2 = inputHi1 ^ (*(ulong*) ptr13 - seed); - uint128 ret2; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs2, rhs2, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret2 = r128; - } - else - { - ulong y8 = rhs2 & 0xFFFFFFFF; - ulong loLo1 = (ulong)(uint)(lhs2 & 0xFFFFFFFF) * (ulong)(uint)(y8); - ulong y9 = rhs2 & 0xFFFFFFFF; - ulong hiLo1 = (ulong)(uint)(lhs2 >> 32) * (ulong)(uint)(y9); - ulong y10 = rhs2 >> 32; - ulong loHi1 = (ulong)(uint)(lhs2 & 0xFFFFFFFF) * (ulong)(uint)(y10); - ulong y11 = rhs2 >> 32; - ulong hiHi1 = (ulong)(uint)(lhs2 >> 32) * (ulong)(uint)(y11); - - ulong cross2 = (loLo1 >> 32) + (hiLo1 & 0xFFFFFFFF) + loHi1; - ulong upper2 = (hiLo1 >> 32) + (cross2 >> 32) + hiHi1; - ulong lower2 = (cross2 << 32) | (loLo1 & 0xFFFFFFFF); - - uint128 r1210; - r1210.low64 = lower2; - r1210.high64 = upper2; - ret2 = r1210; - } - - uint128 product2 = ret2; - acc2.low64 += product2.low64 ^ product2.high64; - byte* ptr2 = input4 + 8; - acc2.low64 ^= *(ulong*) input4 + *(ulong*) ptr2; - byte* secret7 = secret2 + 16; - ulong inputLo2 = *(ulong*) input4; - byte* ptr14 = input4 + 8; - ulong inputHi2 = *(ulong*) ptr14; - - byte* ptr15 = secret7 + 8; - ulong lhs3 = inputLo2 ^ (*(ulong*) secret7 + seed); - ulong rhs3 = inputHi2 ^ (*(ulong*) ptr15 - seed); - uint128 ret3; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs3, rhs3, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret3 = r128; - } - else - { - ulong y12 = rhs3 & 0xFFFFFFFF; - ulong loLo2 = (ulong)(uint)(lhs3 & 0xFFFFFFFF) * (ulong)(uint)(y12); - ulong y13 = rhs3 & 0xFFFFFFFF; - ulong hiLo2 = 
(ulong)(uint)(lhs3 >> 32) * (ulong)(uint)(y13); - ulong y14 = rhs3 >> 32; - ulong loHi2 = (ulong)(uint)(lhs3 & 0xFFFFFFFF) * (ulong)(uint)(y14); - ulong y15 = rhs3 >> 32; - ulong hiHi2 = (ulong)(uint)(lhs3 >> 32) * (ulong)(uint)(y15); - - ulong cross3 = (loLo2 >> 32) + (hiLo2 & 0xFFFFFFFF) + loHi2; - ulong upper3 = (hiLo2 >> 32) + (cross3 >> 32) + hiHi2; - ulong lower3 = (cross3 << 32) | (loLo2 & 0xFFFFFFFF); - - uint128 r1211; - r1211.low64 = lower3; - r1211.high64 = upper3; - ret3 = r1211; - } - - uint128 product3 = ret3; - acc2.high64 += product3.low64 ^ product3.high64; - byte* ptr3 = input3 + 8; - acc2.high64 ^= *(ulong*) input3 + *(ulong*) ptr3; - acc = acc2; - } - - uint128 acc3 = acc; - byte* input5 = input+16; - byte* input6 = input+len-32; - byte* secret3 = secret+32; - byte* secret8 = secret3 + 0; - ulong inputLo3 = *(ulong*) input5; - byte* ptr16 = input5 + 8; - ulong inputHi3 = *(ulong*) ptr16; - - byte* ptr17 = secret8 + 8; - ulong lhs4 = inputLo3 ^ (*(ulong*) secret8 + seed); - ulong rhs4 = inputHi3 ^ (*(ulong*) ptr17 - seed); - uint128 ret4; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs4, rhs4, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret4 = r128; - } - else - { - ulong y16 = rhs4 & 0xFFFFFFFF; - ulong loLo3 = (ulong)(uint)(lhs4 & 0xFFFFFFFF) * (ulong)(uint)(y16); - ulong y17 = rhs4 & 0xFFFFFFFF; - ulong hiLo3 = (ulong)(uint)(lhs4 >> 32) * (ulong)(uint)(y17); - ulong y18 = rhs4 >> 32; - ulong loHi3 = (ulong)(uint)(lhs4 & 0xFFFFFFFF) * (ulong)(uint)(y18); - ulong y19 = rhs4 >> 32; - ulong hiHi3 = (ulong)(uint)(lhs4 >> 32) * (ulong)(uint)(y19); - - ulong cross4 = (loLo3 >> 32) + (hiLo3 & 0xFFFFFFFF) + loHi3; - ulong upper4 = (hiLo3 >> 32) + (cross4 >> 32) + hiHi3; - ulong lower4 = (cross4 << 32) | (loLo3 & 0xFFFFFFFF); - - uint128 r1212; - r1212.low64 = lower4; - r1212.high64 = upper4; - ret4 = r1212; - } - - uint128 product4 = ret4; - acc3.low64 
+= product4.low64 ^ product4.high64; - byte* ptr4 = input6 + 8; - acc3.low64 ^= *(ulong*) input6 + *(ulong*) ptr4; - byte* secret9 = secret3 + 16; - ulong inputLo4 = *(ulong*) input6; - byte* ptr18 = input6 + 8; - ulong inputHi4 = *(ulong*) ptr18; - - byte* ptr19 = secret9 + 8; - ulong lhs5 = inputLo4 ^ (*(ulong*) secret9 + seed); - ulong rhs5 = inputHi4 ^ (*(ulong*) ptr19 - seed); - uint128 ret5; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs5, rhs5, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret5 = r128; - } - else - { - ulong y20 = rhs5 & 0xFFFFFFFF; - ulong loLo4 = (ulong)(uint)(lhs5 & 0xFFFFFFFF) * (ulong)(uint)(y20); - ulong y21 = rhs5 & 0xFFFFFFFF; - ulong hiLo4 = (ulong)(uint)(lhs5 >> 32) * (ulong)(uint)(y21); - ulong y22 = rhs5 >> 32; - ulong loHi4 = (ulong)(uint)(lhs5 & 0xFFFFFFFF) * (ulong)(uint)(y22); - ulong y23 = rhs5 >> 32; - ulong hiHi4 = (ulong)(uint)(lhs5 >> 32) * (ulong)(uint)(y23); - - ulong cross5 = (loLo4 >> 32) + (hiLo4 & 0xFFFFFFFF) + loHi4; - ulong upper5 = (hiLo4 >> 32) + (cross5 >> 32) + hiHi4; - ulong lower5 = (cross5 << 32) | (loLo4 & 0xFFFFFFFF); - - uint128 r1213; - r1213.low64 = lower5; - r1213.high64 = upper5; - ret5 = r1213; - } - - uint128 product5 = ret5; - acc3.high64 += product5.low64 ^ product5.high64; - byte* ptr5 = input5 + 8; - acc3.high64 ^= *(ulong*) input5 + *(ulong*) ptr5; - acc = acc3; - } - - uint128 acc4 = acc; - byte* input7 = input+len-16; - byte* secret10 = secret + 0; - ulong inputLo5 = *(ulong*) input; - byte* ptr20 = input + 8; - ulong inputHi5 = *(ulong*) ptr20; - - byte* ptr21 = secret10 + 8; - ulong lhs6 = inputLo5 ^ (*(ulong*) secret10 + seed); - ulong rhs6 = inputHi5 ^ (*(ulong*) ptr21 - seed); - uint128 ret6; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs6, rhs6, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - 
ret6 = r128; - } - else - { - ulong y24 = rhs6 & 0xFFFFFFFF; - ulong loLo5 = (ulong)(uint)(lhs6 & 0xFFFFFFFF) * (ulong)(uint)(y24); - ulong y25 = rhs6 & 0xFFFFFFFF; - ulong hiLo5 = (ulong)(uint)(lhs6 >> 32) * (ulong)(uint)(y25); - ulong y26 = rhs6 >> 32; - ulong loHi5 = (ulong)(uint)(lhs6 & 0xFFFFFFFF) * (ulong)(uint)(y26); - ulong y27 = rhs6 >> 32; - ulong hiHi5 = (ulong)(uint)(lhs6 >> 32) * (ulong)(uint)(y27); - - ulong cross6 = (loLo5 >> 32) + (hiLo5 & 0xFFFFFFFF) + loHi5; - ulong upper6 = (hiLo5 >> 32) + (cross6 >> 32) + hiHi5; - ulong lower6 = (cross6 << 32) | (loLo5 & 0xFFFFFFFF); - - uint128 r1214; - r1214.low64 = lower6; - r1214.high64 = upper6; - ret6 = r1214; - } - - uint128 product6 = ret6; - acc4.low64 += product6.low64 ^ product6.high64; - byte* ptr6 = input7 + 8; - acc4.low64 ^= *(ulong*) input7 + *(ulong*) ptr6; - byte* secret11 = secret + 16; - ulong inputLo6 = *(ulong*) input7; - byte* ptr22 = input7 + 8; - ulong inputHi6 = *(ulong*) ptr22; - - byte* ptr23 = secret11 + 8; - ulong lhs7 = inputLo6 ^ (*(ulong*) secret11 + seed); - ulong rhs7 = inputHi6 ^ (*(ulong*) ptr23 - seed); - uint128 ret7; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs7, rhs7, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret7 = r128; - } - else - { - ulong y28 = rhs7 & 0xFFFFFFFF; - ulong loLo6 = (ulong)(uint)(lhs7 & 0xFFFFFFFF) * (ulong)(uint)(y28); - ulong y29 = rhs7 & 0xFFFFFFFF; - ulong hiLo6 = (ulong)(uint)(lhs7 >> 32) * (ulong)(uint)(y29); - ulong y30 = rhs7 >> 32; - ulong loHi6 = (ulong)(uint)(lhs7 & 0xFFFFFFFF) * (ulong)(uint)(y30); - ulong y31 = rhs7 >> 32; - ulong hiHi6 = (ulong)(uint)(lhs7 >> 32) * (ulong)(uint)(y31); - - ulong cross7 = (loLo6 >> 32) + (hiLo6 & 0xFFFFFFFF) + loHi6; - ulong upper7 = (hiLo6 >> 32) + (cross7 >> 32) + hiHi6; - ulong lower7 = (cross7 << 32) | (loLo6 & 0xFFFFFFFF); - - uint128 r1215; - r1215.low64 = lower7; - r1215.high64 = upper7; - ret7 = 
r1215; - } - - uint128 product7 = ret7; - acc4.high64 += product7.low64 ^ product7.high64; - byte* ptr7 = input + 8; - acc4.high64 ^= *(ulong*) input + *(ulong*) ptr7; - acc = acc4; - - uint128 h128; - h128.low64 = acc.low64 + acc.high64; - h128.high64 = (acc.low64 * XXH_PRIME64_1) - + (acc.high64 * XXH_PRIME64_4) - + (((ulong) len - seed) * XXH_PRIME64_2); - ulong h64 = h128.low64; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - h128.low64 = h64; - ulong h65 = h128.high64; - h65 = h65 ^ (h65 >> 37); - h65 *= 0x165667919E3779F9UL; - h65 = h65 ^ (h65 >> 32); - h128.high64 = (ulong) 0 - h65; - return h128; - } - - if (len <= XXH3_MIDSIZE_MAX) - { - uint128 acc; - int nbRounds = len / 32; - - acc.low64 = (ulong) len * XXH_PRIME64_1; - acc.high64 = 0; - for (int i = 0; i < 4; i++) - { - uint128 acc1 = acc; - byte* input1 = input + (32 * i); - byte* input2 = input + (32 * i) + 16; - byte* secret1 = secret + (32 * i); - byte* secret3 = secret1 + 0; - ulong input_lo = *(ulong*) input1; - byte* ptr4 = input1 + 8; - ulong input_hi = *(ulong*) ptr4; - - byte* ptr5 = secret3 + 8; - ulong lhs = input_lo ^ (*(ulong*) secret3 + seed); - ulong rhs = input_hi ^ (*(ulong*) ptr5 - seed); - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 
0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - acc1.low64 += product.low64 ^ product.high64; - byte* ptr = input2 + 8; - acc1.low64 ^= *(ulong*) input2 + *(ulong*) ptr; - byte* secret4 = secret1 + 16; - ulong inputLo = *(ulong*) input2; - byte* ptr6 = input2 + 8; - ulong inputHi = *(ulong*) ptr6; - - byte* ptr7 = secret4 + 8; - ulong lhs1 = inputLo ^ (*(ulong*) secret4 + seed); - ulong rhs1 = inputHi ^ (*(ulong*) ptr7 - seed); - uint128 ret1; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs1, rhs1, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret1 = r128; - } - else - { - ulong y4 = rhs1 & 0xFFFFFFFF; - ulong loLo = (ulong)(uint)(lhs1 & 0xFFFFFFFF) * (ulong)(uint)(y4); - ulong y5 = rhs1 & 0xFFFFFFFF; - ulong hiLo = (ulong)(uint)(lhs1 >> 32) * (ulong)(uint)(y5); - ulong y6 = rhs1 >> 32; - ulong loHi = (ulong)(uint)(lhs1 & 0xFFFFFFFF) * (ulong)(uint)(y6); - ulong y7 = rhs1 >> 32; - ulong hiHi = (ulong)(uint)(lhs1 >> 32) * (ulong)(uint)(y7); - - ulong cross1 = (loLo >> 32) + (hiLo & 0xFFFFFFFF) + loHi; - ulong upper1 = (hiLo >> 32) + (cross1 >> 32) + hiHi; - ulong lower1 = (cross1 << 32) | (loLo & 0xFFFFFFFF); - - uint128 r129; - r129.low64 = lower1; - r129.high64 = upper1; - ret1 = r129; - } - - uint128 product1 = ret1; - acc1.high64 += product1.low64 ^ product1.high64; - byte* ptr1 = input1 + 8; - acc1.high64 ^= *(ulong*) input1 + *(ulong*) ptr1; - acc = acc1; - } - - ulong h64 = acc.low64; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - acc.low64 = h64; - ulong h65 = acc.high64; - h65 = h65 ^ (h65 >> 37); - h65 *= 0x165667919E3779F9UL; - h65 = h65 ^ (h65 >> 32); - acc.high64 = h65; - - for (int i = 4 ; i < nbRounds; i++) - { - uint128 acc1 = acc; - byte* input1 = input + (32 * i); - byte* input2 = input + (32 * i) + 16; - byte* secret1 = secret + 
XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)); - byte* secret3 = secret1 + 0; - ulong input_lo = *(ulong*) input1; - byte* ptr4 = input1 + 8; - ulong input_hi = *(ulong*) ptr4; - - byte* ptr5 = secret3 + 8; - ulong lhs = input_lo ^ (*(ulong*) secret3 + seed); - ulong rhs = input_hi ^ (*(ulong*) ptr5 - seed); - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - acc1.low64 += product.low64 ^ product.high64; - byte* ptr = input2 + 8; - acc1.low64 ^= *(ulong*) input2 + *(ulong*) ptr; - byte* secret4 = secret1 + 16; - ulong inputLo = *(ulong*) input2; - byte* ptr6 = input2 + 8; - ulong inputHi = *(ulong*) ptr6; - - byte* ptr7 = secret4 + 8; - ulong lhs1 = inputLo ^ (*(ulong*) secret4 + seed); - ulong rhs1 = inputHi ^ (*(ulong*) ptr7 - seed); - uint128 ret1; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs1, rhs1, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret1 = r128; - } - else - { - ulong y4 = rhs1 & 0xFFFFFFFF; - ulong loLo = (ulong)(uint)(lhs1 & 0xFFFFFFFF) * (ulong)(uint)(y4); - ulong y5 = rhs1 & 0xFFFFFFFF; - ulong hiLo = (ulong)(uint)(lhs1 >> 
32) * (ulong)(uint)(y5); - ulong y6 = rhs1 >> 32; - ulong loHi = (ulong)(uint)(lhs1 & 0xFFFFFFFF) * (ulong)(uint)(y6); - ulong y7 = rhs1 >> 32; - ulong hiHi = (ulong)(uint)(lhs1 >> 32) * (ulong)(uint)(y7); - - ulong cross1 = (loLo >> 32) + (hiLo & 0xFFFFFFFF) + loHi; - ulong upper1 = (hiLo >> 32) + (cross1 >> 32) + hiHi; - ulong lower1 = (cross1 << 32) | (loLo & 0xFFFFFFFF); - - uint128 r129; - r129.low64 = lower1; - r129.high64 = upper1; - ret1 = r129; - } - - uint128 product1 = ret1; - acc1.high64 += product1.low64 ^ product1.high64; - byte* ptr1 = input1 + 8; - acc1.high64 ^= *(ulong*) input1 + *(ulong*) ptr1; - acc = acc1; - } - - uint128 acc2 = acc; - byte* input3 = input + len - 16; - byte* input4 = input + len - 32; - byte* secret2 = secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16; - ulong seed1 = 0UL - seed; - byte* secret5 = secret2 + 0; - ulong inputLo1 = *(ulong*) input3; - byte* ptr8 = input3 + 8; - ulong inputHi1 = *(ulong*) ptr8; - - byte* ptr9 = secret5 + 8; - ulong lhs2 = inputLo1 ^ (*(ulong*) secret5 + seed1); - ulong rhs2 = inputHi1 ^ (*(ulong*) ptr9 - seed1); - uint128 ret2; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs2, rhs2, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret2 = r128; - } - else - { - ulong y8 = rhs2 & 0xFFFFFFFF; - ulong loLo1 = (ulong)(uint)(lhs2 & 0xFFFFFFFF) * (ulong)(uint)(y8); - ulong y9 = rhs2 & 0xFFFFFFFF; - ulong hiLo1 = (ulong)(uint)(lhs2 >> 32) * (ulong)(uint)(y9); - ulong y10 = rhs2 >> 32; - ulong loHi1 = (ulong)(uint)(lhs2 & 0xFFFFFFFF) * (ulong)(uint)(y10); - ulong y11 = rhs2 >> 32; - ulong hiHi1 = (ulong)(uint)(lhs2 >> 32) * (ulong)(uint)(y11); - - ulong cross2 = (loLo1 >> 32) + (hiLo1 & 0xFFFFFFFF) + loHi1; - ulong upper2 = (hiLo1 >> 32) + (cross2 >> 32) + hiHi1; - ulong lower2 = (cross2 << 32) | (loLo1 & 0xFFFFFFFF); - - uint128 r1210; - r1210.low64 = lower2; - r1210.high64 = upper2; - ret2 = r1210; - 
} - - uint128 product2 = ret2; - acc2.low64 += product2.low64 ^ product2.high64; - byte* ptr2 = input4 + 8; - acc2.low64 ^= *(ulong*) input4 + *(ulong*) ptr2; - byte* secret6 = secret2 + 16; - ulong inputLo2 = *(ulong*) input4; - byte* ptr10 = input4 + 8; - ulong inputHi2 = *(ulong*) ptr10; - - byte* ptr11 = secret6 + 8; - ulong lhs3 = inputLo2 ^ (*(ulong*) secret6 + seed1); - ulong rhs3 = inputHi2 ^ (*(ulong*) ptr11 - seed1); - uint128 ret3; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs3, rhs3, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret3 = r128; - } - else - { - ulong y12 = rhs3 & 0xFFFFFFFF; - ulong loLo2 = (ulong)(uint)(lhs3 & 0xFFFFFFFF) * (ulong)(uint)(y12); - ulong y13 = rhs3 & 0xFFFFFFFF; - ulong hiLo2 = (ulong)(uint)(lhs3 >> 32) * (ulong)(uint)(y13); - ulong y14 = rhs3 >> 32; - ulong loHi2 = (ulong)(uint)(lhs3 & 0xFFFFFFFF) * (ulong)(uint)(y14); - ulong y15 = rhs3 >> 32; - ulong hiHi2 = (ulong)(uint)(lhs3 >> 32) * (ulong)(uint)(y15); - - ulong cross3 = (loLo2 >> 32) + (hiLo2 & 0xFFFFFFFF) + loHi2; - ulong upper3 = (hiLo2 >> 32) + (cross3 >> 32) + hiHi2; - ulong lower3 = (cross3 << 32) | (loLo2 & 0xFFFFFFFF); - - uint128 r1211; - r1211.low64 = lower3; - r1211.high64 = upper3; - ret3 = r1211; - } - - uint128 product3 = ret3; - acc2.high64 += product3.low64 ^ product3.high64; - byte* ptr3 = input3 + 8; - acc2.high64 ^= *(ulong*) input3 + *(ulong*) ptr3; - acc = acc2; - - uint128 h128; - h128.low64 = acc.low64 + acc.high64; - h128.high64 = (acc.low64 * XXH_PRIME64_1) - + (acc.high64 * XXH_PRIME64_4) - + (((ulong)len - seed) * XXH_PRIME64_2); - ulong h66 = h128.low64; - h66 = h66 ^ (h66 >> 37); - h66 *= 0x165667919E3779F9UL; - h66 = h66 ^ (h66 >> 32); - h128.low64 = h66; - ulong h67 = h128.high64; - h67 = h67 ^ (h67 >> 37); - h67 *= 0x165667919E3779F9UL; - h67 = h67 ^ (h67 >> 32); - h128.high64 = (ulong)0 - h67; - return h128; - } - - if (seed == 0) - { - 
ulong* acc = stackalloc ulong[8]; - - fixed (ulong* ptr = &XXH3_INIT_ACC[0]) - { - acc[0] = ptr[0]; - acc[1] = ptr[1]; - acc[2] = ptr[2]; - acc[3] = ptr[3]; - acc[4] = ptr[4]; - acc[5] = ptr[5]; - acc[6] = ptr[6]; - acc[7] = ptr[7]; - } - - int nbStripesPerBlock = (secretLen - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - int block_len = XXH_STRIPE_LEN * nbStripesPerBlock; - int nb_blocks = (len - 1) / block_len; - - for (int n = 0; n < nb_blocks; n++) { - byte* input1 = input + n * block_len; - for (int n1 = 0; n1 < nbStripesPerBlock; n1++ ) { - byte* inp = input1 + n1 * XXH_STRIPE_LEN; - byte* secret1 = secret + n1 * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*)inp + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*)secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) inp + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, 
key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xinput = inp; - byte* xsecret = secret1; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong)(uint)(data_key & 0xFFFFFFFF) * (ulong)(uint)(y); - } - } - } - - byte* secret3 = secret + secretLen - XXH_STRIPE_LEN; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - - var prime32 = Vector256.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var shifted = Avx2.ShiftRightLogical(acc_vec, 47); - var data_vec = Avx2.Xor(acc_vec, shifted); - var key_vec = Avx2.LoadVector256((ulong*) secret3 + uint64_offset); - var data_key = Avx2.Xor(data_vec, key_vec).AsUInt32(); - var data_key_hi = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var prod_lo = Avx2.Multiply(data_key, prime32); - var prod_hi = Avx2.Multiply(data_key_hi, prime32); - var result = Avx2.Add(prod_lo, Avx2.ShiftLeftLogical(prod_hi, 32)); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - - var prime32 = Vector128.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + uint64_offset).AsUInt32(); - var shifted = 
Sse2.ShiftRightLogical(acc_vec, 47); - var data_vec = Sse2.Xor(acc_vec, shifted); - var key_vec = Sse2.LoadVector128((uint*) secret3 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_hi = Sse2.Shuffle(data_key.AsUInt32(), _MM_SHUFFLE_0_3_0_1); - var prod_lo = Sse2.Multiply(data_key, prime32); - var prod_hi = Sse2.Multiply(data_key_hi, prime32); - var result = Sse2.Add(prod_lo, Sse2.ShiftLeftLogical(prod_hi, 32)); - Sse2.Store(acc + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xsecret = secret3; - - byte* ptr = xsecret + i * 8; - ulong key64 = *(ulong*) ptr; - ulong acc64 = xacc[i]; - acc64 = acc64 ^ (acc64 >> 47); - acc64 ^= key64; - acc64 *= XXH_PRIME32_1; - xacc[i] = acc64; - } - } - } - - int nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; - byte* input2 = input + nb_blocks * block_len; - for (int n2 = 0; n2 < nbStripes; n2++ ) { - byte* inp1 = input2 + n2 * XXH_STRIPE_LEN; - byte* secret1 = secret + n2 * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*)inp1 + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*)secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 
0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) inp1 + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xinput = inp1; - byte* xsecret = secret1; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong)(uint)(data_key & 0xFFFFFFFF) * (ulong)(uint)(y); - } - } - } - - byte* p = input + len - XXH_STRIPE_LEN; - byte* secret2 = secret + secretLen - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*)p + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*)secret2 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = 
Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) p + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret2 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xinput = p; - byte* xsecret = secret2; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong)(uint)(data_key & 0xFFFFFFFF) * (ulong)(uint)(y); - } - } - - uint128 uint128; - byte* secret4 = secret + XXH_SECRET_MERGEACCS_START; - ulong result64 = (ulong)len * XXH_PRIME64_1; - - for (int i1 = 0; i1 < 4; i1++) - { - ulong* acc1 = acc + 2 * i1; - byte* secret1 = secret4 + 16 * i1; - byte* ptr = secret1+8; - ulong lhs = acc1[0] ^ *(ulong*) secret1; - ulong rhs = acc1[1] ^ *(ulong*) ptr; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(lhs 
& 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - result64 += product.low64 ^ product.high64; - } - - ulong h64 = result64; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - uint128.low64 = h64; - byte* secret5 = secret + secretLen - XXH3_ACC_SIZE - XXH_SECRET_MERGEACCS_START; - ulong result65 = ~((ulong)len * XXH_PRIME64_2); - - for (int i2 = 0; i2 < 4; i2++) - { - ulong* acc1 = acc + 2 * i2; - byte* secret1 = secret5 + 16 * i2; - byte* ptr = secret1+8; - ulong lhs = acc1[0] ^ *(ulong*) secret1; - ulong rhs = acc1[1] ^ *(ulong*) ptr; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 
product = ret; - result65 += product.low64 ^ product.high64; - } - - ulong h65 = result65; - h65 = h65 ^ (h65 >> 37); - h65 *= 0x165667919E3779F9UL; - h65 = h65 ^ (h65 >> 32); - uint128.high64 = h65; - - return uint128; - } - - int customSecretSize = XXH3_SECRET_DEFAULT_SIZE; - byte* customSecret = stackalloc byte[customSecretSize]; - - fixed (byte* ptr24 = &XXH3_SECRET[0]) - { - for (int i1 = 0; i1 < customSecretSize; i1 += 8) - { - customSecret[i1] = ptr24[i1]; - customSecret[i1+1] = ptr24[i1+1]; - customSecret[i1+2] = ptr24[i1+2]; - customSecret[i1+3] = ptr24[i1+3]; - customSecret[i1+4] = ptr24[i1+4]; - customSecret[i1+5] = ptr24[i1+5]; - customSecret[i1+6] = ptr24[i1+6]; - customSecret[i1+7] = ptr24[i1+7]; - } - } - - if (Avx2.IsSupported) - { - const int m256i_size = 32; - - var seed1 = Vector256.Create((ulong)seed, (ulong)(0U - seed), (ulong)seed, (ulong)(0U - seed)); - - fixed (byte* secret1 = &XXH3_SECRET[0]) - { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m256i_size; i++) - { - int uint64_offset = i * 4; - - var src32 = Avx2.LoadVector256(((ulong*)secret1) + uint64_offset); - var dst32 = Avx2.Add(src32, seed1); - Avx2.Store((ulong*) customSecret + uint64_offset, dst32); - } - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - - var seed1 = Vector128.Create((long)seed, (long)(0U - seed)); - - fixed (byte* secret1 = &XXH3_SECRET[0]) - { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m128i_size; i++) - { - int uint64_offset = i * 2; - - var src16 = Sse2.LoadVector128(((long*) secret1) + uint64_offset); - var dst16 = Sse2.Add(src16, seed1); - Sse2.Store((long*) customSecret + uint64_offset, dst16); - - } - } - } - else - { - fixed (byte* kSecretPtr = &XXH3_SECRET[0]) - { - int nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; - - for (int i = 0; i < nbRounds; i++) - { - byte* ptr = kSecretPtr + 16 * i; - ulong lo = *(ulong*) ptr + seed; - byte* ptr1 = kSecretPtr + 16 * i + 8; - ulong hi = *(ulong*) ptr1 - seed; - byte* dst = (byte*) customSecret 
+ 16 * i; - *(ulong*) dst = lo; - byte* dst1 = (byte*) customSecret + 16 * i + 8; - *(ulong*) dst1 = hi; - } - } - } - - ulong* acc5 = stackalloc ulong[8]; - - fixed (ulong* ptr25 = &XXH3_INIT_ACC[0]) - { - acc5[0] = ptr25[0]; - acc5[1] = ptr25[1]; - acc5[2] = ptr25[2]; - acc5[3] = ptr25[3]; - acc5[4] = ptr25[4]; - acc5[5] = ptr25[5]; - acc5[6] = ptr25[6]; - acc5[7] = ptr25[7]; - } - - int nbStripesPerBlock1 = (customSecretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - int blockLen = XXH_STRIPE_LEN * nbStripesPerBlock1; - int nbBlocks = (len - 1) / blockLen; - - for (int n1 = 0; n1 < nbBlocks; n1++) { - byte* input1 = input + n1 * blockLen; - for (int n = 0; n < nbStripesPerBlock1; n++ ) { - byte* inp = input1 + n * XXH_STRIPE_LEN; - byte* secret1 = customSecret + n * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc5 + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*)inp + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*)secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc5 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc5 + uint64_offset); - var data_vec 
= Sse2.LoadVector128((uint*) inp + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc5 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc5; - byte* xinput = inp; - byte* xsecret = secret1; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong)(uint)(data_key & 0xFFFFFFFF) * (ulong)(uint)(y); - } - } - } - - byte* secret2 = customSecret + customSecretSize - XXH_STRIPE_LEN; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - - var prime32 = Vector256.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc5 + uint64_offset); - var shifted = Avx2.ShiftRightLogical(acc_vec, 47); - var data_vec = Avx2.Xor(acc_vec, shifted); - var key_vec = Avx2.LoadVector256((ulong*) secret2 + uint64_offset); - var data_key = Avx2.Xor(data_vec, key_vec).AsUInt32(); - var data_key_hi = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var prod_lo = Avx2.Multiply(data_key, prime32); - var prod_hi = Avx2.Multiply(data_key_hi, prime32); - var result = Avx2.Add(prod_lo, Avx2.ShiftLeftLogical(prod_hi, 32)); - Avx2.Store(acc5 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - - var prime32 = Vector128.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / 
m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc5 + uint64_offset).AsUInt32(); - var shifted = Sse2.ShiftRightLogical(acc_vec, 47); - var data_vec = Sse2.Xor(acc_vec, shifted); - var key_vec = Sse2.LoadVector128((uint*) secret2 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_hi = Sse2.Shuffle(data_key.AsUInt32(), _MM_SHUFFLE_0_3_0_1); - var prod_lo = Sse2.Multiply(data_key, prime32); - var prod_hi = Sse2.Multiply(data_key_hi, prime32); - var result = Sse2.Add(prod_lo, Sse2.ShiftLeftLogical(prod_hi, 32)); - Sse2.Store(acc5 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc5; - byte* xsecret = secret2; - - byte* ptr = xsecret + i * 8; - ulong key64 = *(ulong*) ptr; - ulong acc64 = xacc[i]; - acc64 = acc64 ^ (acc64 >> 47); - acc64 ^= key64; - acc64 *= XXH_PRIME32_1; - xacc[i] = acc64; - } - } - } - - int nbStripes1 = ((len - 1) - (blockLen * nbBlocks)) / XXH_STRIPE_LEN; - byte* input8 = input + nbBlocks * blockLen; - for (int n3 = 0; n3 < nbStripes1; n3++ ) { - byte* inp2 = input8 + n3 * XXH_STRIPE_LEN; - byte* secret1 = customSecret + n3 * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc5 + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*)inp2 + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*)secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result 
= Avx2.Add(product, sum); - Avx2.Store(acc5 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc5 + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) inp2 + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc5 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc5; - byte* xinput = inp2; - byte* xsecret = secret1; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong)(uint)(data_key & 0xFFFFFFFF) * (ulong)(uint)(y); - } - } - } - - byte* p1 = input + len - XXH_STRIPE_LEN; - byte* secret12 = customSecret + customSecretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc5 + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*)p1 + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*)secret12 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var 
data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc5 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc5 + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) p1 + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret12 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc5 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc5; - byte* xinput = p1; - byte* xsecret = secret12; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong)(uint)(data_key & 0xFFFFFFFF) * (ulong)(uint)(y); - } - } - - uint128 uint129; - byte* secret13 = customSecret + XXH_SECRET_MERGEACCS_START; - ulong result66 = (ulong)len * XXH_PRIME64_1; - - for (int i3 = 0; i3 < 4; i3++) - { - ulong* acc = acc5 + 2 * i3; - byte* secret1 = secret13 + 16 * i3; - byte* ptr = secret1+8; - ulong lhs = acc[0] ^ *(ulong*) secret1; - ulong rhs = acc[1] ^ *(ulong*) ptr; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = 
Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - result66 += product.low64 ^ product.high64; - } - - ulong h68 = result66; - h68 = h68 ^ (h68 >> 37); - h68 *= 0x165667919E3779F9UL; - h68 = h68 ^ (h68 >> 32); - uint129.low64 = h68; - byte* secret14 = customSecret + customSecretSize - XXH3_ACC_SIZE - XXH_SECRET_MERGEACCS_START; - ulong result67 = ~((ulong)len * XXH_PRIME64_2); - - for (int i4 = 0; i4 < 4; i4++) - { - ulong* acc = acc5 + 2 * i4; - byte* secret1 = secret14 + 16 * i4; - byte* ptr = secret1+8; - ulong lhs = acc[0] ^ *(ulong*) secret1; - ulong rhs = acc[1] ^ *(ulong*) ptr; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong)(uint)(lhs & 0xFFFFFFFF) * (ulong)(uint)(y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong)(uint)(lhs >> 32) * (ulong)(uint)(y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 
0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - result67 += product.low64 ^ product.high64; - } - - ulong h69 = result67; - h69 = h69 ^ (h69 >> 37); - h69 *= 0x165667919E3779F9UL; - h69 = h69 ^ (h69 >> 32); - uint129.high64 = h69; - - return uint129; - } -} \ No newline at end of file diff --git a/src/Standart.Hash.xxHash/__inline__xxHash3.cs b/src/Standart.Hash.xxHash/__inline__xxHash3.cs deleted file mode 100644 index 57c827e..0000000 --- a/src/Standart.Hash.xxHash/__inline__xxHash3.cs +++ /dev/null @@ -1,1419 +0,0 @@ -/* -* This is the auto generated code. -* All function calls are inlined in XXH3_128bits_internal -* Please don't try to analyze it. -*/ - -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace Standart.Hash.xxHash -{ - public static partial class xxHash3 - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe ulong __inline__XXH3_64bits_internal(byte* input, int len, ulong seed64, byte* secret, int secretLen) - { - if (len <= 16) - { - if (len > 8) - { - byte* ptr = secret + 24; - byte* ptr1 = secret + 32; - ulong bitflip1 = (*(ulong*) ptr ^ *(ulong*) ptr1) + seed64; - byte* ptr2 = secret + 40; - byte* ptr3 = secret + 48; - ulong bitflip2 = (*(ulong*) ptr2 ^ *(ulong*) ptr3) - seed64; - ulong input_lo = *(ulong*) input ^ bitflip1; - byte* ptr4 = input + len - 8; - ulong input_hi = *(ulong*) ptr4 ^ bitflip2; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(input_lo, input_hi, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = input_hi & 0xFFFFFFFF; - ulong lo_lo = (ulong) (uint) (input_lo & 0xFFFFFFFF) * (ulong) (uint) (y); - 
ulong y1 = input_hi & 0xFFFFFFFF; - ulong hi_lo = (ulong) (uint) (input_lo >> 32) * (ulong) (uint) (y1); - ulong y2 = input_hi >> 32; - ulong lo_hi = (ulong) (uint) (input_lo & 0xFFFFFFFF) * (ulong) (uint) (y2); - ulong y3 = input_hi >> 32; - ulong hi_hi = (ulong) (uint) (input_lo >> 32) * (ulong) (uint) (y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - ulong acc = ((ulong) len) - + (((input_lo << 56) & 0xff00000000000000UL) | - ((input_lo << 40) & 0x00ff000000000000UL) | - ((input_lo << 24) & 0x0000ff0000000000UL) | - ((input_lo << 8) & 0x000000ff00000000UL) | - ((input_lo >> 8) & 0x00000000ff000000UL) | - ((input_lo >> 24) & 0x0000000000ff0000UL) | - ((input_lo >> 40) & 0x000000000000ff00UL) | - ((input_lo >> 56) & 0x00000000000000ffUL)) + input_hi - + (product.low64 ^ product.high64); - ulong h64 = acc; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - return h64; - } - - if (len >= 4) - { - ulong seed = seed64; - uint x = (uint) seed; - seed ^= (ulong) (((x << 24) & 0xff000000) | - ((x << 8) & 0x00ff0000) | - ((x >> 8) & 0x0000ff00) | - ((x >> 24) & 0x000000ff)) << 32; - { - uint input1 = *(uint*) input; - byte* ptr2 = input + len - 4; - uint input2 = *(uint*) ptr2; - byte* ptr = secret + 8; - byte* ptr1 = secret + 16; - ulong bitflip = (*(ulong*) ptr ^ *(ulong*) ptr1) - seed; - ulong input64 = input2 + (((ulong) input1) << 32); - ulong keyed = input64 ^ bitflip; - ulong h64 = keyed; - h64 ^= ((h64 << 49) | (h64 >> (64 - 49))) ^ ((h64 << 24) | (h64 >> (64 - 24))); - h64 *= 0x9FB21C651E98DF25UL; - h64 ^= (h64 >> 35) + (ulong) len; - h64 *= 0x9FB21C651E98DF25UL; - return h64 ^ (h64 >> 28); - } - } - - if (len != 0) - { - byte c1 = input[0]; - byte c2 = input[len >> 1]; - byte c3 = input[len - 
1]; - uint combined = ((uint) c1 << 16) | - ((uint) c2 << 24) | - ((uint) c3 << 0) | - ((uint) len << 8); - - byte* ptr = secret + 4; - ulong bitflip = (*(uint*) secret ^ - *(uint*) ptr) + seed64; - - ulong keyed = (ulong)combined ^ bitflip; - ulong hash = keyed; - hash ^= hash >> 33; - hash *= XXH_PRIME64_2; - hash ^= hash >> 29; - hash *= XXH_PRIME64_3; - hash ^= hash >> 32; - return hash; - } - - byte* ptr5 = secret + 56; - byte* ptr6 = secret + 64; - ulong hash1 = seed64 ^ (*(ulong*) ptr5 ^ *(ulong*) ptr6); - hash1 ^= hash1 >> 33; - hash1 *= XXH_PRIME64_2; - hash1 ^= hash1 >> 29; - hash1 *= XXH_PRIME64_3; - hash1 ^= hash1 >> 32; - return hash1; - } - - if (len <= 128) - { - ulong acc = ((ulong)len) * XXH_PRIME64_1; - - if (len > 32) - { - if (len > 64) - { - if (len > 96) - { - byte* input1 = input + 48; - byte* secret1 = secret + 96; - ulong input_lo = *(ulong*) input1; - byte* ptr = input1 + 8; - ulong input_hi = *(ulong*) ptr; - - byte* ptr1 = secret1 + 8; - ulong rhs = input_hi ^ (*(ulong*) ptr1 - seed64); - ulong lhs = input_lo ^ (*(ulong*) secret1 + seed64); - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - acc 
+= product.low64 ^ product.high64; - byte* input2 = input + len - 64; - byte* secret2 = secret + 112; - ulong inputLo = *(ulong*) input2; - byte* ptr2 = input2 + 8; - ulong inputHi = *(ulong*) ptr2; - - byte* ptr3 = secret2 + 8; - ulong rhs1 = inputHi ^ (*(ulong*) ptr3 - seed64); - ulong lhs1 = inputLo ^ (*(ulong*) secret2 + seed64); - uint128 ret1; - if (Bmi2.IsSupported) - { - ulong productLow; - ulong productHigh = Bmi2.X64.MultiplyNoFlags(lhs1, rhs1, &productLow); - uint128 r129; - r129.low64 = productLow; - r129.high64 = productHigh; - ret1 = r129; - } - else - { - ulong y4 = rhs1 & 0xFFFFFFFF; - ulong loLo = (ulong) (uint) (lhs1 & 0xFFFFFFFF) * (ulong) (uint) (y4); - ulong y5 = rhs1 & 0xFFFFFFFF; - ulong hiLo = (ulong) (uint) (lhs1 >> 32) * (ulong) (uint) (y5); - ulong y6 = rhs1 >> 32; - ulong loHi = (ulong) (uint) (lhs1 & 0xFFFFFFFF) * (ulong) (uint) (y6); - ulong y7 = rhs1 >> 32; - ulong hiHi = (ulong) (uint) (lhs1 >> 32) * (ulong) (uint) (y7); - - ulong cross1 = (loLo >> 32) + (hiLo & 0xFFFFFFFF) + loHi; - ulong upper1 = (hiLo >> 32) + (cross1 >> 32) + hiHi; - ulong lower1 = (cross1 << 32) | (loLo & 0xFFFFFFFF); - - uint128 r1210; - r1210.low64 = lower1; - r1210.high64 = upper1; - ret1 = r1210; - } - - uint128 product1 = ret1; - acc += product1.low64 ^ product1.high64; - } - - byte* input3 = input + 32; - byte* secret3 = secret + 64; - ulong inputLo1 = *(ulong*) input3; - byte* ptr4 = input3 + 8; - ulong inputHi1 = *(ulong*) ptr4; - - byte* ptr5 = secret3 + 8; - ulong rhs2 = inputHi1 ^ (*(ulong*) ptr5 - seed64); - ulong lhs2 = inputLo1 ^ (*(ulong*) secret3 + seed64); - uint128 ret2; - if (Bmi2.IsSupported) - { - ulong productLow1; - ulong productHigh1 = Bmi2.X64.MultiplyNoFlags(lhs2, rhs2, &productLow1); - uint128 r1211; - r1211.low64 = productLow1; - r1211.high64 = productHigh1; - ret2 = r1211; - } - else - { - ulong y8 = rhs2 & 0xFFFFFFFF; - ulong loLo1 = (ulong) (uint) (lhs2 & 0xFFFFFFFF) * (ulong) (uint) (y8); - ulong y9 = rhs2 & 0xFFFFFFFF; - ulong 
hiLo1 = (ulong) (uint) (lhs2 >> 32) * (ulong) (uint) (y9); - ulong y10 = rhs2 >> 32; - ulong loHi1 = (ulong) (uint) (lhs2 & 0xFFFFFFFF) * (ulong) (uint) (y10); - ulong y11 = rhs2 >> 32; - ulong hiHi1 = (ulong) (uint) (lhs2 >> 32) * (ulong) (uint) (y11); - - ulong cross2 = (loLo1 >> 32) + (hiLo1 & 0xFFFFFFFF) + loHi1; - ulong upper2 = (hiLo1 >> 32) + (cross2 >> 32) + hiHi1; - ulong lower2 = (cross2 << 32) | (loLo1 & 0xFFFFFFFF); - - uint128 r1212; - r1212.low64 = lower2; - r1212.high64 = upper2; - ret2 = r1212; - } - - uint128 product2 = ret2; - acc += product2.low64 ^ product2.high64; - byte* input4 = input + len - 48; - byte* secret4 = secret + 80; - ulong inputLo2 = *(ulong*) input4; - byte* ptr6 = input4 + 8; - ulong inputHi2 = *(ulong*) ptr6; - - byte* ptr7 = secret4 + 8; - ulong rhs3 = inputHi2 ^ (*(ulong*) ptr7 - seed64); - ulong lhs3 = inputLo2 ^ (*(ulong*) secret4 + seed64); - uint128 ret3; - if (Bmi2.IsSupported) - { - ulong productLow2; - ulong productHigh2 = Bmi2.X64.MultiplyNoFlags(lhs3, rhs3, &productLow2); - uint128 r1213; - r1213.low64 = productLow2; - r1213.high64 = productHigh2; - ret3 = r1213; - } - else - { - ulong y12 = rhs3 & 0xFFFFFFFF; - ulong loLo2 = (ulong) (uint) (lhs3 & 0xFFFFFFFF) * (ulong) (uint) (y12); - ulong y13 = rhs3 & 0xFFFFFFFF; - ulong hiLo2 = (ulong) (uint) (lhs3 >> 32) * (ulong) (uint) (y13); - ulong y14 = rhs3 >> 32; - ulong loHi2 = (ulong) (uint) (lhs3 & 0xFFFFFFFF) * (ulong) (uint) (y14); - ulong y15 = rhs3 >> 32; - ulong hiHi2 = (ulong) (uint) (lhs3 >> 32) * (ulong) (uint) (y15); - - ulong cross3 = (loLo2 >> 32) + (hiLo2 & 0xFFFFFFFF) + loHi2; - ulong upper3 = (hiLo2 >> 32) + (cross3 >> 32) + hiHi2; - ulong lower3 = (cross3 << 32) | (loLo2 & 0xFFFFFFFF); - - uint128 r1214; - r1214.low64 = lower3; - r1214.high64 = upper3; - ret3 = r1214; - } - - uint128 product3 = ret3; - acc += product3.low64 ^ product3.high64; - } - - byte* input5 = input + 16; - byte* secret5 = secret + 32; - ulong inputLo3 = *(ulong*) input5; - byte* 
ptr8 = input5 + 8; - ulong inputHi3 = *(ulong*) ptr8; - - byte* ptr9 = secret5 + 8; - ulong rhs4 = inputHi3 ^ (*(ulong*) ptr9 - seed64); - ulong lhs4 = inputLo3 ^ (*(ulong*) secret5 + seed64); - uint128 ret4; - if (Bmi2.IsSupported) - { - ulong productLow3; - ulong productHigh3 = Bmi2.X64.MultiplyNoFlags(lhs4, rhs4, &productLow3); - uint128 r1215; - r1215.low64 = productLow3; - r1215.high64 = productHigh3; - ret4 = r1215; - } - else - { - ulong y16 = rhs4 & 0xFFFFFFFF; - ulong loLo3 = (ulong) (uint) (lhs4 & 0xFFFFFFFF) * (ulong) (uint) (y16); - ulong y17 = rhs4 & 0xFFFFFFFF; - ulong hiLo3 = (ulong) (uint) (lhs4 >> 32) * (ulong) (uint) (y17); - ulong y18 = rhs4 >> 32; - ulong loHi3 = (ulong) (uint) (lhs4 & 0xFFFFFFFF) * (ulong) (uint) (y18); - ulong y19 = rhs4 >> 32; - ulong hiHi3 = (ulong) (uint) (lhs4 >> 32) * (ulong) (uint) (y19); - - ulong cross4 = (loLo3 >> 32) + (hiLo3 & 0xFFFFFFFF) + loHi3; - ulong upper4 = (hiLo3 >> 32) + (cross4 >> 32) + hiHi3; - ulong lower4 = (cross4 << 32) | (loLo3 & 0xFFFFFFFF); - - uint128 r1216; - r1216.low64 = lower4; - r1216.high64 = upper4; - ret4 = r1216; - } - - uint128 product4 = ret4; - acc += product4.low64 ^ product4.high64; - byte* input6 = input + len - 32; - byte* secret6 = secret + 48; - ulong inputLo4 = *(ulong*) input6; - byte* ptr10 = input6 + 8; - ulong inputHi4 = *(ulong*) ptr10; - - byte* ptr11 = secret6 + 8; - ulong rhs5 = inputHi4 ^ (*(ulong*) ptr11 - seed64); - ulong lhs5 = inputLo4 ^ (*(ulong*) secret6 + seed64); - uint128 ret5; - if (Bmi2.IsSupported) - { - ulong productLow4; - ulong productHigh4 = Bmi2.X64.MultiplyNoFlags(lhs5, rhs5, &productLow4); - uint128 r1217; - r1217.low64 = productLow4; - r1217.high64 = productHigh4; - ret5 = r1217; - } - else - { - ulong y20 = rhs5 & 0xFFFFFFFF; - ulong loLo4 = (ulong) (uint) (lhs5 & 0xFFFFFFFF) * (ulong) (uint) (y20); - ulong y21 = rhs5 & 0xFFFFFFFF; - ulong hiLo4 = (ulong) (uint) (lhs5 >> 32) * (ulong) (uint) (y21); - ulong y22 = rhs5 >> 32; - ulong loHi4 = (ulong) 
(uint) (lhs5 & 0xFFFFFFFF) * (ulong) (uint) (y22); - ulong y23 = rhs5 >> 32; - ulong hiHi4 = (ulong) (uint) (lhs5 >> 32) * (ulong) (uint) (y23); - - ulong cross5 = (loLo4 >> 32) + (hiLo4 & 0xFFFFFFFF) + loHi4; - ulong upper5 = (hiLo4 >> 32) + (cross5 >> 32) + hiHi4; - ulong lower5 = (cross5 << 32) | (loLo4 & 0xFFFFFFFF); - - uint128 r1218; - r1218.low64 = lower5; - r1218.high64 = upper5; - ret5 = r1218; - } - - uint128 product5 = ret5; - acc += product5.low64 ^ product5.high64; - } - - byte* input7 = input + 0; - byte* secret7 = secret + 0; - ulong inputLo5 = *(ulong*) input7; - byte* ptr12 = input7 + 8; - ulong inputHi5 = *(ulong*) ptr12; - - byte* ptr13 = secret7 + 8; - ulong rhs6 = inputHi5 ^ (*(ulong*) ptr13 - seed64); - ulong lhs6 = inputLo5 ^ (*(ulong*) secret7 + seed64); - uint128 ret6; - if (Bmi2.IsSupported) - { - ulong productLow5; - ulong productHigh5 = Bmi2.X64.MultiplyNoFlags(lhs6, rhs6, &productLow5); - uint128 r1219; - r1219.low64 = productLow5; - r1219.high64 = productHigh5; - ret6 = r1219; - } - else - { - ulong y24 = rhs6 & 0xFFFFFFFF; - ulong loLo5 = (ulong) (uint) (lhs6 & 0xFFFFFFFF) * (ulong) (uint) (y24); - ulong y25 = rhs6 & 0xFFFFFFFF; - ulong hiLo5 = (ulong) (uint) (lhs6 >> 32) * (ulong) (uint) (y25); - ulong y26 = rhs6 >> 32; - ulong loHi5 = (ulong) (uint) (lhs6 & 0xFFFFFFFF) * (ulong) (uint) (y26); - ulong y27 = rhs6 >> 32; - ulong hiHi5 = (ulong) (uint) (lhs6 >> 32) * (ulong) (uint) (y27); - - ulong cross6 = (loLo5 >> 32) + (hiLo5 & 0xFFFFFFFF) + loHi5; - ulong upper6 = (hiLo5 >> 32) + (cross6 >> 32) + hiHi5; - ulong lower6 = (cross6 << 32) | (loLo5 & 0xFFFFFFFF); - - uint128 r1220; - r1220.low64 = lower6; - r1220.high64 = upper6; - ret6 = r1220; - } - - uint128 product6 = ret6; - acc += product6.low64 ^ product6.high64; - byte* input8 = input + len - 16; - byte* secret8 = secret + 16; - ulong inputLo6 = *(ulong*) input8; - byte* ptr14 = input8 + 8; - ulong inputHi6 = *(ulong*) ptr14; - - byte* ptr15 = secret8 + 8; - ulong rhs7 = 
inputHi6 ^ (*(ulong*) ptr15 - seed64); - ulong lhs7 = inputLo6 ^ (*(ulong*) secret8 + seed64); - uint128 ret7; - if (Bmi2.IsSupported) - { - ulong productLow6; - ulong productHigh6 = Bmi2.X64.MultiplyNoFlags(lhs7, rhs7, &productLow6); - uint128 r1221; - r1221.low64 = productLow6; - r1221.high64 = productHigh6; - ret7 = r1221; - } - else - { - ulong y28 = rhs7 & 0xFFFFFFFF; - ulong loLo6 = (ulong) (uint) (lhs7 & 0xFFFFFFFF) * (ulong) (uint) (y28); - ulong y29 = rhs7 & 0xFFFFFFFF; - ulong hiLo6 = (ulong) (uint) (lhs7 >> 32) * (ulong) (uint) (y29); - ulong y30 = rhs7 >> 32; - ulong loHi6 = (ulong) (uint) (lhs7 & 0xFFFFFFFF) * (ulong) (uint) (y30); - ulong y31 = rhs7 >> 32; - ulong hiHi6 = (ulong) (uint) (lhs7 >> 32) * (ulong) (uint) (y31); - - ulong cross7 = (loLo6 >> 32) + (hiLo6 & 0xFFFFFFFF) + loHi6; - ulong upper7 = (hiLo6 >> 32) + (cross7 >> 32) + hiHi6; - ulong lower7 = (cross7 << 32) | (loLo6 & 0xFFFFFFFF); - - uint128 r1222; - r1222.low64 = lower7; - r1222.high64 = upper7; - ret7 = r1222; - } - - uint128 product7 = ret7; - acc += product7.low64 ^ product7.high64; - ulong h64 = acc; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - return h64; - } - - if (len <= XXH3_MIDSIZE_MAX) - { - ulong acc = ((ulong) len) * XXH_PRIME64_1; - int nbRounds = len / 16; - - for (int i = 0; i < 8; i++) - { - byte* input1 = input + (16 * i); - byte* secret1 = secret + (16 * i); - ulong input_lo = *(ulong*) input1; - byte* ptr = input1 + 8; - ulong input_hi = *(ulong*) ptr; - - byte* ptr1 = secret1 + 8; - ulong rhs = input_hi ^ (*(ulong*) ptr1 - seed64); - ulong lhs = input_lo ^ (*(ulong*) secret1 + seed64); - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y); 
- ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - acc += product.low64 ^ product.high64; - } - - ulong h64 = acc; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - acc = h64; - - for (int i = 8; i < nbRounds; i++) - { - byte* input1 = input + (16 * i); - byte* secret1 = secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET; - ulong input_lo = *(ulong*) input1; - byte* ptr = input1 + 8; - ulong input_hi = *(ulong*) ptr; - - byte* ptr1 = secret1 + 8; - ulong rhs = input_hi ^ (*(ulong*) ptr1 - seed64); - ulong lhs = input_lo ^ (*(ulong*) secret1 + seed64); - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - 
uint128 product = ret; - acc += product.low64 ^ product.high64; - } - - byte* input2 = input + len - 16; - byte* secret2 = secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET; - ulong inputLo = *(ulong*) input2; - byte* ptr2 = input2 + 8; - ulong inputHi = *(ulong*) ptr2; - - byte* ptr3 = secret2 + 8; - ulong rhs1 = inputHi ^ (*(ulong*) ptr3 - seed64); - ulong lhs1 = inputLo ^ (*(ulong*) secret2 + seed64); - uint128 ret1; - if (Bmi2.IsSupported) - { - ulong productLow; - ulong productHigh = Bmi2.X64.MultiplyNoFlags(lhs1, rhs1, &productLow); - uint128 r129; - r129.low64 = productLow; - r129.high64 = productHigh; - ret1 = r129; - } - else - { - ulong y4 = rhs1 & 0xFFFFFFFF; - ulong loLo = (ulong) (uint) (lhs1 & 0xFFFFFFFF) * (ulong) (uint) (y4); - ulong y5 = rhs1 & 0xFFFFFFFF; - ulong hiLo = (ulong) (uint) (lhs1 >> 32) * (ulong) (uint) (y5); - ulong y6 = rhs1 >> 32; - ulong loHi = (ulong) (uint) (lhs1 & 0xFFFFFFFF) * (ulong) (uint) (y6); - ulong y7 = rhs1 >> 32; - ulong hiHi = (ulong) (uint) (lhs1 >> 32) * (ulong) (uint) (y7); - - ulong cross1 = (loLo >> 32) + (hiLo & 0xFFFFFFFF) + loHi; - ulong upper1 = (hiLo >> 32) + (cross1 >> 32) + hiHi; - ulong lower1 = (cross1 << 32) | (loLo & 0xFFFFFFFF); - - uint128 r1210; - r1210.low64 = lower1; - r1210.high64 = upper1; - ret1 = r1210; - } - - uint128 product1 = ret1; - acc += product1.low64 ^ product1.high64; - ulong h65 = acc; - h65 = h65 ^ (h65 >> 37); - h65 *= 0x165667919E3779F9UL; - h65 = h65 ^ (h65 >> 32); - return h65; - } - - if (seed64 == 0) - { - ulong* acc = stackalloc ulong[8]; - - fixed (ulong* ptr = &XXH3_INIT_ACC[0]) - { - acc[0] = ptr[0]; - acc[1] = ptr[1]; - acc[2] = ptr[2]; - acc[3] = ptr[3]; - acc[4] = ptr[4]; - acc[5] = ptr[5]; - acc[6] = ptr[6]; - acc[7] = ptr[7]; - } - - int nbStripesPerBlock = (secretLen - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - int block_len = XXH_STRIPE_LEN * nbStripesPerBlock; - int nb_blocks = (len - 1) / block_len; - - for (int n = 0; n < nb_blocks; n++) - { - byte* 
input1 = input + n * block_len; - for (int n1 = 0; n1 < nbStripesPerBlock; n1++) - { - byte* inp = input1 + n1 * XXH_STRIPE_LEN; - byte* secret1 = secret + n1 * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*) inp + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*) secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) inp + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xinput = inp; - byte* xsecret = secret1; - - 
byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong) (uint) (data_key & 0xFFFFFFFF) * (ulong) (uint) (y); - } - } - } - - byte* secret3 = secret + secretLen - XXH_STRIPE_LEN; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - - var prime32 = Vector256.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var shifted = Avx2.ShiftRightLogical(acc_vec, 47); - var data_vec = Avx2.Xor(acc_vec, shifted); - var key_vec = Avx2.LoadVector256((ulong*) secret3 + uint64_offset); - var data_key = Avx2.Xor(data_vec, key_vec).AsUInt32(); - var data_key_hi = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var prod_lo = Avx2.Multiply(data_key, prime32); - var prod_hi = Avx2.Multiply(data_key_hi, prime32); - var result = Avx2.Add(prod_lo, Avx2.ShiftLeftLogical(prod_hi, 32)); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - - var prime32 = Vector128.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + uint64_offset).AsUInt32(); - var shifted = Sse2.ShiftRightLogical(acc_vec, 47); - var data_vec = Sse2.Xor(acc_vec, shifted); - var key_vec = Sse2.LoadVector128((uint*) secret3 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_hi = Sse2.Shuffle(data_key.AsUInt32(), _MM_SHUFFLE_0_3_0_1); - var prod_lo = Sse2.Multiply(data_key, prime32); - var prod_hi = Sse2.Multiply(data_key_hi, prime32); - var result = Sse2.Add(prod_lo, Sse2.ShiftLeftLogical(prod_hi, 32)); - Sse2.Store(acc + 
uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xsecret = secret3; - - byte* ptr = xsecret + i * 8; - ulong key64 = *(ulong*) ptr; - ulong acc64 = xacc[i]; - acc64 = acc64 ^ (acc64 >> 47); - acc64 ^= key64; - acc64 *= XXH_PRIME32_1; - xacc[i] = acc64; - } - } - } - - int nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; - byte* input2 = input + nb_blocks * block_len; - for (int n2 = 0; n2 < nbStripes; n2++) - { - byte* inp1 = input2 + n2 * XXH_STRIPE_LEN; - byte* secret1 = secret + n2 * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*) inp1 + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*) secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) inp1 + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = 
Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xinput = inp1; - byte* xsecret = secret1; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong) (uint) (data_key & 0xFFFFFFFF) * (ulong) (uint) (y); - } - } - } - - byte* p = input + len - XXH_STRIPE_LEN; - byte* secret2 = secret + secretLen - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*) p + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*) secret2 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc + 
uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) p + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret2 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc; - byte* xinput = p; - byte* xsecret = secret2; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong) (uint) (data_key & 0xFFFFFFFF) * (ulong) (uint) (y); - } - } - - byte* secret4 = secret + XXH_SECRET_MERGEACCS_START; - ulong result64 = ((ulong) len) * XXH_PRIME64_1; - - for (int i1 = 0; i1 < 4; i1++) - { - ulong* acc2 = acc + 2 * i1; - byte* secret1 = secret4 + 16 * i1; - byte* ptr = secret1 + 8; - ulong rhs = acc2[1] ^ *(ulong*) ptr; - ulong lhs = acc2[0] ^ *(ulong*) secret1; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong 
lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - result64 += product.low64 ^ product.high64; - } - - ulong h64 = result64; - h64 = h64 ^ (h64 >> 37); - h64 *= 0x165667919E3779F9UL; - h64 = h64 ^ (h64 >> 32); - return h64; - } - - int customSecretSize = XXH3_SECRET_DEFAULT_SIZE; - byte* customSecret = stackalloc byte[customSecretSize]; - - fixed (byte* ptr = &XXH3_SECRET[0]) - { - for (int i1 = 0; i1 < customSecretSize; i1 += 8) - { - customSecret[i1] = ptr[i1]; - customSecret[i1 + 1] = ptr[i1 + 1]; - customSecret[i1 + 2] = ptr[i1 + 2]; - customSecret[i1 + 3] = ptr[i1 + 3]; - customSecret[i1 + 4] = ptr[i1 + 4]; - customSecret[i1 + 5] = ptr[i1 + 5]; - customSecret[i1 + 6] = ptr[i1 + 6]; - customSecret[i1 + 7] = ptr[i1 + 7]; - } - } - - if (Avx2.IsSupported) - { - const int m256i_size = 32; - - var seed = Vector256.Create(seed64, 0U - seed64, seed64, 0U - seed64); - - fixed (byte* secret1 = &XXH3_SECRET[0]) - { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m256i_size; i++) - { - int uint64_offset = i * 4; - - var src32 = Avx2.LoadVector256(((ulong*) secret1) + uint64_offset); - var dst32 = Avx2.Add(src32, seed); - Avx2.Store((ulong*) customSecret + uint64_offset, dst32); - } - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - - var seed = Vector128.Create((long) seed64, (long) (0U - seed64)); - - fixed (byte* secret1 = &XXH3_SECRET[0]) - { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m128i_size; i++) - { - int uint64_offset = i * 2; - - var src16 = Sse2.LoadVector128(((long*) secret1) + uint64_offset); - var dst16 = Sse2.Add(src16, seed); - Sse2.Store((long*) customSecret + uint64_offset, dst16); - } - } - } - else - { - fixed (byte* kSecretPtr = &XXH3_SECRET[0]) - { - int nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; - - for (int i = 0; i < nbRounds; i++) - { - byte* ptr = kSecretPtr + 16 * i; - ulong lo = *(ulong*) ptr + seed64; - byte* ptr1 
= kSecretPtr + 16 * i + 8; - ulong hi = *(ulong*) ptr1 - seed64; - byte* dst = (byte*) customSecret + 16 * i; - *(ulong*) dst = lo; - byte* dst1 = (byte*) customSecret + 16 * i + 8; - *(ulong*) dst1 = hi; - } - } - } - - ulong* acc1 = stackalloc ulong[8]; - - fixed (ulong* ptr16 = &XXH3_INIT_ACC[0]) - { - acc1[0] = ptr16[0]; - acc1[1] = ptr16[1]; - acc1[2] = ptr16[2]; - acc1[3] = ptr16[3]; - acc1[4] = ptr16[4]; - acc1[5] = ptr16[5]; - acc1[6] = ptr16[6]; - acc1[7] = ptr16[7]; - } - - int nbStripesPerBlock1 = (customSecretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - int blockLen = XXH_STRIPE_LEN * nbStripesPerBlock1; - int nbBlocks = (len - 1) / blockLen; - - for (int n1 = 0; n1 < nbBlocks; n1++) - { - byte* input1 = input + n1 * blockLen; - for (int n = 0; n < nbStripesPerBlock1; n++) - { - byte* inp = input1 + n * XXH_STRIPE_LEN; - byte* secret1 = customSecret + n * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc1 + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*) inp + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*) secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc1 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 
4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc1 + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) inp + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc1 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc1; - byte* xinput = inp; - byte* xsecret = secret1; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong) (uint) (data_key & 0xFFFFFFFF) * (ulong) (uint) (y); - } - } - } - - byte* secret2 = customSecret + customSecretSize - XXH_STRIPE_LEN; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - - var prime32 = Vector256.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc1 + uint64_offset); - var shifted = Avx2.ShiftRightLogical(acc_vec, 47); - var data_vec = Avx2.Xor(acc_vec, shifted); - var key_vec = Avx2.LoadVector256((ulong*) secret2 + uint64_offset); - var data_key = Avx2.Xor(data_vec, key_vec).AsUInt32(); - var data_key_hi = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var prod_lo = Avx2.Multiply(data_key, prime32); - var prod_hi = Avx2.Multiply(data_key_hi, prime32); - var result = Avx2.Add(prod_lo, Avx2.ShiftLeftLogical(prod_hi, 32)); - Avx2.Store(acc1 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 
0b0011_0001; - - var prime32 = Vector128.Create(XXH_PRIME32_1); - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc1 + uint64_offset).AsUInt32(); - var shifted = Sse2.ShiftRightLogical(acc_vec, 47); - var data_vec = Sse2.Xor(acc_vec, shifted); - var key_vec = Sse2.LoadVector128((uint*) secret2 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_hi = Sse2.Shuffle(data_key.AsUInt32(), _MM_SHUFFLE_0_3_0_1); - var prod_lo = Sse2.Multiply(data_key, prime32); - var prod_hi = Sse2.Multiply(data_key_hi, prime32); - var result = Sse2.Add(prod_lo, Sse2.ShiftLeftLogical(prod_hi, 32)); - Sse2.Store(acc1 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc1; - byte* xsecret = secret2; - - byte* ptr = xsecret + i * 8; - ulong key64 = *(ulong*) ptr; - ulong acc64 = xacc[i]; - acc64 = acc64 ^ (acc64 >> 47); - acc64 ^= key64; - acc64 *= XXH_PRIME32_1; - xacc[i] = acc64; - } - } - } - - int nbStripes1 = ((len - 1) - (blockLen * nbBlocks)) / XXH_STRIPE_LEN; - byte* input9 = input + nbBlocks * blockLen; - for (int n3 = 0; n3 < nbStripes1; n3++) - { - byte* inp2 = input9 + n3 * XXH_STRIPE_LEN; - byte* secret1 = customSecret + n3 * XXH_SECRET_CONSUME_RATE; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc1 + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*) inp2 + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*) secret1 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = 
Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc1 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc1 + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) inp2 + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret1 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc1 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc1; - byte* xinput = inp2; - byte* xsecret = secret1; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong) (uint) (data_key & 0xFFFFFFFF) * (ulong) (uint) (y); - } - } - } - - byte* p1 = input + len - XXH_STRIPE_LEN; - byte* secret9 = customSecret + customSecretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START; - if (Avx2.IsSupported) - { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; - - var acc_vec = Avx2.LoadVector256(acc1 + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*) p1 + uint32_offset); - var key_vec = 
Avx2.LoadVector256((uint*) secret9 + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc1 + uint64_offset, result); - } - } - else if (Sse2.IsSupported) - { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; - - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; - - var acc_vec = Sse2.LoadVector128(acc1 + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) p1 + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret9 + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc1 + uint64_offset, result); - } - } - else - { - for (int i = 0; i < XXH_ACC_NB; i++) - { - ulong* xacc = acc1; - byte* xinput = p1; - byte* xsecret = secret9; - - byte* ptr = xinput + i * 8; - ulong data_val = *(ulong*) ptr; - byte* ptr1 = xsecret + i * 8; - ulong data_key = data_val ^ *(ulong*) ptr1; - xacc[i ^ 1] += data_val; - ulong y = data_key >> 32; - xacc[i] += (ulong) (uint) (data_key & 0xFFFFFFFF) * (ulong) (uint) (y); - } - } - - byte* secret10 = customSecret + XXH_SECRET_MERGEACCS_START; - ulong result65 = ((ulong) len) * XXH_PRIME64_1; - - for (int i2 = 0; i2 < 4; i2++) - { - ulong* acc = acc1 + 2 * i2; - byte* secret1 = secret10 + 16 * i2; - byte* ptr = secret1 + 8; - ulong rhs = acc[1] ^ *(ulong*) ptr; - ulong lhs = acc[0] ^ *(ulong*) 
secret1; - uint128 ret; - if (Bmi2.IsSupported) - { - ulong product_low; - ulong product_high = Bmi2.X64.MultiplyNoFlags(lhs, rhs, &product_low); - uint128 r128; - r128.low64 = product_low; - r128.high64 = product_high; - ret = r128; - } - else - { - ulong y = rhs & 0xFFFFFFFF; - ulong lo_lo = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y); - ulong y1 = rhs & 0xFFFFFFFF; - ulong hi_lo = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y1); - ulong y2 = rhs >> 32; - ulong lo_hi = (ulong) (uint) (lhs & 0xFFFFFFFF) * (ulong) (uint) (y2); - ulong y3 = rhs >> 32; - ulong hi_hi = (ulong) (uint) (lhs >> 32) * (ulong) (uint) (y3); - - ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - uint128 r128; - r128.low64 = lower; - r128.high64 = upper; - ret = r128; - } - - uint128 product = ret; - result65 += product.low64 ^ product.high64; - } - - ulong h66 = result65; - h66 = h66 ^ (h66 >> 37); - h66 *= 0x165667919E3779F9UL; - h66 = h66 ^ (h66 >> 32); - return h66; - } - } -} - diff --git a/src/Standart.Hash.xxHash/xxHash128.XXH.cs b/src/Standart.Hash.xxHash/xxHash128.XXH.cs index 66d67f1..c1bb5a7 100644 --- a/src/Standart.Hash.xxHash/xxHash128.XXH.cs +++ b/src/Standart.Hash.xxHash/xxHash128.XXH.cs @@ -1,30 +1,40 @@ // ReSharper disable InconsistentNaming using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; namespace Standart.Hash.xxHash { public static partial class xxHash128 { - private const ulong XXH_PRIME64_1 = 11400714785074694791UL; - private const ulong XXH_PRIME64_2 = 14029467366897019727UL; - private const ulong XXH_PRIME64_3 = 1609587929392839161UL; - private const ulong XXH_PRIME64_4 = 9650029242287828579UL; - private const ulong XXH_PRIME64_5 = 2870177450012600261UL; - - private const uint XXH_PRIME32_1 = 2654435761U; - private const uint XXH_PRIME32_2 = 2246822519U; - private const uint 
XXH_PRIME32_3 = 3266489917U; - private const uint XXH_PRIME32_4 = 668265263U; - private const uint XXH_PRIME32_5 = 374761393U; + private static readonly ulong XXH_PRIME64_1 = 11400714785074694791UL; + private static readonly ulong XXH_PRIME64_2 = 14029467366897019727UL; + private static readonly ulong XXH_PRIME64_3 = 1609587929392839161UL; + private static readonly ulong XXH_PRIME64_4 = 9650029242287828579UL; + private static readonly ulong XXH_PRIME64_5 = 2870177450012600261UL; + + private static readonly uint XXH_PRIME32_1 = 2654435761U; + private static readonly uint XXH_PRIME32_2 = 2246822519U; + private static readonly uint XXH_PRIME32_3 = 3266489917U; + private static readonly uint XXH_PRIME32_4 = 668265263U; + private static readonly uint XXH_PRIME32_5 = 374761393U; + + private static readonly int XXH_STRIPE_LEN = 64; + private static readonly int XXH_ACC_NB = 8; + private static readonly int XXH_SECRET_CONSUME_RATE = 8; + private static readonly int XXH_SECRET_MERGEACCS_START = 11; + private static readonly int XXH_SECRET_DEFAULT_SIZE = 192; + private static readonly int XXH_SECRET_LASTACC_START = 7; + + private static readonly byte MM_SHUFFLE_0_3_0_1 = 0b0011_0001; + private static readonly byte MM_SHUFFLE_1_0_3_2 = 0b0100_1110; + + [FixedAddressValueType] + private static readonly Vector256<uint> M256i_XXH_PRIME32_1 = Vector256.Create(XXH_PRIME32_1); + [FixedAddressValueType] + private static readonly Vector128<uint> M128i_XXH_PRIME32_1 = Vector128.Create(XXH_PRIME32_1); - private const int XXH_STRIPE_LEN = 64; - private const int XXH_ACC_NB = XXH_STRIPE_LEN / 8; - private const int XXH_SECRET_CONSUME_RATE = 8; - private const int XXH_SECRET_MERGEACCS_START = 11; - private const int XXH_SECRET_DEFAULT_SIZE = 192; - private const int XXH_SECRET_LASTACC_START = 7; [MethodImpl(MethodImplOptions.AggressiveInlining)] private static uint XXH_rotl32(uint x, int r) diff --git a/src/Standart.Hash.xxHash/xxHash128.XXH3.cs b/src/Standart.Hash.xxHash/xxHash128.XXH3.cs index 
a31c115..ff559a9 100644 --- a/src/Standart.Hash.xxHash/xxHash128.XXH3.cs +++ b/src/Standart.Hash.xxHash/xxHash128.XXH3.cs @@ -9,7 +9,7 @@ namespace Standart.Hash.xxHash { public static partial class xxHash128 { - private static byte[] XXH3_SECRET = + private static readonly byte[] XXH3_SECRET = { 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, @@ -25,18 +25,18 @@ namespace Standart.Hash.xxHash 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, }; - private static ulong[] XXH3_INIT_ACC = + private static readonly ulong[] XXH3_INIT_ACC = { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }; - private const int XXH3_SECRET_SIZE_MIN = 136; - private const int XXH3_SECRET_DEFAULT_SIZE = 192; - private const int XXH3_MIDSIZE_MAX = 240; - private const int XXH3_MIDSIZE_STARTOFFSET = 3; - private const int XXH3_MIDSIZE_LASTOFFSET = 17; - private const int XXH3_ACC_SIZE = 64; + private static readonly int XXH3_SECRET_SIZE_MIN = 136; + private static readonly int XXH3_SECRET_DEFAULT_SIZE = 192; + private static readonly int XXH3_MIDSIZE_MAX = 240; + private static readonly int XXH3_MIDSIZE_STARTOFFSET = 3; + private static readonly int XXH3_MIDSIZE_LASTOFFSET = 17; + private static readonly int XXH3_ACC_SIZE = 64; [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe uint128 XXH3_128bits_internal(byte* input, int len, ulong seed, byte* secret, int secretLen) @@ -268,62 +268,50 @@ namespace Standart.Hash.xxHash uint128 product = XXH_mult64to128(lhs, rhs); return product.low64 ^ product.high64; } - + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe uint128 XXH3_hashLong_128b_withSeed(byte* input, int len, byte* secret, int secretSize, ulong seed) { if (seed == 0) return 
XXH3_hashLong_128b_internal(input, len, secret, secretSize); - int customSecretSize = XXH3_SECRET_DEFAULT_SIZE; - byte* customSecret = stackalloc byte[customSecretSize]; + byte* customSecret = stackalloc byte[XXH3_SECRET_DEFAULT_SIZE]; - fixed (byte* ptr = &XXH3_SECRET[0]) - { - for (int i = 0; i < customSecretSize; i += 8) - { - customSecret[i] = ptr[i]; - customSecret[i+1] = ptr[i+1]; - customSecret[i+2] = ptr[i+2]; - customSecret[i+3] = ptr[i+3]; - customSecret[i+4] = ptr[i+4]; - customSecret[i+5] = ptr[i+5]; - customSecret[i+6] = ptr[i+6]; - customSecret[i+7] = ptr[i+7]; - } - } XXH3_initCustomSecret(customSecret, seed); - - return XXH3_hashLong_128b_internal(input, len, customSecret, customSecretSize); + + return XXH3_hashLong_128b_internal(input, len, customSecret, XXH3_SECRET_DEFAULT_SIZE); + } - + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe uint128 XXH3_hashLong_128b_internal(byte* input, int len, byte* secret, int secretSize) { - ulong* acc = stackalloc ulong[8]; - - fixed (ulong* ptr = &XXH3_INIT_ACC[0]) + fixed (ulong* src = &XXH3_INIT_ACC[0]) { - acc[0] = ptr[0]; - acc[1] = ptr[1]; - acc[2] = ptr[2]; - acc[3] = ptr[3]; - acc[4] = ptr[4]; - acc[5] = ptr[5]; - acc[6] = ptr[6]; - acc[7] = ptr[7]; + ulong* acc = stackalloc ulong[8] + { + *(src + 0), + *(src + 1), + *(src + 2), + *(src + 3), + *(src + 4), + *(src + 5), + *(src + 6), + *(src + 7), + }; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize); + + uint128 uint128; + uint128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (ulong)len * XXH_PRIME64_1); + uint128.high64 = XXH3_mergeAccs(acc, + secret + secretSize - XXH3_ACC_SIZE - XXH_SECRET_MERGEACCS_START, + ~((ulong)len * XXH_PRIME64_2)); + + return uint128; } - XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize); - - uint128 uint128; - uint128.low64 = XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (ulong)len * XXH_PRIME64_1); - uint128.high64 = 
XXH3_mergeAccs(acc, - secret + secretSize - XXH3_ACC_SIZE - XXH_SECRET_MERGEACCS_START, - ~((ulong)len * XXH_PRIME64_2)); - - return uint128; } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -390,51 +378,89 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_accumulate_512_avx2(ulong* acc, byte* input, byte* secret) { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; + var acc_vec0 = Unsafe.Read>(acc + 0); + var acc_vec1 = Unsafe.Read>(acc + 4); - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; + var data_vec0 = Unsafe.Read>((ulong*)input + 0).AsUInt32(); + var data_vec1 = Unsafe.Read>((ulong*)input + 4).AsUInt32(); - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = Avx2.LoadVector256((uint*)input + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*)secret + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } + var key_vec0 = Unsafe.Read>((ulong*)secret + 0).AsUInt32(); + var key_vec1 = Unsafe.Read>((ulong*)secret + 4).AsUInt32(); + + var data_key0 = Avx2.Xor(data_vec0, key_vec0); + var data_key1 = Avx2.Xor(data_vec1, key_vec1); + + var data_key_lo0 = Avx2.Shuffle(data_key0, MM_SHUFFLE_0_3_0_1); + var data_key_lo1 = Avx2.Shuffle(data_key1, MM_SHUFFLE_0_3_0_1); + + var product0 = Avx2.Multiply(data_key0, data_key_lo0); + var product1 = Avx2.Multiply(data_key1, data_key_lo1); + + var data_swap0 = Avx2.Shuffle(data_vec0, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap1 = Avx2.Shuffle(data_vec1, 
MM_SHUFFLE_1_0_3_2).AsUInt64(); + + var sum0 = Avx2.Add(acc_vec0, data_swap0); + var sum1 = Avx2.Add(acc_vec1, data_swap1); + + var result0 = Avx2.Add(product0, sum0); + var result1 = Avx2.Add(product1, sum1); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 4, result1); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_accumulate_512_sse2(ulong* acc, byte* input, byte* secret) { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; + var acc_vec0 = Unsafe.Read>(acc + 0); + var acc_vec1 = Unsafe.Read>(acc + 2); + var acc_vec2 = Unsafe.Read>(acc + 4); + var acc_vec3 = Unsafe.Read>(acc + 6); - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; + var data_vec0 = Unsafe.Read>((ulong*)input + 0).AsUInt32(); + var data_vec1 = Unsafe.Read>((ulong*)input + 2).AsUInt32(); + var data_vec2 = Unsafe.Read>((ulong*)input + 4).AsUInt32(); + var data_vec3 = Unsafe.Read>((ulong*)input + 6).AsUInt32(); - var acc_vec = Sse2.LoadVector128(acc + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) input + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } + var key_vec0 = Unsafe.Read>((ulong*)secret + 0).AsUInt32(); + var key_vec1 = Unsafe.Read>((ulong*)secret + 2).AsUInt32(); + var key_vec2 = Unsafe.Read>((ulong*)secret + 4).AsUInt32(); + var key_vec3 = Unsafe.Read>((ulong*)secret + 6).AsUInt32(); + + var data_key0 = Sse2.Xor(data_vec0, key_vec0); + var data_key1 = Sse2.Xor(data_vec1, key_vec1); + var 
data_key2 = Sse2.Xor(data_vec2, key_vec2); + var data_key3 = Sse2.Xor(data_vec3, key_vec3); + + var data_key_lo0 = Sse2.Shuffle(data_key0, MM_SHUFFLE_0_3_0_1); + var data_key_lo1 = Sse2.Shuffle(data_key1, MM_SHUFFLE_0_3_0_1); + var data_key_lo2 = Sse2.Shuffle(data_key2, MM_SHUFFLE_0_3_0_1); + var data_key_lo3 = Sse2.Shuffle(data_key3, MM_SHUFFLE_0_3_0_1); + + var product0 = Sse2.Multiply(data_key0, data_key_lo0); + var product1 = Sse2.Multiply(data_key1, data_key_lo1); + var product2 = Sse2.Multiply(data_key2, data_key_lo2); + var product3 = Sse2.Multiply(data_key3, data_key_lo3); + + var data_swap0 = Sse2.Shuffle(data_vec0, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap1 = Sse2.Shuffle(data_vec1, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap2 = Sse2.Shuffle(data_vec2, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap3 = Sse2.Shuffle(data_vec3, MM_SHUFFLE_1_0_3_2).AsUInt64(); + + var sum0 = Sse2.Add(acc_vec0, data_swap0); + var sum1 = Sse2.Add(acc_vec1, data_swap1); + var sum2 = Sse2.Add(acc_vec2, data_swap2); + var sum3 = Sse2.Add(acc_vec3, data_swap3); + + var result0 = Sse2.Add(product0, sum0); + var result1 = Sse2.Add(product1, sum1); + var result2 = Sse2.Add(product2, sum2); + var result3 = Sse2.Add(product3, sum3); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 2, result1); + Unsafe.Write(acc + 4, result2); + Unsafe.Write(acc + 6, result3); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -473,52 +499,89 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_scrambleAcc_avx2(ulong* acc, byte* secret) { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; + var acc_vec0 = Unsafe.Read>(acc + 0); + var acc_vec1 = Unsafe.Read>(acc + 4); - var prime32 = Vector256.Create(XXH_PRIME32_1); + var shifted0 = Avx2.ShiftRightLogical(acc_vec0, 47); + var shifted1 = Avx2.ShiftRightLogical(acc_vec1, 47); - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int 
uint64_offset = i * 4; + var data_vec0 = Avx2.Xor(acc_vec0, shifted0); + var data_vec1 = Avx2.Xor(acc_vec1, shifted1); - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var shifted = Avx2.ShiftRightLogical(acc_vec, 47); - var data_vec = Avx2.Xor(acc_vec, shifted); - var key_vec = Avx2.LoadVector256((ulong*) secret + uint64_offset); - var data_key = Avx2.Xor(data_vec, key_vec).AsUInt32(); - var data_key_hi = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var prod_lo = Avx2.Multiply(data_key, prime32); - var prod_hi = Avx2.Multiply(data_key_hi, prime32); - var result = Avx2.Add(prod_lo, Avx2.ShiftLeftLogical(prod_hi, 32)); - Avx2.Store(acc + uint64_offset, result); - } + var key_vec0 = Unsafe.Read>((ulong*)secret + 0); + var key_vec1 = Unsafe.Read>((ulong*)secret + 4); + + var data_key0 = Avx2.Xor(data_vec0, key_vec0).AsUInt32(); + var data_key1 = Avx2.Xor(data_vec1, key_vec1).AsUInt32(); + + var data_key_hi0 = Avx2.Shuffle(data_key0, MM_SHUFFLE_0_3_0_1); + var data_key_hi1 = Avx2.Shuffle(data_key1, MM_SHUFFLE_0_3_0_1); + + var prod_lo0 = Avx2.Multiply(data_key0, M256i_XXH_PRIME32_1); + var prod_lo1 = Avx2.Multiply(data_key1, M256i_XXH_PRIME32_1); + + var prod_hi0 = Avx2.Multiply(data_key_hi0, M256i_XXH_PRIME32_1); + var prod_hi1 = Avx2.Multiply(data_key_hi1, M256i_XXH_PRIME32_1); + + var result0 = Avx2.Add(prod_lo0, Avx2.ShiftLeftLogical(prod_hi0, 32)); + var result1 = Avx2.Add(prod_lo1, Avx2.ShiftLeftLogical(prod_hi1, 32)); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 4, result1); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_scrambleAcc_sse2(ulong* acc, byte* secret) { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; + var acc_vec0 = Unsafe.Read>(acc + 0).AsUInt32(); + var acc_vec1 = Unsafe.Read>(acc + 2).AsUInt32(); + var acc_vec2 = Unsafe.Read>(acc + 4).AsUInt32(); + var acc_vec3 = Unsafe.Read>(acc + 6).AsUInt32(); - var prime32 = Vector128.Create(XXH_PRIME32_1); - - for (int 
i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; + var shifted0 = Sse2.ShiftRightLogical(acc_vec0, 47); + var shifted1 = Sse2.ShiftRightLogical(acc_vec1, 47); + var shifted2 = Sse2.ShiftRightLogical(acc_vec2, 47); + var shifted3 = Sse2.ShiftRightLogical(acc_vec3, 47); - var acc_vec = Sse2.LoadVector128(acc + uint64_offset).AsUInt32(); - var shifted = Sse2.ShiftRightLogical(acc_vec, 47); - var data_vec = Sse2.Xor(acc_vec, shifted); - var key_vec = Sse2.LoadVector128((uint*) secret + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_hi = Sse2.Shuffle(data_key.AsUInt32(), _MM_SHUFFLE_0_3_0_1); - var prod_lo = Sse2.Multiply(data_key, prime32); - var prod_hi = Sse2.Multiply(data_key_hi, prime32); - var result = Sse2.Add(prod_lo, Sse2.ShiftLeftLogical(prod_hi, 32)); - Sse2.Store(acc + uint64_offset, result); - } + var data_vec0 = Sse2.Xor(acc_vec0, shifted0); + var data_vec1 = Sse2.Xor(acc_vec1, shifted1); + var data_vec2 = Sse2.Xor(acc_vec2, shifted2); + var data_vec3 = Sse2.Xor(acc_vec3, shifted3); + + var key_vec0 = Unsafe.Read>((ulong*)secret + 0).AsUInt32(); + var key_vec1 = Unsafe.Read>((ulong*)secret + 2).AsUInt32(); + var key_vec2 = Unsafe.Read>((ulong*)secret + 4).AsUInt32(); + var key_vec3 = Unsafe.Read>((ulong*)secret + 6).AsUInt32(); + + var data_key0= Sse2.Xor(data_vec0, key_vec0); + var data_key1= Sse2.Xor(data_vec1, key_vec1); + var data_key2= Sse2.Xor(data_vec2, key_vec2); + var data_key3= Sse2.Xor(data_vec3, key_vec3); + + var data_key_hi0 = Sse2.Shuffle(data_key0.AsUInt32(), MM_SHUFFLE_0_3_0_1); + var data_key_hi1 = Sse2.Shuffle(data_key1.AsUInt32(), MM_SHUFFLE_0_3_0_1); + var data_key_hi2 = Sse2.Shuffle(data_key2.AsUInt32(), MM_SHUFFLE_0_3_0_1); + var data_key_hi3 = Sse2.Shuffle(data_key3.AsUInt32(), MM_SHUFFLE_0_3_0_1); + + var prod_lo0 = Sse2.Multiply(data_key0, M128i_XXH_PRIME32_1); + var prod_lo1 = Sse2.Multiply(data_key1, M128i_XXH_PRIME32_1); + var prod_lo2 = 
Sse2.Multiply(data_key2, M128i_XXH_PRIME32_1); + var prod_lo3 = Sse2.Multiply(data_key3, M128i_XXH_PRIME32_1); + + var prod_hi0 = Sse2.Multiply(data_key_hi0, M128i_XXH_PRIME32_1); + var prod_hi1 = Sse2.Multiply(data_key_hi1, M128i_XXH_PRIME32_1); + var prod_hi2 = Sse2.Multiply(data_key_hi2, M128i_XXH_PRIME32_1); + var prod_hi3 = Sse2.Multiply(data_key_hi3, M128i_XXH_PRIME32_1); + + var result0 = Sse2.Add(prod_lo0, Sse2.ShiftLeftLogical(prod_hi0, 32)); + var result1 = Sse2.Add(prod_lo1, Sse2.ShiftLeftLogical(prod_hi1, 32)); + var result2 = Sse2.Add(prod_lo2, Sse2.ShiftLeftLogical(prod_hi2, 32)); + var result3 = Sse2.Add(prod_lo3, Sse2.ShiftLeftLogical(prod_hi3, 32)); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 2, result1); + Unsafe.Write(acc + 4, result2); + Unsafe.Write(acc + 6, result3); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -558,20 +621,30 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_initCustomSecret_avx2(byte* customSecret, ulong seed64) { - const int m256i_size = 32; - - var seed = Vector256.Create((ulong)seed64, (ulong)(0U - seed64), (ulong)seed64, (ulong)(0U - seed64)); + var seed = Vector256.Create(seed64, (0U - seed64), seed64, (0U - seed64)); fixed (byte* secret = &XXH3_SECRET[0]) { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m256i_size; i++) - { - int uint64_offset = i * 4; + var src0 = Unsafe.Read>((ulong*)secret + 0); + var src1 = Unsafe.Read>((ulong*)secret + 4); + var src2 = Unsafe.Read>((ulong*)secret + 8); + var src3 = Unsafe.Read>((ulong*)secret + 12); + var src4 = Unsafe.Read>((ulong*)secret + 16); + var src5 = Unsafe.Read>((ulong*)secret + 20); - var src32 = Avx2.LoadVector256(((ulong*)secret) + uint64_offset); - var dst32 = Avx2.Add(src32, seed); - Avx2.Store((ulong*) customSecret + uint64_offset, dst32); - } + var dst0 = Avx2.Add(src0, seed); + var dst1 = Avx2.Add(src1, seed); + var dst2 = Avx2.Add(src2, seed); + var dst3 = Avx2.Add(src3, 
seed); + var dst4 = Avx2.Add(src4, seed); + var dst5 = Avx2.Add(src5, seed); + + Unsafe.Write((ulong*)customSecret + 0, dst0); + Unsafe.Write((ulong*)customSecret + 4, dst1); + Unsafe.Write((ulong*)customSecret + 8, dst2); + Unsafe.Write((ulong*)customSecret + 12, dst3); + Unsafe.Write((ulong*)customSecret + 16, dst4); + Unsafe.Write((ulong*)customSecret + 20, dst5); } } @@ -579,21 +652,48 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_initCustomSecret_sse2(byte* customSecret, ulong seed64) { - const int m128i_size = 16; - var seed = Vector128.Create((long)seed64, (long)(0U - seed64)); fixed (byte* secret = &XXH3_SECRET[0]) { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m128i_size; i++) - { - int uint64_offset = i * 2; + var src0 = Unsafe.Read>((long*)secret + 0); + var src1 = Unsafe.Read>((long*)secret + 2); + var src2 = Unsafe.Read>((long*)secret + 4); + var src3 = Unsafe.Read>((long*)secret + 6); + var src4 = Unsafe.Read>((long*)secret + 8); + var src5 = Unsafe.Read>((long*)secret + 10); + var src6 = Unsafe.Read>((long*)secret + 12); + var src7 = Unsafe.Read>((long*)secret + 14); + var src8 = Unsafe.Read>((long*)secret + 16); + var src9 = Unsafe.Read>((long*)secret + 18); + var src10 = Unsafe.Read>((long*)secret + 20); + var src11 = Unsafe.Read>((long*)secret + 22); - var src16 = Sse2.LoadVector128(((long*) secret) + uint64_offset); - var dst16 = Sse2.Add(src16, seed); - Sse2.Store((long*) customSecret + uint64_offset, dst16); - - } + var dst0 = Sse2.Add(src0, seed); + var dst1 = Sse2.Add(src1, seed); + var dst2 = Sse2.Add(src2, seed); + var dst3 = Sse2.Add(src3, seed); + var dst4 = Sse2.Add(src4, seed); + var dst5 = Sse2.Add(src5, seed); + var dst6 = Sse2.Add(src6, seed); + var dst7 = Sse2.Add(src7, seed); + var dst8 = Sse2.Add(src8, seed); + var dst9 = Sse2.Add(src9, seed); + var dst10 = Sse2.Add(src10, seed); + var dst11 = Sse2.Add(src11, seed); + + Unsafe.Write((long*)customSecret + 0, 
dst0); + Unsafe.Write((long*)customSecret + 2, dst1); + Unsafe.Write((long*)customSecret + 4, dst2); + Unsafe.Write((long*)customSecret + 6, dst3); + Unsafe.Write((long*)customSecret + 8, dst4); + Unsafe.Write((long*)customSecret + 10, dst5); + Unsafe.Write((long*)customSecret + 12, dst6); + Unsafe.Write((long*)customSecret + 14, dst7); + Unsafe.Write((long*)customSecret + 16, dst8); + Unsafe.Write((long*)customSecret + 18, dst9); + Unsafe.Write((long*)customSecret + 20, dst10); + Unsafe.Write((long*)customSecret + 22, dst11); } } diff --git a/src/Standart.Hash.xxHash/xxHash128.cs b/src/Standart.Hash.xxHash/xxHash128.cs index d1832a4..bf6c06d 100644 --- a/src/Standart.Hash.xxHash/xxHash128.cs +++ b/src/Standart.Hash.xxHash/xxHash128.cs @@ -166,10 +166,7 @@ namespace Standart.Hash.xxHash { fixed (byte* secret = &XXH3_SECRET[0]) { - // Use inlined version - // return XXH3_128bits_internal(input, len, seed, secret, XXH3_SECRET_DEFAULT_SIZE); - - return __inline__XXH3_128bits_internal(input, len, seed, secret, XXH3_SECRET_DEFAULT_SIZE); + return XXH3_128bits_internal(input, len, seed, secret, XXH3_SECRET_DEFAULT_SIZE); } } } diff --git a/src/Standart.Hash.xxHash/xxHash3.XXH.cs b/src/Standart.Hash.xxHash/xxHash3.XXH.cs index e3b4f2a..71f210d 100644 --- a/src/Standart.Hash.xxHash/xxHash3.XXH.cs +++ b/src/Standart.Hash.xxHash/xxHash3.XXH.cs @@ -1,30 +1,39 @@ // ReSharper disable InconsistentNaming using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; namespace Standart.Hash.xxHash { public static partial class xxHash3 { - private const ulong XXH_PRIME64_1 = 11400714785074694791UL; - private const ulong XXH_PRIME64_2 = 14029467366897019727UL; - private const ulong XXH_PRIME64_3 = 1609587929392839161UL; - private const ulong XXH_PRIME64_4 = 9650029242287828579UL; - private const ulong XXH_PRIME64_5 = 2870177450012600261UL; + private static readonly ulong XXH_PRIME64_1 = 11400714785074694791UL; + private static readonly 
ulong XXH_PRIME64_2 = 14029467366897019727UL; + private static readonly ulong XXH_PRIME64_3 = 1609587929392839161UL; + private static readonly ulong XXH_PRIME64_4 = 9650029242287828579UL; + private static readonly ulong XXH_PRIME64_5 = 2870177450012600261UL; - private const uint XXH_PRIME32_1 = 2654435761U; - private const uint XXH_PRIME32_2 = 2246822519U; - private const uint XXH_PRIME32_3 = 3266489917U; - private const uint XXH_PRIME32_4 = 668265263U; - private const uint XXH_PRIME32_5 = 374761393U; + private static readonly uint XXH_PRIME32_1 = 2654435761U; + private static readonly uint XXH_PRIME32_2 = 2246822519U; + private static readonly uint XXH_PRIME32_3 = 3266489917U; + private static readonly uint XXH_PRIME32_4 = 668265263U; + private static readonly uint XXH_PRIME32_5 = 374761393U; - private const int XXH_STRIPE_LEN = 64; - private const int XXH_ACC_NB = XXH_STRIPE_LEN / 8; - private const int XXH_SECRET_CONSUME_RATE = 8; - private const int XXH_SECRET_DEFAULT_SIZE = 192; - private const int XXH_SECRET_MERGEACCS_START = 11; - private const int XXH_SECRET_LASTACC_START = 7; + private static readonly int XXH_STRIPE_LEN = 64; + private static readonly int XXH_ACC_NB = XXH_STRIPE_LEN / 8; + private static readonly int XXH_SECRET_CONSUME_RATE = 8; + private static readonly int XXH_SECRET_DEFAULT_SIZE = 192; + private static readonly int XXH_SECRET_MERGEACCS_START = 11; + private static readonly int XXH_SECRET_LASTACC_START = 7; + + private static readonly byte MM_SHUFFLE_0_3_0_1 = 0b0011_0001; + private static readonly byte MM_SHUFFLE_1_0_3_2 = 0b0100_1110; + + [FixedAddressValueType] + private static readonly Vector256 M256i_XXH_PRIME32_1 = Vector256.Create(XXH_PRIME32_1); + [FixedAddressValueType] + private static readonly Vector128 M128i_XXH_PRIME32_1 = Vector128.Create(XXH_PRIME32_1); [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe ulong XXH_readLE64(byte* ptr) diff --git a/src/Standart.Hash.xxHash/xxHash3.XXH3.cs 
b/src/Standart.Hash.xxHash/xxHash3.XXH3.cs index de698f9..74a1ef6 100644 --- a/src/Standart.Hash.xxHash/xxHash3.XXH3.cs +++ b/src/Standart.Hash.xxHash/xxHash3.XXH3.cs @@ -8,7 +8,7 @@ namespace Standart.Hash.xxHash { public static partial class xxHash3 { - private static byte[] XXH3_SECRET = + private static readonly byte[] XXH3_SECRET = { 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, @@ -24,17 +24,17 @@ namespace Standart.Hash.xxHash 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, }; - private static ulong[] XXH3_INIT_ACC = + private static readonly ulong[] XXH3_INIT_ACC = { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }; - private const int XXH3_MIDSIZE_MAX = 240; - private const int XXH3_MIDSIZE_STARTOFFSET = 3; - private const int XXH3_MIDSIZE_LASTOFFSET = 17; - private const int XXH3_SECRET_SIZE_MIN = 136; - private const int XXH3_SECRET_DEFAULT_SIZE = 192; + private static readonly int XXH3_MIDSIZE_MAX = 240; + private static readonly int XXH3_MIDSIZE_STARTOFFSET = 3; + private static readonly int XXH3_MIDSIZE_LASTOFFSET = 17; + private static readonly int XXH3_SECRET_SIZE_MIN = 136; + private static readonly int XXH3_SECRET_DEFAULT_SIZE = 192; [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe ulong XXH3_64bits_internal(byte* input, int len, ulong seed64, byte* secret, @@ -201,27 +201,11 @@ namespace Standart.Hash.xxHash if (seed == 0) return XXH3_hashLong_64b_internal(input, len, secret, secretSize); - int customSecretSize = XXH3_SECRET_DEFAULT_SIZE; - byte* customSecret = stackalloc byte[customSecretSize]; - - fixed (byte* ptr = &XXH3_SECRET[0]) - { - for (int i = 0; i < customSecretSize; i += 8) - { - customSecret[i] = ptr[i]; - customSecret[i + 1] = ptr[i + 1]; - 
customSecret[i + 2] = ptr[i + 2]; - customSecret[i + 3] = ptr[i + 3]; - customSecret[i + 4] = ptr[i + 4]; - customSecret[i + 5] = ptr[i + 5]; - customSecret[i + 6] = ptr[i + 6]; - customSecret[i + 7] = ptr[i + 7]; - } - } + byte* customSecret = stackalloc byte[XXH3_SECRET_DEFAULT_SIZE]; XXH3_initCustomSecret(customSecret, seed); - return XXH3_hashLong_64b_internal(input, len, customSecret, customSecretSize); + return XXH3_hashLong_64b_internal(input, len, customSecret, XXH3_SECRET_DEFAULT_SIZE); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -238,40 +222,78 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_initCustomSecret_avx2(byte* customSecret, ulong seed64) { - const int m256i_size = 32; - - var seed = Vector256.Create(seed64, 0U - seed64, seed64, 0U - seed64); + var seed = Vector256.Create(seed64, (0U - seed64), seed64, (0U - seed64)); fixed (byte* secret = &XXH3_SECRET[0]) { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m256i_size; i++) - { - int uint64_offset = i * 4; + var src0 = Unsafe.Read>((ulong*)secret + 0); + var src1 = Unsafe.Read>((ulong*)secret + 4); + var src2 = Unsafe.Read>((ulong*)secret + 8); + var src3 = Unsafe.Read>((ulong*)secret + 12); + var src4 = Unsafe.Read>((ulong*)secret + 16); + var src5 = Unsafe.Read>((ulong*)secret + 20); - var src32 = Avx2.LoadVector256(((ulong*) secret) + uint64_offset); - var dst32 = Avx2.Add(src32, seed); - Avx2.Store((ulong*) customSecret + uint64_offset, dst32); - } + var dst0 = Avx2.Add(src0, seed); + var dst1 = Avx2.Add(src1, seed); + var dst2 = Avx2.Add(src2, seed); + var dst3 = Avx2.Add(src3, seed); + var dst4 = Avx2.Add(src4, seed); + var dst5 = Avx2.Add(src5, seed); + + Unsafe.Write((ulong*)customSecret + 0, dst0); + Unsafe.Write((ulong*)customSecret + 4, dst1); + Unsafe.Write((ulong*)customSecret + 8, dst2); + Unsafe.Write((ulong*)customSecret + 12, dst3); + Unsafe.Write((ulong*)customSecret + 16, dst4); + 
Unsafe.Write((ulong*)customSecret + 20, dst5); } } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_initCustomSecret_sse2(byte* customSecret, ulong seed64) { - const int m128i_size = 16; - - var seed = Vector128.Create((long) seed64, (long) (0U - seed64)); + var seed = Vector128.Create((long)seed64, (long)(0U - seed64)); fixed (byte* secret = &XXH3_SECRET[0]) { - for (int i = 0; i < XXH_SECRET_DEFAULT_SIZE / m128i_size; i++) - { - int uint64_offset = i * 2; + var src0 = Unsafe.Read>((long*)secret + 0); + var src1 = Unsafe.Read>((long*)secret + 2); + var src2 = Unsafe.Read>((long*)secret + 4); + var src3 = Unsafe.Read>((long*)secret + 6); + var src4 = Unsafe.Read>((long*)secret + 8); + var src5 = Unsafe.Read>((long*)secret + 10); + var src6 = Unsafe.Read>((long*)secret + 12); + var src7 = Unsafe.Read>((long*)secret + 14); + var src8 = Unsafe.Read>((long*)secret + 16); + var src9 = Unsafe.Read>((long*)secret + 18); + var src10 = Unsafe.Read>((long*)secret + 20); + var src11 = Unsafe.Read>((long*)secret + 22); - var src16 = Sse2.LoadVector128(((long*) secret) + uint64_offset); - var dst16 = Sse2.Add(src16, seed); - Sse2.Store((long*) customSecret + uint64_offset, dst16); - } + var dst0 = Sse2.Add(src0, seed); + var dst1 = Sse2.Add(src1, seed); + var dst2 = Sse2.Add(src2, seed); + var dst3 = Sse2.Add(src3, seed); + var dst4 = Sse2.Add(src4, seed); + var dst5 = Sse2.Add(src5, seed); + var dst6 = Sse2.Add(src6, seed); + var dst7 = Sse2.Add(src7, seed); + var dst8 = Sse2.Add(src8, seed); + var dst9 = Sse2.Add(src9, seed); + var dst10 = Sse2.Add(src10, seed); + var dst11 = Sse2.Add(src11, seed); + + Unsafe.Write((long*)customSecret + 0, dst0); + Unsafe.Write((long*)customSecret + 2, dst1); + Unsafe.Write((long*)customSecret + 4, dst2); + Unsafe.Write((long*)customSecret + 6, dst3); + Unsafe.Write((long*)customSecret + 8, dst4); + Unsafe.Write((long*)customSecret + 10, dst5); + Unsafe.Write((long*)customSecret + 12, dst6); + 
Unsafe.Write((long*)customSecret + 14, dst7); + Unsafe.Write((long*)customSecret + 16, dst8); + Unsafe.Write((long*)customSecret + 18, dst9); + Unsafe.Write((long*)customSecret + 20, dst10); + Unsafe.Write((long*)customSecret + 22, dst11); } } @@ -295,23 +317,24 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe ulong XXH3_hashLong_64b_internal(byte* input, int len, byte* secret, int secretSize) { - ulong* acc = stackalloc ulong[8]; - - fixed (ulong* ptr = &XXH3_INIT_ACC[0]) + fixed (ulong* src = &XXH3_INIT_ACC[0]) { - acc[0] = ptr[0]; - acc[1] = ptr[1]; - acc[2] = ptr[2]; - acc[3] = ptr[3]; - acc[4] = ptr[4]; - acc[5] = ptr[5]; - acc[6] = ptr[6]; - acc[7] = ptr[7]; + ulong* acc = stackalloc ulong[8] + { + *(src + 0), + *(src + 1), + *(src + 2), + *(src + 3), + *(src + 4), + *(src + 5), + *(src + 6), + *(src + 7), + }; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize); + + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, ((ulong)len) * XXH_PRIME64_1); } - - XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize); - - return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, ((ulong) len) * XXH_PRIME64_1); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -378,51 +401,89 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_accumulate_512_avx2(ulong* acc, byte* input, byte* secret) { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; + var acc_vec0 = Unsafe.Read>(acc + 0); + var acc_vec1 = Unsafe.Read>(acc + 4); - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint32_offset = i * 8; - int uint64_offset = i * 4; + var data_vec0 = Unsafe.Read>((ulong*)input + 0).AsUInt32(); + var data_vec1 = Unsafe.Read>((ulong*)input + 4).AsUInt32(); - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var data_vec = 
Avx2.LoadVector256((uint*) input + uint32_offset); - var key_vec = Avx2.LoadVector256((uint*) secret + uint32_offset); - var data_key = Avx2.Xor(data_vec, key_vec); - var data_key_lo = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Avx2.Multiply(data_key, data_key_lo); - var data_swap = Avx2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Avx2.Add(acc_vec, data_swap); - var result = Avx2.Add(product, sum); - Avx2.Store(acc + uint64_offset, result); - } + var key_vec0 = Unsafe.Read>((ulong*)secret + 0).AsUInt32(); + var key_vec1 = Unsafe.Read>((ulong*)secret + 4).AsUInt32(); + + var data_key0 = Avx2.Xor(data_vec0, key_vec0); + var data_key1 = Avx2.Xor(data_vec1, key_vec1); + + var data_key_lo0 = Avx2.Shuffle(data_key0, MM_SHUFFLE_0_3_0_1); + var data_key_lo1 = Avx2.Shuffle(data_key1, MM_SHUFFLE_0_3_0_1); + + var product0 = Avx2.Multiply(data_key0, data_key_lo0); + var product1 = Avx2.Multiply(data_key1, data_key_lo1); + + var data_swap0 = Avx2.Shuffle(data_vec0, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap1 = Avx2.Shuffle(data_vec1, MM_SHUFFLE_1_0_3_2).AsUInt64(); + + var sum0 = Avx2.Add(acc_vec0, data_swap0); + var sum1 = Avx2.Add(acc_vec1, data_swap1); + + var result0 = Avx2.Add(product0, sum0); + var result1 = Avx2.Add(product1, sum1); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 4, result1); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_accumulate_512_sse2(ulong* acc, byte* input, byte* secret) { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; - const byte _MM_SHUFFLE_1_0_3_2 = 0b0100_1110; + var acc_vec0 = Unsafe.Read>(acc + 0); + var acc_vec1 = Unsafe.Read>(acc + 2); + var acc_vec2 = Unsafe.Read>(acc + 4); + var acc_vec3 = Unsafe.Read>(acc + 6); - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; + var data_vec0 = Unsafe.Read>((ulong*)input + 0).AsUInt32(); + var data_vec1 = 
Unsafe.Read>((ulong*)input + 2).AsUInt32(); + var data_vec2 = Unsafe.Read>((ulong*)input + 4).AsUInt32(); + var data_vec3 = Unsafe.Read>((ulong*)input + 6).AsUInt32(); - var acc_vec = Sse2.LoadVector128(acc + uint64_offset); - var data_vec = Sse2.LoadVector128((uint*) input + uint32_offset); - var key_vec = Sse2.LoadVector128((uint*) secret + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_lo = Sse2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var product = Sse2.Multiply(data_key, data_key_lo); - var data_swap = Sse2.Shuffle(data_vec, _MM_SHUFFLE_1_0_3_2).AsUInt64(); - var sum = Sse2.Add(acc_vec, data_swap); - var result = Sse2.Add(product, sum); - Sse2.Store(acc + uint64_offset, result); - } + var key_vec0 = Unsafe.Read>((ulong*)secret + 0).AsUInt32(); + var key_vec1 = Unsafe.Read>((ulong*)secret + 2).AsUInt32(); + var key_vec2 = Unsafe.Read>((ulong*)secret + 4).AsUInt32(); + var key_vec3 = Unsafe.Read>((ulong*)secret + 6).AsUInt32(); + + var data_key0 = Sse2.Xor(data_vec0, key_vec0); + var data_key1 = Sse2.Xor(data_vec1, key_vec1); + var data_key2 = Sse2.Xor(data_vec2, key_vec2); + var data_key3 = Sse2.Xor(data_vec3, key_vec3); + + var data_key_lo0 = Sse2.Shuffle(data_key0, MM_SHUFFLE_0_3_0_1); + var data_key_lo1 = Sse2.Shuffle(data_key1, MM_SHUFFLE_0_3_0_1); + var data_key_lo2 = Sse2.Shuffle(data_key2, MM_SHUFFLE_0_3_0_1); + var data_key_lo3 = Sse2.Shuffle(data_key3, MM_SHUFFLE_0_3_0_1); + + var product0 = Sse2.Multiply(data_key0, data_key_lo0); + var product1 = Sse2.Multiply(data_key1, data_key_lo1); + var product2 = Sse2.Multiply(data_key2, data_key_lo2); + var product3 = Sse2.Multiply(data_key3, data_key_lo3); + + var data_swap0 = Sse2.Shuffle(data_vec0, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap1 = Sse2.Shuffle(data_vec1, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap2 = Sse2.Shuffle(data_vec2, MM_SHUFFLE_1_0_3_2).AsUInt64(); + var data_swap3 = Sse2.Shuffle(data_vec3, MM_SHUFFLE_1_0_3_2).AsUInt64(); + + var sum0 = 
Sse2.Add(acc_vec0, data_swap0); + var sum1 = Sse2.Add(acc_vec1, data_swap1); + var sum2 = Sse2.Add(acc_vec2, data_swap2); + var sum3 = Sse2.Add(acc_vec3, data_swap3); + + var result0 = Sse2.Add(product0, sum0); + var result1 = Sse2.Add(product1, sum1); + var result2 = Sse2.Add(product2, sum2); + var result3 = Sse2.Add(product3, sum3); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 2, result1); + Unsafe.Write(acc + 4, result2); + Unsafe.Write(acc + 6, result3); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -459,52 +520,89 @@ namespace Standart.Hash.xxHash [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_scrambleAcc_avx2(ulong* acc, byte* secret) { - const int m256i_size = 32; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; + var acc_vec0 = Unsafe.Read>(acc + 0); + var acc_vec1 = Unsafe.Read>(acc + 4); - var prime32 = Vector256.Create(XXH_PRIME32_1); + var shifted0 = Avx2.ShiftRightLogical(acc_vec0, 47); + var shifted1 = Avx2.ShiftRightLogical(acc_vec1, 47); - for (int i = 0; i < XXH_STRIPE_LEN / m256i_size; i++) - { - int uint64_offset = i * 4; + var data_vec0 = Avx2.Xor(acc_vec0, shifted0); + var data_vec1 = Avx2.Xor(acc_vec1, shifted1); - var acc_vec = Avx2.LoadVector256(acc + uint64_offset); - var shifted = Avx2.ShiftRightLogical(acc_vec, 47); - var data_vec = Avx2.Xor(acc_vec, shifted); - var key_vec = Avx2.LoadVector256((ulong*) secret + uint64_offset); - var data_key = Avx2.Xor(data_vec, key_vec).AsUInt32(); - var data_key_hi = Avx2.Shuffle(data_key, _MM_SHUFFLE_0_3_0_1); - var prod_lo = Avx2.Multiply(data_key, prime32); - var prod_hi = Avx2.Multiply(data_key_hi, prime32); - var result = Avx2.Add(prod_lo, Avx2.ShiftLeftLogical(prod_hi, 32)); - Avx2.Store(acc + uint64_offset, result); - } + var key_vec0 = Unsafe.Read>((ulong*)secret + 0); + var key_vec1 = Unsafe.Read>((ulong*)secret + 4); + + var data_key0 = Avx2.Xor(data_vec0, key_vec0).AsUInt32(); + var data_key1 = Avx2.Xor(data_vec1, key_vec1).AsUInt32(); 
+ + var data_key_hi0 = Avx2.Shuffle(data_key0, MM_SHUFFLE_0_3_0_1); + var data_key_hi1 = Avx2.Shuffle(data_key1, MM_SHUFFLE_0_3_0_1); + + var prod_lo0 = Avx2.Multiply(data_key0, M256i_XXH_PRIME32_1); + var prod_lo1 = Avx2.Multiply(data_key1, M256i_XXH_PRIME32_1); + + var prod_hi0 = Avx2.Multiply(data_key_hi0, M256i_XXH_PRIME32_1); + var prod_hi1 = Avx2.Multiply(data_key_hi1, M256i_XXH_PRIME32_1); + + var result0 = Avx2.Add(prod_lo0, Avx2.ShiftLeftLogical(prod_hi0, 32)); + var result1 = Avx2.Add(prod_lo1, Avx2.ShiftLeftLogical(prod_hi1, 32)); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 4, result1); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void XXH3_scrambleAcc_sse2(ulong* acc, byte* secret) { - const int m128i_size = 16; - const byte _MM_SHUFFLE_0_3_0_1 = 0b0011_0001; + var acc_vec0 = Unsafe.Read>(acc + 0).AsUInt32(); + var acc_vec1 = Unsafe.Read>(acc + 2).AsUInt32(); + var acc_vec2 = Unsafe.Read>(acc + 4).AsUInt32(); + var acc_vec3 = Unsafe.Read>(acc + 6).AsUInt32(); - var prime32 = Vector128.Create(XXH_PRIME32_1); + var shifted0 = Sse2.ShiftRightLogical(acc_vec0, 47); + var shifted1 = Sse2.ShiftRightLogical(acc_vec1, 47); + var shifted2 = Sse2.ShiftRightLogical(acc_vec2, 47); + var shifted3 = Sse2.ShiftRightLogical(acc_vec3, 47); - for (int i = 0; i < XXH_STRIPE_LEN / m128i_size; i++) - { - int uint32_offset = i * 4; - int uint64_offset = i * 2; + var data_vec0 = Sse2.Xor(acc_vec0, shifted0); + var data_vec1 = Sse2.Xor(acc_vec1, shifted1); + var data_vec2 = Sse2.Xor(acc_vec2, shifted2); + var data_vec3 = Sse2.Xor(acc_vec3, shifted3); - var acc_vec = Sse2.LoadVector128(acc + uint64_offset).AsUInt32(); - var shifted = Sse2.ShiftRightLogical(acc_vec, 47); - var data_vec = Sse2.Xor(acc_vec, shifted); - var key_vec = Sse2.LoadVector128((uint*) secret + uint32_offset); - var data_key = Sse2.Xor(data_vec, key_vec); - var data_key_hi = Sse2.Shuffle(data_key.AsUInt32(), _MM_SHUFFLE_0_3_0_1); - var prod_lo = 
Sse2.Multiply(data_key, prime32); - var prod_hi = Sse2.Multiply(data_key_hi, prime32); - var result = Sse2.Add(prod_lo, Sse2.ShiftLeftLogical(prod_hi, 32)); - Sse2.Store(acc + uint64_offset, result); - } + var key_vec0 = Unsafe.Read>((ulong*)secret + 0).AsUInt32(); + var key_vec1 = Unsafe.Read>((ulong*)secret + 2).AsUInt32(); + var key_vec2 = Unsafe.Read>((ulong*)secret + 4).AsUInt32(); + var key_vec3 = Unsafe.Read>((ulong*)secret + 6).AsUInt32(); + + var data_key0 = Sse2.Xor(data_vec0, key_vec0); + var data_key1 = Sse2.Xor(data_vec1, key_vec1); + var data_key2 = Sse2.Xor(data_vec2, key_vec2); + var data_key3 = Sse2.Xor(data_vec3, key_vec3); + + var data_key_hi0 = Sse2.Shuffle(data_key0.AsUInt32(), MM_SHUFFLE_0_3_0_1); + var data_key_hi1 = Sse2.Shuffle(data_key1.AsUInt32(), MM_SHUFFLE_0_3_0_1); + var data_key_hi2 = Sse2.Shuffle(data_key2.AsUInt32(), MM_SHUFFLE_0_3_0_1); + var data_key_hi3 = Sse2.Shuffle(data_key3.AsUInt32(), MM_SHUFFLE_0_3_0_1); + + var prod_lo0 = Sse2.Multiply(data_key0, M128i_XXH_PRIME32_1); + var prod_lo1 = Sse2.Multiply(data_key1, M128i_XXH_PRIME32_1); + var prod_lo2 = Sse2.Multiply(data_key2, M128i_XXH_PRIME32_1); + var prod_lo3 = Sse2.Multiply(data_key3, M128i_XXH_PRIME32_1); + + var prod_hi0 = Sse2.Multiply(data_key_hi0, M128i_XXH_PRIME32_1); + var prod_hi1 = Sse2.Multiply(data_key_hi1, M128i_XXH_PRIME32_1); + var prod_hi2 = Sse2.Multiply(data_key_hi2, M128i_XXH_PRIME32_1); + var prod_hi3 = Sse2.Multiply(data_key_hi3, M128i_XXH_PRIME32_1); + + var result0 = Sse2.Add(prod_lo0, Sse2.ShiftLeftLogical(prod_hi0, 32)); + var result1 = Sse2.Add(prod_lo1, Sse2.ShiftLeftLogical(prod_hi1, 32)); + var result2 = Sse2.Add(prod_lo2, Sse2.ShiftLeftLogical(prod_hi2, 32)); + var result3 = Sse2.Add(prod_lo3, Sse2.ShiftLeftLogical(prod_hi3, 32)); + + Unsafe.Write(acc + 0, result0); + Unsafe.Write(acc + 2, result1); + Unsafe.Write(acc + 4, result2); + Unsafe.Write(acc + 6, result3); } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git 
a/src/Standart.Hash.xxHash/xxHash3.cs b/src/Standart.Hash.xxHash/xxHash3.cs index 7255dbc..6f27344 100644 --- a/src/Standart.Hash.xxHash/xxHash3.cs +++ b/src/Standart.Hash.xxHash/xxHash3.cs @@ -89,10 +89,7 @@ namespace Standart.Hash.xxHash { fixed (byte* secret = &XXH3_SECRET[0]) { - // Use inlined version - // return XXH3_64bits_internal(input, len, seed, secret, XXH3_SECRET_DEFAULT_SIZE); - - return __inline__XXH3_64bits_internal(input, len, seed, secret, XXH3_SECRET_DEFAULT_SIZE); + return XXH3_64bits_internal(input, len, seed, secret, XXH3_SECRET_DEFAULT_SIZE); } } }