Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak

Issue Details

Contributes to #77034

Uses AVX-512 for

bool Test(byte[] src, byte[] dst) =>
    src.AsSpan(0, 100).SequenceEqual(dst);

Old codegen (Main):

; Method AlignedBenchmarks:Test(ubyte[],ubyte[]):bool:this
sub rsp, 40
test rdx, rdx
je SHORT G_M22779_IG09
cmp dword ptr [rdx+08H], 100
jb SHORT G_M22779_IG09
add rdx, 16
mov rcx, rdx
test r8, r8
jne SHORT G_M22779_IG04
xor rdx, rdx
xor eax, eax
jmp SHORT G_M22779_IG05
G_M22779_IG04:
lea rdx, bword ptr [r8+10H]
mov eax, dword ptr [r8+08H]
G_M22779_IG05:
cmp eax, 100
jne SHORT G_M22779_IG07
mov r8d, 100
call [System.SpanHelpers:SequenceEqual(byref,byref,ulong):bool]
jmp SHORT G_M22779_IG08
G_M22779_IG07:
xor eax, eax
G_M22779_IG08:
add rsp, 40
ret
G_M22779_IG09:
call [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
int3
; Total bytes of code: 74

New codegen:

; Method AlignedBenchmarks:Test(ubyte[],ubyte[]):bool:this
sub rsp, 40
vzeroupper
test rdx, rdx
je SHORT G_M22779_IG09
cmp dword ptr [rdx+08H], 100
jb SHORT G_M22779_IG09
add rdx, 16
test r8, r8
jne SHORT G_M22779_IG04
xor rax, rax
xor ecx, ecx
jmp SHORT G_M22779_IG05
G_M22779_IG04:
lea rax, bword ptr [r8+10H]
mov ecx, dword ptr [r8+08H]
G_M22779_IG05:
cmp ecx, 100
jne SHORT G_M22779_IG07
vmovups zmm0, zmmword ptr [rdx]      ; load src[0..63]
vmovups zmm1, zmmword ptr [rax]      ; load dst[0..63]
vmovups zmm2, zmmword ptr [rdx+24H]  ; load src[36..99] (overlaps the first load)
vmovups zmm3, zmmword ptr [rax+24H]  ; load dst[36..99]
vpxorq zmm0, zmm0, zmm1              ; xor: non-zero bits mark differing bytes
vpxorq zmm1, zmm2, zmm3
vporq zmm0, zmm0, zmm1               ; fold the two xor results
vxorps zmm1, zmm1, zmm1              ; zmm1 = 0
vpcmpuq k1, zmm0, zmm1, 0            ; k1[i] = 1 iff qword i of zmm0 == 0
kortestb k1, k1                      ; CF = 1 iff all 8 mask bits are set
setb al                              ; al = CF, i.e. 1 when the spans are equal
movzx rax, al
jmp SHORT G_M22779_IG08
G_M22779_IG07:
xor eax, eax
G_M22779_IG08:
vzeroupper
add rsp, 40
ret
G_M22779_IG09:
call [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
int3
; Total bytes of code: 138

Benchmarks:

using System;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
[CsvExporter]
public unsafe class AlignedBenchmarks
{
void* _src;
void* _dst;
[GlobalSetup]
public void GlobalSetup()
{
_src = NativeMemory.AlignedAlloc(1024, 64);
_dst = NativeMemory.AlignedAlloc(1024, 64);
}
[Benchmark] public bool Compare_32_aligned() =>
new Span<byte>(_src, 32).SequenceEqual(new Span<byte>(_dst, 32));
[Benchmark] public bool Compare_48_aligned() =>
new Span<byte>(_src, 48).SequenceEqual(new Span<byte>(_dst, 48));
[Benchmark] public bool Compare_64_aligned() =>
new Span<byte>(_src, 64).SequenceEqual(new Span<byte>(_dst, 64));
[Benchmark] public bool Compare_65_aligned() =>
new Span<byte>(_src, 65).SequenceEqual(new Span<byte>(_dst, 65));
[Benchmark] public bool Compare_100_aligned() =>
new Span<byte>(_src, 100).SequenceEqual(new Span<byte>(_dst, 100));
[Benchmark] public bool Compare_128_aligned() =>
new Span<byte>(_src, 128).SequenceEqual(new Span<byte>(_dst, 128));
[Benchmark] public bool Compare_129_aligned() =>
new Span<byte>(_src, 129).SequenceEqual(new Span<byte>(_dst, 129));
[Benchmark] public bool Compare_150_aligned() =>
new Span<byte>(_src, 150).SequenceEqual(new Span<byte>(_dst, 150));
[Benchmark] public bool Compare_200_aligned() =>
new Span<byte>(_src, 200).SequenceEqual(new Span<byte>(_dst, 200));
[GlobalCleanup]
public void GlobalCleanup()
{
NativeMemory.Free(_src);
NativeMemory.Free(_dst);
}
}

Also tested misaligned access and it was still faster than the baseline, although the penalty is noticeable.
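For reference, a minimal sketch of an entry point for the class above, since the snippet as posted has no Main (BenchmarkRunner.Run is the standard BenchmarkDotNet host; how the author actually ran it is not shown):

using BenchmarkDotNet.Running;

// Hosts the benchmark class above. BenchmarkDotNet discovers the [Benchmark]
// methods via reflection and runs each one in a separate, isolated process.
BenchmarkRunner.Run<AlignedBenchmarks>();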
/benchmark json aspnet-citrine-lin runtime
Benchmark started for json on aspnet-citrine-lin with runtime. Logs: link
A couple of diffs (size regressions, obviously). @dotnet/avx512-contrib PTAL, I've checked that this has no negative impact on TE using crank and local builds.
cc @kunalspathak, it never finished.

Contributes to #77034
Uses AVX-512 for SequenceEqual unrolling/vectorization in the [64..128] range, where previously we used to give up; see the old/new codegen and the benchmark code above.
(Benchmarks run on a Ryzen 7950X, where V512 is executed as V256*2.)
Len=128 has the lowest (best) timing because it's the best case for the algorithm: the two loads are perfectly aligned, since the data is always 64-byte aligned. For the same reason, Len=65 (just past 64) has the worst timing, as its second load starts at offset 1. I also tested a case where the source data is misaligned, and AVX-512 was still faster than the baseline.
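To make the shape of the unrolled comparison concrete, here is a sketch of the same overlapping two-load idea in managed code, using the Vector512 API available since .NET 8. This is illustrative only: the PR emits this pattern directly from the JIT rather than in C#, and the helper name SequenceEqual64To128 is made up for the example.

using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

static class OverlappingCompareSketch
{
    // Compares two equal-length spans with 64 < length <= 128 using exactly two
    // 64-byte loads per side: one at offset 0 and one ending at the last byte.
    // For length 128 the loads cover [0..64) and [64..128), so both are aligned
    // when the data is 64-byte aligned; for length 65 the second load starts at
    // offset 1, which is the worst case described above.
    static bool SequenceEqual64To128(ReadOnlySpan<byte> src, ReadOnlySpan<byte> dst)
    {
        nuint tail = (nuint)(src.Length - 64);
        ref byte s = ref MemoryMarshal.GetReference(src);
        ref byte d = ref MemoryMarshal.GetReference(dst);

        // XOR marks differing bytes; OR folds the two chunks so a single
        // compare-to-zero decides the result, mirroring the
        // vpxorq/vporq/vpcmpuq/kortestb sequence in the new codegen.
        Vector512<byte> diff =
            (Vector512.LoadUnsafe(ref s) ^ Vector512.LoadUnsafe(ref d)) |
            (Vector512.LoadUnsafe(ref s, tail) ^ Vector512.LoadUnsafe(ref d, tail));

        return diff == Vector512<byte>.Zero;
    }
}

Folding with a single OR before the compare means only one mask test (kortestb) is needed instead of two, which is presumably why the codegen ORs the two XOR results rather than comparing each 64-byte chunk separately.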