-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Performance improvements to vectorized Span.Reverse #78650
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Tagging subscribers to this area: @dotnet/area-system-memory Issue Details: See original PR: #70944 I reverted the overlapping for int and long and chose to keep it in-between for byte/char, where overlapping still shows an improvement. Machine: BenchmarkDotNet=v0.13.2, OS=Windows 11 (10.0.22621.819)
AMD Ryzen 7 3700X, 1 CPU, 16 logical and 8 physical cores
.NET SDK=7.0.100
[Host] : .NET 7.0.0 (7.0.22.51805), X64 RyuJIT AVX2
Job-AORABE : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-BUGVTU : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2 Byte[HideColumns("Job", "Error", "StdDev", "Median", "RatioSD")]
// Micro-benchmark that reverses a byte array in place through Span<byte>.Reverse().
// The array length is driven by NumberOfValues, one benchmark run per listed size.
public class ReverseBytes
{
    // Input buffer; assigned in Setup() before the benchmark method runs.
    private byte[] _values;

    // Element counts to benchmark.
    [Params(16, 24, 32, 36, 38, 40, 42, 44, 48, 56, 64)]
    public int NumberOfValues { get; set; }

    // Builds the input array of NumberOfValues bytes via ValuesGenerator.
    [GlobalSetup]
    public void Setup() => _values = ValuesGenerator.Array<byte>(NumberOfValues);

    // The measured operation: reverse the whole array in place.
    [Benchmark]
    public void Reverse()
    {
        _values.AsSpan().Reverse();
    }
}
Char[HideColumns("Job", "Error", "StdDev", "Median", "RatioSD")]
// Micro-benchmark that reverses a char array in place through Span<char>.Reverse().
// The array length is driven by NumberOfValues, one benchmark run per listed size.
public class ReverseChars
{
    // Input buffer; assigned in Setup() before the benchmark method runs.
    private char[] _values;

    // Element counts to benchmark.
    [Params(8, 12, 16, 20, 24, 28, 32, 40, 48)]
    public int NumberOfValues { get; set; }

    // Builds the input array of NumberOfValues chars.
    // ValuesGenerator is taken from dotnet/performance.
    [GlobalSetup]
    public void Setup() => _values = ValuesGenerator.Array<char>(NumberOfValues);

    // The measured operation: reverse the whole array in place.
    [Benchmark]
    public void Reverse()
    {
        _values.AsSpan().Reverse();
    }
}
Int
Long
@adamsitnik can you confirm whether you get similar numbers on your benchmark? I think we saw some slight differences between our numbers last time.
|
nint offset = 0; | ||
|
||
// overlapping has a positive performance benefit around 48 elements | ||
if (Avx2.IsSupported && remainder >= Vector256<byte>.Count * 1.5) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is going to do floating point math. That does not look cheap.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was a bit too fast on the keyboard, didn't check the assembly output. It only evaluates it as a constant if I use (nint)(Vector256<byte>.Count * 1.5)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
> It only evaluates it as a constant if I use (nint)(Vector256<byte>.Count * 1.5)
just Vector256<byte>.Count * 1.5
also evaluates into a constant, but a FP constant => datasection memory load
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tier 1 codegen:
.NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
; System.SpanHelpers.Reverse(Byte ByRef, UIntPtr)
vzeroupper
mov rax,rdx
xor r8d,r8d
cmp rax,30
jge near ptr M00_L08
cmp rdx,20
jl short M00_L01
lea rax,[rdx-10]
M00_L00:
vmovdqu xmm0,xmmword ptr [rcx+r8]
vmovdqu xmm1,xmmword ptr [rcx+rax]
vpshufb xmm0,xmm0,[7FFEDA47F5A0]
vpshufb xmm1,xmm1,[7FFEDA47F5A0]
vmovdqu xmmword ptr [rcx+r8],xmm1
vmovdqu xmmword ptr [rcx+rax],xmm0
add r8,10
add rax,0FFFFFFFFFFFFFFF0
cmp rax,r8
jge short M00_L00
add rax,10
sub rax,r8
M00_L01:
cmp rax,8
jl short M00_L03
mov rax,rdx
sub rax,r8
add rax,0FFFFFFFFFFFFFFF8
M00_L02:
mov r9,[rcx+r8]
mov r10,[rcx+rax]
movbe [rcx+r8],r10
movbe [rcx+rax],r9
add r8,8
add rax,0FFFFFFFFFFFFFFF8
cmp rax,r8
jge short M00_L02
add rax,8
sub rax,r8
M00_L03:
cmp rax,4
jl short M00_L05
sub rdx,r8
add rdx,0FFFFFFFFFFFFFFFC
M00_L04:
mov eax,[rcx+r8]
mov r9d,[rcx+rdx]
movbe [rcx+r8],r9d
movbe [rcx+rdx],eax
add r8,4
add rdx,0FFFFFFFFFFFFFFFC
cmp rdx,r8
jge short M00_L04
lea rax,[rdx+4]
sub rax,r8
M00_L05:
cmp rax,1
jle short M00_L07
add rcx,r8
lea rax,[rcx+rax+System.Object.Finalize()]
M00_L06:
movzx edx,byte ptr [rcx]
movzx r8d,byte ptr [rax]
mov [rcx],r8b
mov [rax],dl
inc rcx
dec rax
cmp rcx,rax
jb short M00_L06
M00_L07:
vzeroupper
ret
M00_L08:
lea rax,[rdx-20]
M00_L09:
vmovdqu ymm0,ymmword ptr [rcx+r8]
vmovdqu ymm1,ymmword ptr [rcx+rax]
vmovupd ymm2,[7FFEDA47F5C0]
vpshufb ymm0,ymm0,ymm2
vperm2i128 ymm0,ymm0,ymm0,1
vmovupd ymm2,[7FFEDA47F5C0]
vpshufb ymm1,ymm1,ymm2
vperm2i128 ymm1,ymm1,ymm1,1
vmovdqu ymmword ptr [rcx+r8],ymm1
vmovdqu ymmword ptr [rcx+rax],ymm0
add r8,20
add rax,0FFFFFFFFFFFFFFE0
cmp rax,r8
jge short M00_L09
add rax,20
sub rax,r8
jmp near ptr M00_L01
; Total bytes of code 327
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The changes and the benchmark numbers LGTM, thank you @yesmey !
BenchmarkDotNet=v0.13.2.1950-nightly, OS=Windows 11 (10.0.22621.819)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=8.0.100-alpha.1.22570.9
[Host] : .NET 8.0.0 (8.0.22.55902), X64 RyuJIT AVX2
Job-EFMIQM : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-WVFVYW : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
LaunchCount=3 MemoryRandomization=True
Type | Job | Size | Mean | Ratio |
---|---|---|---|---|
Span<Byte> | PR | 4 | 3.525 ns | 0.90 |
Span<Byte> | main | 4 | 3.935 ns | 1.00 |
Span<Char> | PR | 4 | 4.050 ns | 0.96 |
Span<Char> | main | 4 | 4.220 ns | 1.00 |
Span<Int32> | PR | 4 | 3.721 ns | 0.99 |
Span<Int32> | main | 4 | 3.777 ns | 1.00 |
Span<Int64> | PR | 4 | 3.468 ns | 0.88 |
Span<Int64> | main | 4 | 3.936 ns | 1.00 |
Span<Byte> | PR | 8 | 3.513 ns | 0.67 |
Span<Byte> | main | 8 | 5.237 ns | 1.00 |
Span<Char> | PR | 8 | 5.800 ns | 1.00 |
Span<Char> | main | 8 | 5.841 ns | 1.00 |
Span<Int32> | PR | 8 | 3.458 ns | 0.94 |
Span<Int32> | main | 8 | 3.669 ns | 1.00 |
Span<Int64> | PR | 8 | 3.495 ns | 0.83 |
Span<Int64> | main | 8 | 4.187 ns | 1.00 |
Span<Byte> | PR | 16 | 3.534 ns | 0.35 |
Span<Byte> | main | 16 | 10.259 ns | 1.00 |
Span<Char> | PR | 16 | 4.002 ns | 0.83 |
Span<Char> | main | 16 | 4.832 ns | 1.00 |
Span<Int32> | PR | 16 | 3.527 ns | 0.92 |
Span<Int32> | main | 16 | 3.852 ns | 1.00 |
Span<Int64> | PR | 16 | 4.217 ns | 0.82 |
Span<Int64> | main | 16 | 5.128 ns | 1.00 |
Span<Byte> | PR | 24 | 3.996 ns | 0.34 |
Span<Byte> | main | 24 | 11.935 ns | 1.00 |
Span<Char> | PR | 24 | 4.679 ns | 0.64 |
Span<Char> | main | 24 | 7.353 ns | 1.00 |
Span<Int32> | PR | 24 | 3.943 ns | 0.59 |
Span<Int32> | main | 24 | 6.690 ns | 1.00 |
Span<Int64> | PR | 24 | 4.935 ns | 0.81 |
Span<Int64> | main | 24 | 6.075 ns | 1.00 |
Span<Byte> | PR | 32 | 3.559 ns | 0.88 |
Span<Byte> | main | 32 | 4.044 ns | 1.00 |
Span<Char> | PR | 32 | 4.278 ns | 0.83 |
Span<Char> | main | 32 | 5.152 ns | 1.00 |
Span<Int32> | PR | 32 | 3.971 ns | 0.84 |
Span<Int32> | main | 32 | 4.755 ns | 1.00 |
Span<Int64> | PR | 32 | 5.630 ns | 0.83 |
Span<Int64> | main | 32 | 6.799 ns | 1.00 |
Span<Byte> | PR | 36 | 4.103 ns | 0.80 |
Span<Byte> | main | 36 | 5.149 ns | 1.00 |
Span<Char> | PR | 36 | 5.525 ns | 0.92 |
Span<Char> | main | 36 | 5.974 ns | 1.00 |
Span<Int32> | PR | 36 | 5.759 ns | 0.92 |
Span<Int32> | main | 36 | 6.291 ns | 1.00 |
Span<Int64> | PR | 36 | 6.242 ns | 0.84 |
Span<Int64> | main | 36 | 7.471 ns | 1.00 |
Span<Byte> | PR | 38 | 4.381 ns | 0.74 |
Span<Byte> | main | 38 | 5.911 ns | 1.00 |
Span<Char> | PR | 38 | 6.316 ns | 0.96 |
Span<Char> | main | 38 | 6.575 ns | 1.00 |
Span<Int32> | PR | 38 | 6.303 ns | 0.93 |
Span<Int32> | main | 38 | 6.781 ns | 1.00 |
Span<Int64> | PR | 38 | 6.236 ns | 0.81 |
Span<Int64> | main | 38 | 7.689 ns | 1.00 |
Span<Byte> | PR | 40 | 4.153 ns | 0.63 |
Span<Byte> | main | 40 | 6.637 ns | 1.00 |
Span<Char> | PR | 40 | 7.144 ns | 0.98 |
Span<Char> | main | 40 | 7.289 ns | 1.00 |
Span<Int32> | PR | 40 | 4.690 ns | 0.62 |
Span<Int32> | main | 40 | 7.618 ns | 1.00 |
Span<Int64> | PR | 40 | 6.253 ns | 0.82 |
Span<Int64> | main | 40 | 7.597 ns | 1.00 |
Span<Byte> | PR | 42 | 4.475 ns | 0.56 |
Span<Byte> | main | 42 | 8.054 ns | 1.00 |
Span<Char> | PR | 42 | 9.112 ns | 1.12 |
Span<Char> | main | 42 | 8.159 ns | 1.00 |
Span<Int32> | PR | 42 | 5.279 ns | 0.59 |
Span<Int32> | main | 42 | 8.975 ns | 1.00 |
Span<Int64> | PR | 42 | 6.724 ns | 0.88 |
Span<Int64> | main | 42 | 7.674 ns | 1.00 |
Span<Byte> | PR | 44 | 4.190 ns | 0.44 |
Span<Byte> | main | 44 | 9.675 ns | 1.00 |
Span<Char> | PR | 44 | 9.887 ns | 0.92 |
Span<Char> | main | 44 | 10.800 ns | 1.00 |
Span<Int32> | PR | 44 | 5.317 ns | 0.55 |
Span<Int32> | main | 44 | 9.719 ns | 1.00 |
Span<Int64> | PR | 44 | 6.931 ns | 0.86 |
Span<Int64> | main | 44 | 8.054 ns | 1.00 |
Span<Byte> | PR | 48 | 4.644 ns | 0.42 |
Span<Byte> | main | 48 | 11.215 ns | 1.00 |
Span<Char> | PR | 48 | 5.343 ns | 0.42 |
Span<Char> | main | 48 | 12.655 ns | 1.00 |
Span<Int32> | PR | 48 | 4.915 ns | 0.85 |
Span<Int32> | main | 48 | 5.781 ns | 1.00 |
Span<Int64> | PR | 48 | 7.562 ns | 0.90 |
Span<Int64> | main | 48 | 8.376 ns | 1.00 |
Span<Byte> | PR | 56 | 4.616 ns | 0.34 |
Span<Byte> | main | 56 | 13.658 ns | 1.00 |
Span<Char> | PR | 56 | 5.355 ns | 0.34 |
Span<Char> | main | 56 | 15.667 ns | 1.00 |
Span<Int32> | PR | 56 | 5.782 ns | 0.69 |
Span<Int32> | main | 56 | 8.357 ns | 1.00 |
Span<Int64> | PR | 56 | 7.494 ns | 0.83 |
Span<Int64> | main | 56 | 9.032 ns | 1.00 |
Span<Byte> | PR | 64 | 4.108 ns | 0.92 |
Span<Byte> | main | 64 | 4.458 ns | 1.00 |
Span<Char> | PR | 64 | 5.358 ns | 0.88 |
Span<Char> | main | 64 | 6.077 ns | 1.00 |
Span<Int32> | PR | 64 | 5.806 ns | 0.88 |
Span<Int32> | main | 64 | 6.600 ns | 1.00 |
Span<Int64> | PR | 64 | 7.711 ns | 0.79 |
Span<Int64> | main | 64 | 9.736 ns | 1.00 |
Span<Byte> | PR | 512 | 9.985 ns | 0.90 |
Span<Byte> | main | 512 | 11.143 ns | 1.00 |
Span<Char> | PR | 512 | 17.165 ns | 0.90 |
Span<Char> | main | 512 | 19.078 ns | 1.00 |
Span<Int32> | PR | 512 | 29.440 ns | 0.99 |
Span<Int32> | main | 512 | 29.709 ns | 1.00 |
Span<Int64> | PR | 512 | 62.140 ns | 1.00 |
Span<Int64> | main | 512 | 62.204 ns | 1.00 |
} | ||
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<byte>.Count * 2 <= length) | ||
else if (Vector128.IsHardwareAccelerated && remainder >= Vector128<byte>.Count * 2) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was not sure whether we need the cast here too, so I've run the following command to get the disassembly for the Vector128 code path (COMPlus_EnableAVX2:0):
py .\scripts\benchmarks_ci.py -f net8.0 --filter *Span<byte>.Reverse*42* --corerun D:\projects\forks\pr78650\artifacts\bin\testhost\net7.0-windows-Release-x64\shared\Microsoft.NETCore.App\8.0.0\corerun.exe --bdn-arguments "--join true --memoryRandomization true --disasmFilter *SpanHelpers*Reverse* --envVars COMPlus_EnableAVX2:0"
The value is const, we are good:
.NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX
; System.SpanHelpers.Reverse(Byte ByRef, UIntPtr)
vzeroupper
mov rax,rdx
xor r8d,r8d
cmp rax,20
jge near ptr M00_L07
M00_L00:
cmp rax,8
jl short M00_L02
mov rax,rdx
sub rax,r8
add rax,0FFFFFFFFFFFFFFF8
M00_L01:
mov r9,[rcx+r8]
mov r10,[rcx+rax]
movbe [rcx+r8],r10
movbe [rcx+rax],r9
add r8,8
add rax,0FFFFFFFFFFFFFFF8
cmp rax,r8
jge short M00_L01
add rax,8
sub rax,r8
M00_L02:
cmp rax,4
jl short M00_L04
sub rdx,r8
add rdx,0FFFFFFFFFFFFFFFC
M00_L03:
mov eax,[rcx+r8]
mov r9d,[rcx+rdx]
movbe [rcx+r8],r9d
movbe [rcx+rdx],eax
add r8,4
add rdx,0FFFFFFFFFFFFFFFC
cmp rdx,r8
jge short M00_L03
lea rax,[rdx+4]
sub rax,r8
M00_L04:
cmp rax,1
jle short M00_L06
add rcx,r8
lea rax,[rcx+rax+System.Object.Finalize()]
M00_L05:
movzx edx,byte ptr [rcx]
movzx r8d,byte ptr [rax]
mov [rcx],r8b
mov [rax],dl
inc rcx
dec rax
cmp rcx,rax
jb short M00_L05
M00_L06:
ret
M00_L07:
lea rax,[rdx-10]
M00_L08:
vmovdqu xmm0,xmmword ptr [rcx+r8]
vmovdqu xmm1,xmmword ptr [rcx+rax]
vpshufb xmm0,xmm0,[7FFE70D0F170]
vpshufb xmm1,xmm1,[7FFE70D0F170]
vmovdqu xmmword ptr [rcx+r8],xmm1
vmovdqu xmmword ptr [rcx+rax],xmm0
add r8,10
add rax,0FFFFFFFFFFFFFFF0
cmp rax,r8
jge short M00_L08
add rax,10
sub rax,r8
jmp near ptr M00_L00
; Total bytes of code 234
the perf numbers also look good:
BenchmarkDotNet=v0.13.2.1950-nightly, OS=Windows 11 (10.0.22621.819)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=8.0.100-alpha.1.22570.9
[Host] : .NET 8.0.0 (8.0.22.55902), X64 RyuJIT AVX2
Job-FOAVXK : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX
Job-YMSIID : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX
EnvironmentVariables=COMPlus_EnableAVX2=0 MemoryRandomization=True
Type | Job | Size | Mean | Ratio |
---|---|---|---|---|
Span<Byte> | PR | 4 | 3.564 ns | 0.90 |
Span<Byte> | main | 4 | 3.982 ns | 1.00 |
Span<Char> | PR | 4 | 3.861 ns | 0.98 |
Span<Char> | main | 4 | 3.927 ns | 1.00 |
Span<Int32> | PR | 4 | 3.792 ns | 1.01 |
Span<Int32> | main | 4 | 3.747 ns | 1.00 |
Span<Int64> | PR | 4 | 3.173 ns | 0.84 |
Span<Int64> | main | 4 | 3.775 ns | 1.00 |
Span<Byte> | PR | 8 | 3.556 ns | 0.70 |
Span<Byte> | main | 8 | 5.093 ns | 1.00 |
Span<Char> | PR | 8 | 5.611 ns | 1.00 |
Span<Char> | main | 8 | 5.594 ns | 1.00 |
Span<Int32> | PR | 8 | 3.548 ns | 0.95 |
Span<Int32> | main | 8 | 3.742 ns | 1.00 |
Span<Int64> | PR | 8 | 4.049 ns | 0.85 |
Span<Int64> | main | 8 | 4.752 ns | 1.00 |
Span<Byte> | PR | 16 | 3.483 ns | 0.34 |
Span<Byte> | main | 16 | 10.388 ns | 1.00 |
Span<Char> | PR | 16 | 3.538 ns | 0.93 |
Span<Char> | main | 16 | 3.817 ns | 1.00 |
Span<Int32> | PR | 16 | 4.000 ns | 0.90 |
Span<Int32> | main | 16 | 4.405 ns | 1.00 |
Span<Int64> | PR | 16 | 5.392 ns | 0.88 |
Span<Int64> | main | 16 | 6.098 ns | 1.00 |
Span<Byte> | PR | 24 | 3.969 ns | 0.34 |
Span<Byte> | main | 24 | 11.721 ns | 1.00 |
Span<Char> | PR | 24 | 4.265 ns | 0.59 |
Span<Char> | main | 24 | 7.262 ns | 1.00 |
Span<Int32> | PR | 24 | 4.662 ns | 0.87 |
Span<Int32> | main | 24 | 5.381 ns | 1.00 |
Span<Int64> | PR | 24 | 6.465 ns | 0.88 |
Span<Int64> | main | 24 | 7.378 ns | 1.00 |
Span<Byte> | PR | 32 | 3.771 ns | 0.92 |
Span<Byte> | main | 32 | 4.104 ns | 1.00 |
Span<Char> | PR | 32 | 4.269 ns | 0.92 |
Span<Char> | main | 32 | 4.658 ns | 1.00 |
Span<Int32> | PR | 32 | 5.676 ns | 0.91 |
Span<Int32> | main | 32 | 6.244 ns | 1.00 |
Span<Int64> | PR | 32 | 8.414 ns | 0.97 |
Span<Int64> | main | 32 | 8.717 ns | 1.00 |
Span<Byte> | PR | 36 | 4.296 ns | 0.84 |
Span<Byte> | main | 36 | 5.105 ns | 1.00 |
Span<Char> | PR | 36 | 6.253 ns | 0.99 |
Span<Char> | main | 36 | 6.292 ns | 1.00 |
Span<Int32> | PR | 36 | 6.709 ns | 0.87 |
Span<Int32> | main | 36 | 7.759 ns | 1.00 |
Span<Int64> | PR | 36 | 9.138 ns | 1.00 |
Span<Int64> | main | 36 | 9.099 ns | 1.00 |
Span<Byte> | PR | 38 | 4.407 ns | 0.74 |
Span<Byte> | main | 38 | 5.994 ns | 1.00 |
Span<Char> | PR | 38 | 8.025 ns | 1.16 |
Span<Char> | main | 38 | 6.929 ns | 1.00 |
Span<Int32> | PR | 38 | 6.798 ns | 0.81 |
Span<Int32> | main | 38 | 8.483 ns | 1.00 |
Span<Int64> | PR | 38 | 8.607 ns | 0.92 |
Span<Int64> | main | 38 | 9.311 ns | 1.00 |
Span<Byte> | PR | 40 | 4.279 ns | 0.65 |
Span<Byte> | main | 40 | 6.634 ns | 1.00 |
Span<Char> | PR | 40 | 5.902 ns | 0.65 |
Span<Char> | main | 40 | 9.069 ns | 1.00 |
Span<Int32> | PR | 40 | 6.633 ns | 0.92 |
Span<Int32> | main | 40 | 7.203 ns | 1.00 |
Span<Int64> | PR | 40 | 8.678 ns | 0.88 |
Span<Int64> | main | 40 | 9.856 ns | 1.00 |
Span<Byte> | PR | 42 | 4.415 ns | 0.55 |
Span<Byte> | main | 42 | 7.962 ns | 1.00 |
Span<Char> | PR | 42 | 5.812 ns | 0.58 |
Span<Char> | main | 42 | 10.202 ns | 1.00 |
Span<Int32> | PR | 42 | 7.659 ns | 0.98 |
Span<Int32> | main | 42 | 7.776 ns | 1.00 |
Span<Int64> | PR | 42 | 9.069 ns | 0.91 |
Span<Int64> | main | 42 | 9.913 ns | 1.00 |
Span<Byte> | PR | 44 | 4.275 ns | 0.44 |
Span<Byte> | main | 44 | 9.826 ns | 1.00 |
Span<Char> | PR | 44 | 5.811 ns | 0.55 |
Span<Char> | main | 44 | 10.529 ns | 1.00 |
Span<Int32> | PR | 44 | 7.434 ns | 0.91 |
Span<Int32> | main | 44 | 8.209 ns | 1.00 |
Span<Int64> | PR | 44 | 8.993 ns | 0.86 |
Span<Int64> | main | 44 | 10.410 ns | 1.00 |
Span<Byte> | PR | 48 | 4.523 ns | 0.40 |
Span<Byte> | main | 48 | 11.309 ns | 1.00 |
Span<Char> | PR | 48 | 5.825 ns | 1.01 |
Span<Char> | main | 48 | 5.739 ns | 1.00 |
Span<Int32> | PR | 48 | 7.413 ns | 0.95 |
Span<Int32> | main | 48 | 7.763 ns | 1.00 |
Span<Int64> | PR | 48 | 9.778 ns | 0.87 |
Span<Int64> | main | 48 | 11.274 ns | 1.00 |
Span<Byte> | PR | 56 | 4.523 ns | 0.33 |
Span<Byte> | main | 56 | 13.947 ns | 1.00 |
Span<Char> | PR | 56 | 7.008 ns | 0.78 |
Span<Char> | main | 56 | 9.051 ns | 1.00 |
Span<Int32> | PR | 56 | 8.326 ns | 0.99 |
Span<Int32> | main | 56 | 8.383 ns | 1.00 |
Span<Int64> | PR | 56 | 11.643 ns | 0.92 |
Span<Int64> | main | 56 | 12.645 ns | 1.00 |
Span<Byte> | PR | 64 | 4.549 ns | 0.88 |
Span<Byte> | main | 64 | 5.173 ns | 1.00 |
Span<Char> | PR | 64 | 6.989 ns | 1.05 |
Span<Char> | main | 64 | 6.634 ns | 1.00 |
Span<Int32> | PR | 64 | 8.592 ns | 0.94 |
Span<Int32> | main | 64 | 9.124 ns | 1.00 |
Span<Int64> | PR | 64 | 11.862 ns | 0.85 |
Span<Int64> | main | 64 | 13.894 ns | 1.00 |
Span<Byte> | PR | 512 | 14.294 ns | 0.94 |
Span<Byte> | main | 512 | 15.121 ns | 1.00 |
Span<Char> | PR | 512 | 26.252 ns | 1.00 |
Span<Char> | main | 512 | 26.226 ns | 1.00 |
Span<Int32> | PR | 512 | 40.489 ns | 0.85 |
Span<Int32> | main | 512 | 48.029 ns | 1.00 |
Span<Int64> | PR | 512 | 82.763 ns | 0.93 |
Span<Int64> | main | 512 | 88.969 ns | 1.00 |
I was going to check it on arm64 too, but the VM I am using is super slow today so I won't be able to do that:
See original PR: #70944
Reverted: #78605
I reverted the overlapping for int and long and chose to keep it in-between for byte/char, where overlapping still shows an improvement.
Machine:
Byte
Char
Int
Long
@adamsitnik can you confirm whether you get similar numbers on your benchmark? I think we saw some slight differences between our numbers last time.