-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Faster unsigned division by constants #49585
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Still a draft so I'm not going to review in depth yet. I'd definitely be interested in seeing some perf numbers and assembly diffs 😄 I'd also be interested in how this compares to say |
Somebody smarter than me needs to validate license information. There's license text at the top of https://github.com/ridiculousfish/libdivide/blob/master/doc/divide_by_constants_codegen_reference.c which does not match the license text within https://github.com/ridiculousfish/libdivide/blob/master/LICENSE.txt. |
That should be the current implementation from dotnet/coreclr#10996 (at least the comments in code referenced the same paper by Torbjorn Granlund and Peter L. Montgomery). This is the first time I'm dealing with ARM assembly and I'm having trouble figuring out how to emit conditional |
The repo license is for the |
@tannergooding here are some diffs highlighting the change. I'll try to get a PMI diff also. ASM diff for DateTime.GetDatePart (DateTime.Year) mov rcx, 0xD1FFAB1E
mov rax, rdx
mul rdx:rax, rcx
shr rdx, 39
mov ecx, edx
- mov edx, 0xD1FFAB1E
mov eax, ecx
- mul edx:eax, edx
- mov r9d, edx
- shr r9d, 15
+ imul r9, rax, 0xD1FFAB1E
+ shr r9, 47
imul eax, r9d, 0x23AB1
sub ecx, eax
- mov edx, 0xD1FFAB1E
mov eax, ecx
- mul edx:eax, edx
+ shr eax, 2
+ imul r10, rax, 0xD1FFAB1E
+ shr r10, 43
- mov r10d, ecx
- sub r10d, edx
- shr r10d, 1
- add r10d, edx
- shr r10d, 15
cmp r10d, 4
jne SHORT G_M19507_IG04
- ;; bbWeight=1 PerfScore 19.75
+ ;; bbWeight=1 PerfScore 16.25
G_M19507_IG03:
mov r10d, 3
;; bbWeight=0.50 PerfScore 0.13
G_M19507_IG04:
imul eax, r10d, 0x8EAC
sub ecx, eax
- mov edx, 0xD1FFAB1E
mov eax, ecx
- mul edx:eax, edx
- mov r11d, edx
- shr r11d, 7
+ imul r11, rax, 0xD1FFAB1E
+ shr r11, 39
imul eax, r11d, 0x5B5
sub ecx, eax
- mov edx, 0xD1FFAB1E
- mov eax, ecx
- mul edx:eax, edx
+ mov rdx, 0xD1FFAB1E
mov eax, ecx
- sub eax, edx
- shr eax, 1
- add edx, eax
- shr edx, 8
+ mul rdx:rax, rdx
cmp edx, 4
jne SHORT G_M19507_IG06
- ;; bbWeight=1 PerfScore 15.25
+ ;; bbWeight=1 PerfScore 12.00
G_M19507_IG05:
mov edx, 3
;; bbWeight=0.50 PerfScore 0.13
G_M19507_IG06:
test r8d, r8d
@@ -8869,11 +8813,11 @@
cmp eax, r9d
jae SHORT G_M19507_IG20
movsxd r10, eax
cmp dword ptr [rdx+4*r10+16], ecx
ja SHORT G_M19507_IG16
- align [0 bytes]
+ align [10 bytes]
;; bbWeight=0.50 PerfScore 3.88
G_M19507_IG15:
inc eax
cmp eax, r9d
jae SHORT G_M19507_IG20
@@ -8904,37 +8848,35 @@
G_M19507_IG20:
call CORINFO_HELP_RNGCHKFAIL
int3
;; bbWeight=0 PerfScore 0.00
-; Total bytes of code 354, prolog size 7, PerfScore 110.65, instruction count 106, allocated bytes for code 354 (MethodHash=710eb3cc) for method DateTime:GetDatePart(int):int:this
+; Total bytes of code 344, prolog size 7, PerfScore 102.90, instruction count 93, allocated bytes for code 344 (MethodHash=710eb3cc) for method DateTime:GetDatePart(int):int:this Diffs for UDIV by 3, 7, 14, 56 and 64-bit 7, 14, 56 ; 3 (normal divisor; but can be improved by converting to 64bit MUL)
- mov edx, 0xD1FFAB1E
- mov eax, ecx
- mul edx:eax, edx
- mov eax, edx
- shr eax, 1
+ mov eax, 0xD1FFAB1E
+ mov edx, ecx
+ imul rax, rdx
+ shr rax, 33
-; Total bytes of code 14, prolog size 0, PerfScore 6.65, instruction count 6
+; Total bytes of code 16, prolog size 0, PerfScore 5.60, instruction count 5
; 7 (uncooperative divisor; after extending to 64bit becomes easy)
- mov edx, 0xD1FFAB1E
+ mov rdx, 0xD1FFAB1E
mov eax, ecx
- mul edx:eax, edx
- sub ecx, edx
- shr ecx, 1
- lea eax, [rcx+rdx]
- shr eax, 2
+ mul rdx:rax, rdx
+ mov eax, edx
-; Total bytes of code 20, prolog size 0, PerfScore 8.25, instruction count 8
+; Total bytes of code 18, prolog size 0, PerfScore 6.55, instruction count 5
; 14 (needs multiple shifts; after extending to 64bit becomes easy)
- mov edx, 0xD1FFAB1E
+ mov rdx, 0xD1FFAB1E
mov eax, ecx
- mul edx:eax, edx
- sub ecx, edx
- shr ecx, 1
- lea eax, [rcx+rdx]
- shr eax, 3
+ mul rdx:rax, rdx
+ mov eax, edx
-; Total bytes of code 20, prolog size 0, PerfScore 8.25, instruction count 8
+; Total bytes of code 18, prolog size 0, PerfScore 6.55, instruction count 5
; 56 (needs a pre-shift)
- mov edx, 0xD1FFAB1E
- mov eax, ecx
- mul edx:eax, edx
- sub ecx, edx
- shr ecx, 1
- lea eax, [rcx+rdx]
- shr eax, 5
+ shr ecx, 3
+ imul rax, rcx, 0xD1FFAB1E
+ shr rax, 32
-; Total bytes of code 20, prolog size 0, PerfScore 8.25, instruction count 8
+; Total bytes of code 15, prolog size 0, PerfScore 5.50, instruction count 4
; 7ul (uncooperative divisor; but improves with saturating increment)
mov rdx, 0xD1FFAB1E
mov rax, rcx
+ add rax, 1
+ sbb rax, 0
mul rdx:rax, rdx
- sub rcx, rdx
- shr rcx, 1
- lea rax, [rcx+rdx]
+ mov rax, rdx
shr rax, 2
-; Total bytes of code 31, prolog size 0, PerfScore 9.35, instruction count 8
+; Total bytes of code 32, prolog size 0, PerfScore 8.95, instruction count 8
; 14ul (needs multiple shifts)
mov rdx, 0xD1FFAB1E
mov rax, rcx
+ shr rax, 1
mul rdx:rax, rdx
- sub rcx, rdx
- shr rcx, 1
- lea rax, [rcx+rdx]
- shr rax, 3
+ mov rax, rdx
+ shr rax, 1
-; Total bytes of code 31, prolog size 0, PerfScore 9.35, instruction count 8
+; Total bytes of code 26, prolog size 0, PerfScore 8.35, instruction count 7
; 56ul (needs a pre-shift)
mov rdx, 0xD1FFAB1E
mov rax, rcx
+ shr rax, 3
mul rdx:rax, rdx
- sub rcx, rdx
- shr rcx, 1
- lea rax, [rcx+rdx]
- shr rax, 5
+ mov rax, rdx
-; Total bytes of code 31, prolog size 0, PerfScore 9.35, instruction count 8
+; Total bytes of code 24, prolog size 0, PerfScore 7.65, instruction count 6 |
Nice work! Are there any jit-diffs? E.g. via superpmi:
I assume failures on CI are related |
@EgorBo Is it expected that running these commands takes a day to run? So far it has downloaded 8 GBs of mch files and produced a couple of diff outputs: aspnet.run.windows.x64.checked
benchmarks.run.windows.x64.checked
libraries.crossgen.windows.x64.checked
libraries.crossgen2.windows.x64.checked
libraries.pmi.windows.x64.checked
Tests have a ton of these errors: tests.pmi.windows.x64.checked
|
Running
Can't get |
Yep, the SPMI diffs are known to take a very long time. PMI diffs also take a lot of time.
I don't think that's right. "Core root" is a special test folder, see about it here. BTW, I believe the weird version is because it is searching for a prerelease version, and all of them have some combination of You could also try to diff the perf scores: |
PerfScore metrics need this fix first #49620 (otherwise Managed to get jit-diff -f
jit-diff -f --pmi
Analyzed some of the regressions and they come from LSRA using different registers now that Utf8Formatter:TryFormat +110: https://www.diffchecker.com/K4Bjp6kS BigNumber:FormatBigInteger -62: https://www.diffchecker.com/t7nMVJQY Instruction count change is more clearly in one direction: jit-diff -f -m InstrCount
|
@tannergooding could you review this now or is there something else that I should add? |
I will be able to give this a review tomorrow. Assigning myself. |
The math in https://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf looks to check out. That being said, it looks like LLVM, GCC, nor MSVC have adopted this nearly 10 years later and I can't seem to find any other major compile that has done so either. It's unclear to me why given the problem space this is attempting to solve. |
I guess someone has to be the first for progress to occur. The math/proofs look solid to me and the gains are real. |
Thanks for the references. I've not finished going through the math in the actual PR yet but hope to have time to finish that this week. If no one else has any concerns, I'd be fine with bringing this forward and getting the right sign-off from the JIT side. |
@dotnet/jit-contrib PTAL. |
ping @dotnet/jit-contrib, this still needs the JIT side changes reviewed. |
@EgorBo can you shepherd this one? |
I tried to write a bit more complicated tests (e.g. https://gist.github.com/EgorBo/c576fe71bc928c337713c6970949ff14) but couldn't find any regressions (on xarch). Overall, assuming that the License is used correctly - LGTM |
The bugs are for libdivide library which is not used here, the reference code for the original paper is just hosted in the same repo. |
So can this be merged? (I don't have write access) |
/azp run runtime-coreclr jitstress, runtime-coreclr outerloop |
Azure Pipelines successfully started running 2 pipeline(s). |
I've queued the outerloop jobs for some additional validation and will set the auto-merge label. |
Hello @tannergooding! Because this pull request has the p.s. you can customize the way I help with merging this pull request, such as holding this pull request until a specific person approves. Simply @mention me (
|
That didn't quite work like I intended -.- Will check the job results in the morning, hopefully nothing crops up. |
@tannergooding I think something has cropped up.
|
This repos locally on TIP but builds without issue on the previous commit. |
This reverts commit e4b4807.
This is some new check from #52661 and it looks like it's not an actual bug. I'll rebase and create a new PR. |
Right, I tried to ensure that we'd still do the right thing in case there were any missed code paths in the refactoring and to cover cases like this by asserting so we could get them updated. |
Has anyone looked at codegen differences for ARM64? |
Improved magic number calculation based on "Faster Unsigned Division by Constants" by ridiculous_fish.
https://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
https://github.com/ridiculousfish/libdivide/blob/master/doc/divide_by_constants_codegen_reference.c
Raw performance is 12-40% faster for "uncooperative" divisors and register pressure is also reduced.
DateTime.Year
for example is 38% faster.Perf stats on a i7-7700K CPU 4.20GHz: