; ModuleID = 'small.ll' source_filename = "meh.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @a = common global i16 0, align 2 @b = common global [1 x i16] zeroinitializer, align 2 ; Function Attrs: nounwind uwtable define i32 @main() #0 { entry: %0 = load i16, i16* @a, align 2, !tbaa !1 %conv = sext i16 %0 to i32 %neg = xor i32 %conv, -1 %conv1 = trunc i32 %neg to i16 %conv3 = zext i16 %conv1 to i32 %cmp = icmp slt i32 %conv, %conv3 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry store i16 %conv1, i16* @a, align 2, !tbaa !1 br label %if.end if.end: ; preds = %if.then, %entry %c.0 = phi i16 [ %0, %if.then ], [ %conv1, %entry ] %1 = load i16, i16* @a, align 2, !tbaa !1 %tobool = icmp ne i16 %1, 0 br i1 %tobool, label %if.then8, label %if.end9 if.then8: ; preds = %if.end %idxprom = zext i16 %c.0 to i64 %arrayidx = getelementptr inbounds [1 x i16], [1 x i16]* @b, i64 0, i64 %idxprom %2 = load i16, i16* %arrayidx, align 2, !tbaa !1 store i16 %2, i16* @a, align 2, !tbaa !1 br label %if.end9 if.end9: ; preds = %if.then8, %if.end ret i32 0 } ; Function Attrs: argmemonly nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 ; Function Attrs: argmemonly nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } !llvm.ident = !{!0} !0 = !{!"clang version 5.0.0 (trunk 302523) (llvm/trunk 302532)"} !1 = !{!2, !2, i64 0} !2 = !{!"short", !3, i64 0} !3 = !{!"omnipotent char", !4, i64 0} !4 = !{!"Simple C/C++ TBAA"} Given the following IR, running jump-threading on it introduces a new PHI which operands have an order inconsistent with the one of the original PHI. [davide@cupiditate bin]$ ./opt -jump-threading prejump.ll -preserve-ll-uselistorder -S -o - | grep phi %1 = phi i16 [ %.pr, %if.endthread-pre-split ], [ %conv1, %if.then ] %c.0 = phi i16 [ %0, %if.then ], [ %conv1, %if.endthread-pre-split ] [davide@cupiditate bin]$ cat prejump.ll |grep phi %c.0 = phi i16 [ %0, %if.then ], [ %conv1, %entry ] Found out while looking at a report from Zhendong Su. Looking at this.
The issue is the same discussed with Dan. JumpThreading creates phis and add incoming edges using: // Substitute with Phis & remove. for (auto *Inst : reverse(ToRemove)) { if (!Inst->use_empty()) { PHINode *NewPN = PHINode::Create(Inst->getType(), 2); NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock); NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock); NewPN->insertBefore(InsertionPoint); Inst->replaceAllUsesWith(NewPN); } Inst->eraseFromParent(); and addIncoming() *always* append to the end of the PHI list: /// Add an incoming value to the end of the PHI list /// void addIncoming(Value *V, BasicBlock *BB) { if (getNumOperands() == ReservedSpace) growOperands(); // Get more space! // Initialize some new operands. setNumHungOffUseOperands(getNumOperands() + 1); setIncomingValue(getNumOperands() - 1, V); setIncomingBlock(getNumOperands() - 1, BB); } I don't see why we can't keep PHIs in sorted (predecessor) order.
One immediate concern with this is that order might actually matter here, and might be cause of non-determinism (I think). Every time we create a new PHI instruction and add incoming edges we rely on pointer value comparison and we're not guaranteed that they're always in the some order. I'm not sure there's a cheaper way of sorting. Thoughts?
We could use the same order as predecessors(BB) (from llvm/IR/CFG.h); that's based on the block's use-list.
Another "interesting" optimization that GCC does on this code. If both functions are marked as cold, it replaces the second with a jmp to the first. tinky(int): mov eax, edi mov edx, 36 .L2: imul eax, edi dec edx jne .L2 ret winky(int): jmp tinky(int) llvm should be able to do the same using merge identical functions, but the pass is currently disabled by default at any optimization level.
ops, sorry wrong PR, please ignore :)