Skip to content

Commit b8e2f05

Browse files
committed
[X86] Don't always seperate conditions in (br (and/or cond0, cond1)) into seperate branches
It makes sense to split if the cost of computing `cond1` is high (proportionally to how likely `cond0` is), but it doesn't really make sense to introduce a second branch if its only a few instructions. Splitting can also get in the way of potentially folding patterns. This patch introduces some logic to try to check if the cost of computing `cond1` is relatively low, and if so don't split the branches. Modest improvement on clang bootstrap build: https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=cycles Average stage2-O3: 0.59% Improvement (cycles) Average stage2-O0-g: 1.20% Improvement (cycles) Likewise on llvm-test-suite on SKX saw a net 0.84% improvement (cycles) There is also a modest compile time improvement with this patch: https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=instructions%3Au Note that the stage2 instruction count increases is expected, this patch trades instructions for decreasing branch-misses (which is proportionately lower): https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=branch-misses NB: This will also likely help for APX targets with the new `CCMP` and `CTEST` instructions.
1 parent 79ce933 commit b8e2f05

27 files changed

+907
-703
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

+7
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,13 @@ class TargetLoweringBase {
596596
/// avoided.
597597
bool isJumpExpensive() const { return JumpIsExpensive; }
598598

599+
virtual bool keepJumpConditionsTogether(const FunctionLoweringInfo &,
600+
const BranchInst &,
601+
Instruction::BinaryOps, const Value *,
602+
const Value *) const {
603+
return false;
604+
}
605+
599606
/// Return true if selects are only cheaper than branches if the branch is
600607
/// unlikely to be predicted right.
601608
bool isPredictableSelectExpensive() const {

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -2646,8 +2646,11 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
26462646
else if (match(BOp, m_LogicalOr(m_Value(BOp0), m_Value(BOp1))))
26472647
Opcode = Instruction::Or;
26482648

2649-
if (Opcode && !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
2650-
match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) {
2649+
if (Opcode &&
2650+
!(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
2651+
match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value()))) &&
2652+
!DAG.getTargetLoweringInfo().keepJumpConditionsTogether(
2653+
FuncInfo, I, Opcode, BOp0, BOp1)) {
26512654
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode,
26522655
getEdgeProbability(BrMBB, Succ0MBB),
26532656
getEdgeProbability(BrMBB, Succ1MBB),

llvm/lib/Target/X86/X86ISelLowering.cpp

+159
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@
2727
#include "llvm/ADT/StringExtras.h"
2828
#include "llvm/ADT/StringSwitch.h"
2929
#include "llvm/Analysis/BlockFrequencyInfo.h"
30+
#include "llvm/Analysis/BranchProbabilityInfo.h"
3031
#include "llvm/Analysis/ObjCARCUtil.h"
3132
#include "llvm/Analysis/ProfileSummaryInfo.h"
33+
#include "llvm/Analysis/TargetTransformInfo.h"
3234
#include "llvm/Analysis/VectorUtils.h"
35+
#include "llvm/CodeGen/FunctionLoweringInfo.h"
3336
#include "llvm/CodeGen/IntrinsicLowering.h"
3437
#include "llvm/CodeGen/MachineFrameInfo.h"
3538
#include "llvm/CodeGen/MachineFunction.h"
@@ -55,6 +58,7 @@
5558
#include "llvm/MC/MCContext.h"
5659
#include "llvm/MC/MCExpr.h"
5760
#include "llvm/MC/MCSymbol.h"
61+
#include "llvm/Support/BranchProbability.h"
5862
#include "llvm/Support/CommandLine.h"
5963
#include "llvm/Support/Debug.h"
6064
#include "llvm/Support/ErrorHandling.h"
@@ -77,6 +81,24 @@ static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
7781
"alignment set by x86-experimental-pref-loop-alignment."),
7882
cl::Hidden);
7983

84+
static cl::opt<int> BrMergingBaseCostThresh(
85+
"x86-cond-base", cl::init(1),
86+
cl::desc(
87+
"Base."),
88+
cl::Hidden);
89+
90+
static cl::opt<int> BrMergingLikelyBias(
91+
"x86-cond-likely-bias", cl::init(0),
92+
cl::desc(
93+
"Likely."),
94+
cl::Hidden);
95+
96+
static cl::opt<int> BrMergingUnlikelyBias(
97+
"x86-cond-unlikely-bias", cl::init(1),
98+
cl::desc(
99+
"Unlikely."),
100+
cl::Hidden);
101+
80102
static cl::opt<bool> MulConstantOptimization(
81103
"mul-constant-optimization", cl::init(true),
82104
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -3339,6 +3361,143 @@ unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand(
33393361
return ISD::SRL;
33403362
}
33413363

3364+
// Collect dependings on V recursively. This is used for the cost analysis in
3365+
// `keepJumpConditionsTogether`.
3366+
static bool
3367+
collectDeps(SmallPtrSet<const Instruction *, 8> *Deps, const Value *V,
3368+
SmallPtrSet<const Instruction *, 8> *Necessary = nullptr,
3369+
unsigned Depth = 0) {
3370+
// Return false if we have an incomplete count.
3371+
if (Depth >= 6)
3372+
return false;
3373+
3374+
auto *I = dyn_cast<Instruction>(V);
3375+
if (I == nullptr)
3376+
return true;
3377+
3378+
if (Necessary != nullptr) {
3379+
// This instruction is necessary for the other side of the condition so
3380+
// don't count it.
3381+
if (Necessary->contains(I))
3382+
return true;
3383+
}
3384+
3385+
// Already added this dep.
3386+
if (!Deps->insert(I).second)
3387+
return true;
3388+
3389+
for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx)
3390+
if (!collectDeps(Deps, I->getOperand(OpIdx), Necessary, Depth + 1))
3391+
return false;
3392+
return true;
3393+
}
3394+
3395+
bool X86TargetLowering::keepJumpConditionsTogether(
3396+
const FunctionLoweringInfo &FuncInfo, const BranchInst &I,
3397+
Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const {
3398+
using namespace llvm::PatternMatch;
3399+
if (I.getNumSuccessors() != 2)
3400+
return false;
3401+
3402+
// Baseline cost. This is properly arbitrary.
3403+
InstructionCost CostThresh = BrMergingBaseCostThresh.getValue();
3404+
if (BrMergingBaseCostThresh.getValue() < 0)
3405+
return false;
3406+
3407+
// a == b && c == d can be efficiently combined.
3408+
ICmpInst::Predicate Pred;
3409+
if (Opc == Instruction::And &&
3410+
match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3411+
Pred == ICmpInst::ICMP_EQ &&
3412+
match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3413+
Pred == ICmpInst::ICMP_EQ)
3414+
CostThresh += 1;
3415+
3416+
BranchProbabilityInfo *BPI = nullptr;
3417+
if (BrMergingLikelyBias.getValue() || BrMergingUnlikelyBias.getValue())
3418+
BPI = FuncInfo.BPI;
3419+
if (BPI != nullptr) {
3420+
BasicBlock *IfFalse = I.getSuccessor(0);
3421+
BasicBlock *IfTrue = I.getSuccessor(1);
3422+
3423+
std::optional<bool> Likely;
3424+
if (BPI->isEdgeHot(I.getParent(), IfTrue))
3425+
Likely = true;
3426+
else if (BPI->isEdgeHot(I.getParent(), IfFalse))
3427+
Likely = false;
3428+
3429+
if (Likely) {
3430+
if (Opc == (*Likely ? Instruction::And : Instruction::Or))
3431+
// Its likely we will have to compute both lhs and rhs of condition
3432+
CostThresh += BrMergingLikelyBias.getValue();
3433+
else {
3434+
// Its likely we will get an early out.
3435+
CostThresh -= BrMergingUnlikelyBias.getValue();
3436+
if (BrMergingUnlikelyBias.getValue() < 0) {
3437+
return false;
3438+
}
3439+
}
3440+
}
3441+
}
3442+
3443+
if (CostThresh <= 0)
3444+
return false;
3445+
3446+
// Collect "all" instructions that lhs condition is dependent on.
3447+
SmallPtrSet<const Instruction *, 8> LhsDeps, RhsDeps;
3448+
collectDeps(&LhsDeps, Lhs);
3449+
// Collect "all" instructions that rhs condition is dependent on AND are
3450+
// dependencies of lhs. This gives us an estimate on which instructions we
3451+
// stand to save by splitting the condition.
3452+
if (!collectDeps(&RhsDeps, Rhs, &LhsDeps))
3453+
return false;
3454+
// Add the compare instruction itself unless its a dependency on the LHS.
3455+
if (const auto *RhsI = dyn_cast<Instruction>(Rhs))
3456+
if (!LhsDeps.contains(RhsI))
3457+
RhsDeps.insert(RhsI);
3458+
const auto &TTI = getTargetMachine().getTargetTransformInfo(*I.getFunction());
3459+
3460+
InstructionCost CostOfIncluding = 0;
3461+
// See if this instruction will need to computed independently of whether RHS
3462+
// is.
3463+
auto ShouldCountInsn = [&RhsDeps](const Instruction *Ins) {
3464+
for (const auto *U : Ins->users()) {
3465+
// If user is independent of RHS calculation we don't need to count it.
3466+
if (auto *UIns = dyn_cast<Instruction>(U))
3467+
if (!RhsDeps.contains(UIns))
3468+
return false;
3469+
}
3470+
return true;
3471+
};
3472+
3473+
// Prune instructions from RHS Deps that are dependencies of unrelated
3474+
// instructions.
3475+
const unsigned MaxPruneIters = 8;
3476+
// Stop after a certain point. No incorrectness from including too many
3477+
// instructions.
3478+
for (unsigned PruneIters = 0; PruneIters < MaxPruneIters; ++PruneIters) {
3479+
const Instruction *ToDrop = nullptr;
3480+
for (const auto *Ins : RhsDeps) {
3481+
if (!ShouldCountInsn(Ins)) {
3482+
ToDrop = Ins;
3483+
break;
3484+
}
3485+
}
3486+
if (ToDrop == nullptr)
3487+
break;
3488+
RhsDeps.erase(ToDrop);
3489+
}
3490+
3491+
for (const auto *Ins : RhsDeps) {
3492+
CostOfIncluding +=
3493+
TTI.getInstructionCost(Ins, TargetTransformInfo::TCK_Latency);
3494+
3495+
if (CostOfIncluding > CostThresh)
3496+
return false;
3497+
}
3498+
return true;
3499+
}
3500+
33423501
bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
33433502
return N->getOpcode() != ISD::FP_EXTEND;
33443503
}

llvm/lib/Target/X86/X86ISelLowering.h

+5
Original file line numberDiff line numberDiff line change
@@ -1150,6 +1150,11 @@ namespace llvm {
11501150

11511151
bool preferScalarizeSplat(SDNode *N) const override;
11521152

1153+
bool keepJumpConditionsTogether(const FunctionLoweringInfo &,
1154+
const BranchInst &, Instruction::BinaryOps,
1155+
const Value *,
1156+
const Value *) const override;
1157+
11531158
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
11541159
CombineLevel Level) const override;
11551160

llvm/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll

+6-5
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@ define i1 @loadAndRLEsource_no_exit_2E_1_label_2E_0(i32 %tmp.21.reload, i32 %tmp
1818
; CHECK-NEXT: movl _block, %esi
1919
; CHECK-NEXT: movb %al, 1(%esi,%edx)
2020
; CHECK-NEXT: cmpl %ecx, _last
21-
; CHECK-NEXT: jge LBB0_3
22-
; CHECK-NEXT: ## %bb.1: ## %label.0
21+
; CHECK-NEXT: setl %cl
2322
; CHECK-NEXT: cmpl $257, %eax ## imm = 0x101
24-
; CHECK-NEXT: je LBB0_3
25-
; CHECK-NEXT: ## %bb.2: ## %label.0.no_exit.1_crit_edge.exitStub
23+
; CHECK-NEXT: setne %al
24+
; CHECK-NEXT: testb %al, %cl
25+
; CHECK-NEXT: je LBB0_2
26+
; CHECK-NEXT: ## %bb.1: ## %label.0.no_exit.1_crit_edge.exitStub
2627
; CHECK-NEXT: movb $1, %al
2728
; CHECK-NEXT: popl %esi
2829
; CHECK-NEXT: retl
29-
; CHECK-NEXT: LBB0_3: ## %codeRepl5.exitStub
30+
; CHECK-NEXT: LBB0_2: ## %codeRepl5.exitStub
3031
; CHECK-NEXT: xorl %eax, %eax
3132
; CHECK-NEXT: popl %esi
3233
; CHECK-NEXT: retl

0 commit comments

Comments
 (0)