Skip to content

Commit 077ff9a

Browse files
committed
[X86] Don't always separate conditions in (br (and/or cond0, cond1)) into separate branches
It makes sense to split if the cost of computing `cond1` is high (proportionally to how likely `cond0` is), but it doesn't really make sense to introduce a second branch if its only a few instructions. Splitting can also get in the way of potentially folding patterns. This patch introduces some logic to try to check if the cost of computing `cond1` is relatively low, and if so don't split the branches. Modest improvement on clang bootstrap build: https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=cycles Average stage2-O3: 0.59% Improvement (cycles) Average stage2-O0-g: 1.20% Improvement (cycles) Likewise on llvm-test-suite on SKX saw a net 0.84% improvement (cycles) There is also a modest compile time improvement with this patch: https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=instructions%3Au Note that the stage2 instruction count increases is expected, this patch trades instructions for decreasing branch-misses (which is proportionately lower): https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=branch-misses NB: This will also likely help for APX targets with the new `CCMP` and `CTEST` instructions.
1 parent 5daea7c commit 077ff9a

28 files changed

+920
-740
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

+36
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,42 @@ class TargetLoweringBase {
596596
/// avoided.
597597
bool isJumpExpensive() const { return JumpIsExpensive; }
598598

599+
// Costs parameters used by
600+
// SelectionDAGBuilder::shouldKeepJumpConditionsTogether.
601+
// shouldKeepJumpConditionsTogether will use these parameter value to
602+
// determine if two conditions in the form `br (and/or cond1, cond2)` should
603+
// be split into two branches or left as one.
604+
//
605+
// BaseCost is the cost threshold (in latency). If the estimated latency of
606+
// computing both `cond1` and `cond2` is below the cost of just computing
607+
// `cond1` + BaseCost, the two conditions will be kept together. Otherwise
608+
// they will be split.
609+
//
610+
// LikelyBias increases BaseCost if branch probability info indicates that it
611+
// is likely that both `cond1` and `cond2` will be computed.
612+
//
613+
// UnlikelyBias decreases BaseCost if branch probability info indicates that
614+
// it is likely that both `cond1` and `cond2` will be computed.
615+
//
616+
// Set any field to -1 to make it ignored (setting BaseCost to -1 results in
617+
// `shouldKeepJumpConditionsTogether` always returning false).
618+
struct CondMergingParams {
619+
int BaseCost;
620+
int LikelyBias;
621+
int UnlikelyBias;
622+
};
623+
// Return params for deciding if we should keep two branch conditions merged
624+
// or split them into two separate branches.
625+
// Arg0: The binary op joining the two conditions (and/or).
626+
// Arg1: The first condition (cond1)
627+
// Arg2: The second condition (cond2)
628+
virtual CondMergingParams
629+
getJumpConditionMergingParams(Instruction::BinaryOps, const Value *,
630+
const Value *) const {
631+
// -1 will always result in splitting.
632+
return {-1, -1, -1};
633+
}
634+
599635
/// Return true if selects are only cheaper than branches if the branch is
600636
/// unlikely to be predicted right.
601637
bool isPredictableSelectExpensive() const {

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

+150-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "llvm/Analysis/Loads.h"
2727
#include "llvm/Analysis/MemoryLocation.h"
2828
#include "llvm/Analysis/TargetLibraryInfo.h"
29+
#include "llvm/Analysis/TargetTransformInfo.h"
2930
#include "llvm/Analysis/ValueTracking.h"
3031
#include "llvm/Analysis/VectorUtils.h"
3132
#include "llvm/CodeGen/Analysis.h"
@@ -93,6 +94,7 @@
9394
#include "llvm/Support/CommandLine.h"
9495
#include "llvm/Support/Compiler.h"
9596
#include "llvm/Support/Debug.h"
97+
#include "llvm/Support/InstructionCost.h"
9698
#include "llvm/Support/MathExtras.h"
9799
#include "llvm/Support/raw_ostream.h"
98100
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -2446,6 +2448,147 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
24462448
SL->SwitchCases.push_back(CB);
24472449
}
24482450

2451+
// Collect dependencies on V recursively. This is used for the cost analysis in
2452+
// `shouldKeepJumpConditionsTogether`.
2453+
static bool
2454+
collectInstructionDeps(SmallPtrSet<const Instruction *, 8> *Deps,
2455+
const Value *V,
2456+
SmallPtrSet<const Instruction *, 8> *Necessary = nullptr,
2457+
unsigned Depth = 0) {
2458+
// Return false if we have an incomplete count.
2459+
if (Depth >= SelectionDAG::MaxRecursionDepth)
2460+
return false;
2461+
2462+
auto *I = dyn_cast<Instruction>(V);
2463+
if (I == nullptr)
2464+
return true;
2465+
2466+
if (Necessary != nullptr) {
2467+
// This instruction is necessary for the other side of the condition so
2468+
// don't count it.
2469+
if (Necessary->contains(I))
2470+
return true;
2471+
}
2472+
2473+
// Already added this dep.
2474+
if (!Deps->insert(I).second)
2475+
return true;
2476+
2477+
for (unsigned OpIdx = 0, E = I->getNumOperands(); OpIdx < E; ++OpIdx)
2478+
if (!collectInstructionDeps(Deps, I->getOperand(OpIdx), Necessary,
2479+
Depth + 1))
2480+
return false;
2481+
return true;
2482+
}
2483+
2484+
bool SelectionDAGBuilder::shouldKeepJumpConditionsTogether(
2485+
const FunctionLoweringInfo &FuncInfo, const BranchInst &I,
2486+
Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs,
2487+
TargetLoweringBase::CondMergingParams Params) const {
2488+
if (I.getNumSuccessors() != 2)
2489+
return false;
2490+
2491+
if (Params.BaseCost < 0)
2492+
return false;
2493+
2494+
// Baseline cost.
2495+
InstructionCost CostThresh = Params.BaseCost;
2496+
2497+
BranchProbabilityInfo *BPI = nullptr;
2498+
if (Params.LikelyBias || Params.UnlikelyBias)
2499+
BPI = FuncInfo.BPI;
2500+
if (BPI != nullptr) {
2501+
// See if we are either likely to get an early out or compute both lhs/rhs
2502+
// of the condition.
2503+
BasicBlock *IfFalse = I.getSuccessor(0);
2504+
BasicBlock *IfTrue = I.getSuccessor(1);
2505+
2506+
std::optional<bool> Likely;
2507+
if (BPI->isEdgeHot(I.getParent(), IfTrue))
2508+
Likely = true;
2509+
else if (BPI->isEdgeHot(I.getParent(), IfFalse))
2510+
Likely = false;
2511+
2512+
if (Likely) {
2513+
if (Opc == (*Likely ? Instruction::And : Instruction::Or))
2514+
// Its likely we will have to compute both lhs and rhs of condition
2515+
CostThresh += Params.LikelyBias;
2516+
else {
2517+
if (Params.UnlikelyBias < 0)
2518+
return false;
2519+
// Its likely we will get an early out.
2520+
CostThresh -= Params.UnlikelyBias;
2521+
}
2522+
}
2523+
}
2524+
2525+
if (CostThresh <= 0)
2526+
return false;
2527+
2528+
// Collect "all" instructions that lhs condition is dependent on.
2529+
SmallPtrSet<const Instruction *, 8> LhsDeps, RhsDeps;
2530+
collectInstructionDeps(&LhsDeps, Lhs);
2531+
// Collect "all" instructions that rhs condition is dependent on AND are
2532+
// dependencies of lhs. This gives us an estimate on which instructions we
2533+
// stand to save by splitting the condition.
2534+
if (!collectInstructionDeps(&RhsDeps, Rhs, &LhsDeps))
2535+
return false;
2536+
// Add the compare instruction itself unless its a dependency on the LHS.
2537+
if (const auto *RhsI = dyn_cast<Instruction>(Rhs))
2538+
if (!LhsDeps.contains(RhsI))
2539+
RhsDeps.insert(RhsI);
2540+
2541+
const auto &TLI = DAG.getTargetLoweringInfo();
2542+
const auto &TTI =
2543+
TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
2544+
2545+
InstructionCost CostOfIncluding = 0;
2546+
// See if this instruction will need to computed independently of whether RHS
2547+
// is.
2548+
auto ShouldCountInsn = [&RhsDeps](const Instruction *Ins) {
2549+
for (const auto *U : Ins->users()) {
2550+
// If user is independent of RHS calculation we don't need to count it.
2551+
if (auto *UIns = dyn_cast<Instruction>(U))
2552+
if (!RhsDeps.contains(UIns))
2553+
return false;
2554+
}
2555+
return true;
2556+
};
2557+
2558+
// Prune instructions from RHS Deps that are dependencies of unrelated
2559+
// instructions. The value (SelectionDAG::MaxRecursionDepth) is fairly
2560+
// arbitrary and just meant to cap the how much time we spend in the pruning
2561+
// loop. Its highly unlikely to come into affect.
2562+
const unsigned MaxPruneIters = SelectionDAG::MaxRecursionDepth;
2563+
// Stop after a certain point. No incorrectness from including too many
2564+
// instructions.
2565+
for (unsigned PruneIters = 0; PruneIters < MaxPruneIters; ++PruneIters) {
2566+
const Instruction *ToDrop = nullptr;
2567+
for (const auto *Ins : RhsDeps) {
2568+
if (!ShouldCountInsn(Ins)) {
2569+
ToDrop = Ins;
2570+
break;
2571+
}
2572+
}
2573+
if (ToDrop == nullptr)
2574+
break;
2575+
RhsDeps.erase(ToDrop);
2576+
}
2577+
2578+
for (const auto *Ins : RhsDeps) {
2579+
// Finally accumulate latency that we can only attribute to computing the
2580+
// RHS condition. Use latency because we are essentially trying to calculate
2581+
// the cost of the dependency chain.
2582+
// Possible TODO: We could try to estimate ILP and make this more precise.
2583+
CostOfIncluding +=
2584+
TTI.getInstructionCost(Ins, TargetTransformInfo::TCK_Latency);
2585+
2586+
if (CostOfIncluding > CostThresh)
2587+
return false;
2588+
}
2589+
return true;
2590+
}
2591+
24492592
void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
24502593
MachineBasicBlock *TBB,
24512594
MachineBasicBlock *FBB,
@@ -2660,8 +2803,13 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
26602803
else if (match(BOp, m_LogicalOr(m_Value(BOp0), m_Value(BOp1))))
26612804
Opcode = Instruction::Or;
26622805

2663-
if (Opcode && !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
2664-
match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) {
2806+
if (Opcode &&
2807+
!(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
2808+
match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value()))) &&
2809+
!shouldKeepJumpConditionsTogether(
2810+
FuncInfo, I, Opcode, BOp0, BOp1,
2811+
DAG.getTargetLoweringInfo().getJumpConditionMergingParams(
2812+
Opcode, BOp0, BOp1))) {
26652813
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode,
26662814
getEdgeProbability(BrMBB, Succ0MBB),
26672815
getEdgeProbability(BrMBB, Succ1MBB),

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h

+5
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,11 @@ class SelectionDAGBuilder {
385385
N = NewN;
386386
}
387387

388+
bool shouldKeepJumpConditionsTogether(
389+
const FunctionLoweringInfo &FuncInfo, const BranchInst &I,
390+
Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs,
391+
TargetLoweringBase::CondMergingParams Params) const;
392+
388393
void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
389394
MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
390395
MachineBasicBlock *SwitchBB,

llvm/lib/Target/X86/X86ISelLowering.cpp

+49
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,37 @@ static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
7777
"alignment set by x86-experimental-pref-loop-alignment."),
7878
cl::Hidden);
7979

80+
static cl::opt<int> BrMergingBaseCostThresh(
81+
"x86-br-merging-base-cost", cl::init(1),
82+
cl::desc(
83+
"Sets the cost threshold for when multiple conditionals will be merged "
84+
"into one branch versus be split in multiple branches. Merging "
85+
"conditionals saves branches at the cost of additional instructions. "
86+
"This value sets the instruction cost limit, below which conditionals "
87+
"will be merged, and above which conditionals will be split."),
88+
cl::Hidden);
89+
90+
static cl::opt<int> BrMergingLikelyBias(
91+
"x86-br-merging-likely-bias", cl::init(0),
92+
cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
93+
"that all conditionals will be executed. For example for merging "
94+
"the conditionals (a == b && c > d), if its known that a == b is "
95+
"likely, then it is likely that if the conditionals are split "
96+
"both sides will be executed, so it may be desirable to increase "
97+
"the instruction cost threshold."),
98+
cl::Hidden);
99+
100+
static cl::opt<int> BrMergingUnlikelyBias(
101+
"x86-br-merging-unlikely-bias", cl::init(1),
102+
cl::desc(
103+
"Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
104+
"that all conditionals will be executed. For example for merging "
105+
"the conditionals (a == b && c > d), if its known that a == b is "
106+
"unlikely, then it is unlikely that if the conditionals are split "
107+
"both sides will be executed, so it may be desirable to decrease "
108+
"the instruction cost threshold."),
109+
cl::Hidden);
110+
80111
static cl::opt<bool> MulConstantOptimization(
81112
"mul-constant-optimization", cl::init(true),
82113
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -3339,6 +3370,24 @@ unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand(
33393370
return ISD::SRL;
33403371
}
33413372

3373+
TargetLoweringBase::CondMergingParams
3374+
X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
3375+
const Value *Lhs,
3376+
const Value *Rhs) const {
3377+
using namespace llvm::PatternMatch;
3378+
int BaseCost = BrMergingBaseCostThresh.getValue();
3379+
// a == b && a == c is a fast pattern on x86.
3380+
ICmpInst::Predicate Pred;
3381+
if (BaseCost >= 0 && Opc == Instruction::And &&
3382+
match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3383+
Pred == ICmpInst::ICMP_EQ &&
3384+
match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3385+
Pred == ICmpInst::ICMP_EQ)
3386+
BaseCost += 1;
3387+
return {BaseCost, BrMergingLikelyBias.getValue(),
3388+
BrMergingUnlikelyBias.getValue()};
3389+
}
3390+
33423391
bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
33433392
return N->getOpcode() != ISD::FP_EXTEND;
33443393
}

llvm/lib/Target/X86/X86ISelLowering.h

+4
Original file line numberDiff line numberDiff line change
@@ -1150,6 +1150,10 @@ namespace llvm {
11501150

11511151
bool preferScalarizeSplat(SDNode *N) const override;
11521152

1153+
CondMergingParams
1154+
getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs,
1155+
const Value *Rhs) const override;
1156+
11531157
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
11541158
CombineLevel Level) const override;
11551159

llvm/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll

+6-5
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@ define i1 @loadAndRLEsource_no_exit_2E_1_label_2E_0(i32 %tmp.21.reload, i32 %tmp
1818
; CHECK-NEXT: movl _block, %esi
1919
; CHECK-NEXT: movb %al, 1(%esi,%edx)
2020
; CHECK-NEXT: cmpl %ecx, _last
21-
; CHECK-NEXT: jge LBB0_3
22-
; CHECK-NEXT: ## %bb.1: ## %label.0
21+
; CHECK-NEXT: setl %cl
2322
; CHECK-NEXT: cmpl $257, %eax ## imm = 0x101
24-
; CHECK-NEXT: je LBB0_3
25-
; CHECK-NEXT: ## %bb.2: ## %label.0.no_exit.1_crit_edge.exitStub
23+
; CHECK-NEXT: setne %al
24+
; CHECK-NEXT: testb %al, %cl
25+
; CHECK-NEXT: je LBB0_2
26+
; CHECK-NEXT: ## %bb.1: ## %label.0.no_exit.1_crit_edge.exitStub
2627
; CHECK-NEXT: movb $1, %al
2728
; CHECK-NEXT: popl %esi
2829
; CHECK-NEXT: retl
29-
; CHECK-NEXT: LBB0_3: ## %codeRepl5.exitStub
30+
; CHECK-NEXT: LBB0_2: ## %codeRepl5.exitStub
3031
; CHECK-NEXT: xorl %eax, %eax
3132
; CHECK-NEXT: popl %esi
3233
; CHECK-NEXT: retl

0 commit comments

Comments
 (0)