Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 27 additions & 15 deletions llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ namespace {
class FrameRef {
MachineBasicBlock::iterator MI; // Instr referencing the frame
int64_t LocalOffset; // Local offset of the frame idx referenced
int64_t InstrOffset; // Offset of the instruction from the frame index
int FrameIdx; // The frame index

// Order reference instruction appears in program. Used to ensure
Expand All @@ -59,16 +60,20 @@ namespace {
unsigned Order;

public:
FrameRef(MachineInstr *I, int64_t Offset, int Idx, unsigned Ord) :
MI(I), LocalOffset(Offset), FrameIdx(Idx), Order(Ord) {}
FrameRef(MachineInstr *I, int64_t Offset, int64_t InstrOffset, int Idx,
unsigned Ord)
: MI(I), LocalOffset(Offset), InstrOffset(InstrOffset), FrameIdx(Idx),
Order(Ord) {}

bool operator<(const FrameRef &RHS) const {
return std::tie(LocalOffset, FrameIdx, Order) <
std::tie(RHS.LocalOffset, RHS.FrameIdx, RHS.Order);
return std::tuple(LocalOffset + InstrOffset, FrameIdx, Order) <
std::tuple(RHS.LocalOffset + RHS.InstrOffset, RHS.FrameIdx,
RHS.Order);
Comment on lines +69 to +71
Copy link

Copilot AI Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comparison operator uses std::tuple() which is a function call syntax, but this should be std::make_tuple() or std::tie(). Using std::tuple() without template arguments is not valid C++ syntax for creating a tuple.

Recommend changing to:

return std::make_tuple(LocalOffset + InstrOffset, FrameIdx, Order) <
       std::make_tuple(RHS.LocalOffset + RHS.InstrOffset, RHS.FrameIdx,
                       RHS.Order);

Or preferably use std::tie() which is more concise and clearer:

return std::tie(LocalOffset + InstrOffset, FrameIdx, Order) <
       std::tie(RHS.LocalOffset + RHS.InstrOffset, RHS.FrameIdx,
                RHS.Order);

Note: std::tie() cannot be used with expressions (like LocalOffset + InstrOffset), so std::make_tuple() is the correct choice here.

Suggested change
return std::tuple(LocalOffset + InstrOffset, FrameIdx, Order) <
std::tuple(RHS.LocalOffset + RHS.InstrOffset, RHS.FrameIdx,
RHS.Order);
return std::make_tuple(LocalOffset + InstrOffset, FrameIdx, Order) <
std::make_tuple(RHS.LocalOffset + RHS.InstrOffset, RHS.FrameIdx,
RHS.Order);

Copilot uses AI. Check for mistakes.
}

MachineBasicBlock::iterator getMachineInstr() const { return MI; }
int64_t getLocalOffset() const { return LocalOffset; }
int64_t getInstrOffset() const { return InstrOffset; }
int getFrameIndex() const { return FrameIdx; }
};

Expand Down Expand Up @@ -335,20 +340,27 @@ bool LocalStackSlotImpl::insertFrameReferenceRegisters(MachineFunction &Fn) {
// than that, but the increased register pressure makes that a
// tricky thing to balance. Investigate if re-materializing these
// becomes an issue.
for (const MachineOperand &MO : MI.operands()) {
for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd;
++OpIdx) {
const MachineOperand &MO = MI.getOperand(OpIdx);
// Consider replacing all frame index operands that reference
// an object allocated in the local block.
if (MO.isFI()) {
// Don't try this with values not in the local block.
if (!MFI.isObjectPreAllocated(MO.getIndex()))
break;
int Idx = MO.getIndex();
int64_t LocalOffset = LocalOffsets[Idx];
if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
break;
FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx, Order++));
if (!MO.isFI())
continue;

int FrameIdx = MO.getIndex();
// Don't try this with values not in the local block.
if (!MFI.isObjectPreAllocated(FrameIdx))
break;

int64_t LocalOffset = LocalOffsets[FrameIdx];
if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
break;
}

int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, OpIdx);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wondering if needsFrameBaseReg needs to account for the InstrOffset, but it seems the AMDGPU implementation at least does. Do other targets?

I'm also wondering if it's worthwhile storing InstrOffset, or just adjusting (and renaming) the offset stored in FrameReferenceInsns?

FrameReferenceInsns.emplace_back(&MI, LocalOffset, InstrOffset,
FrameIdx, Order++);
break;
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,15 @@ machineFunctionInfo:
body: |
bb.0:
; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256
; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 100
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec
; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 156, [[V_ADD_U32_e64_]], 0, implicit $exec
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec
; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 412, [[V_ADD_U32_e64_]], 0, implicit $exec
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
; GFX10-NEXT: SI_RETURN
;
; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
Expand Down
11 changes: 6 additions & 5 deletions llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
Original file line number Diff line number Diff line change
Expand Up @@ -294,25 +294,26 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000
; FLATSCR-NEXT: s_add_i32 s1, s0, 0x4000
; FLATSCR-NEXT: s_add_i32 s0, s0, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
; FLATSCR-NEXT: s_addk_i32 s0, 0x4000
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
; FLATSCR-NEXT: s_movk_i32 s0, 0x4000
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc
Expand Down
Loading
Loading