Skip to content

Commit 11c9795

Browse files
authored
Unrolled build for #152020
Rollup merge of #152020 - Sa4dUs:offload-remove-dummy-loads, r=ZuseZ4 — "Remove dummy loads on offload codegen". The current logic generates two dummy loads to prevent some globals from being optimized away. This blocks memtransfer loop-hoisting optimizations, so it's time to remove them. r? @ZuseZ4
2 parents 9f4b56a + 212c8c3 commit 11c9795

File tree

2 files changed

+8
-22
lines changed

2 files changed

+8
-22
lines changed

‎compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs‎

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,6 @@ pub(crate) struct OffloadKernelGlobals<'ll> {
347347
pub offload_sizes: &'ll llvm::Value,
348348
pub memtransfer_types: &'ll llvm::Value,
349349
pub region_id: &'ll llvm::Value,
350-
pub offload_entry: &'ll llvm::Value,
351350
}
352351

353352
fn gen_tgt_data_mappers<'ll>(
@@ -468,8 +467,12 @@ pub(crate) fn gen_define_handling<'ll>(
468467
let c_section_name = CString::new("llvm_offload_entries").unwrap();
469468
llvm::set_section(offload_entry, &c_section_name);
470469

471-
let result =
472-
OffloadKernelGlobals { offload_sizes, memtransfer_types, region_id, offload_entry };
470+
cx.add_compiler_used_global(offload_entry);
471+
472+
let result = OffloadKernelGlobals { offload_sizes, memtransfer_types, region_id };
473+
474+
// FIXME(Sa4dUs): use this global for constant offload sizes
475+
cx.add_compiler_used_global(result.offload_sizes);
473476

474477
cx.offload_kernel_cache.borrow_mut().insert(symbol, result);
475478

@@ -532,8 +535,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
532535
offload_dims: &OffloadKernelDims<'ll>,
533536
) {
534537
let cx = builder.cx;
535-
let OffloadKernelGlobals { offload_sizes, offload_entry, memtransfer_types, region_id } =
536-
offload_data;
538+
let OffloadKernelGlobals { memtransfer_types, region_id, .. } = offload_data;
537539
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
538540
offload_dims;
539541

@@ -548,20 +550,6 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
548550
let num_args = types.len() as u64;
549551
let bb = builder.llbb();
550552

551-
// FIXME(Sa4dUs): dummy loads are a temp workaround, we should find a proper way to prevent these
552-
// variables from being optimized away
553-
for val in [offload_sizes, offload_entry] {
554-
unsafe {
555-
let dummy = llvm::LLVMBuildLoad2(
556-
&builder.llbuilder,
557-
llvm::LLVMTypeOf(val),
558-
val,
559-
b"dummy\0".as_ptr() as *const _,
560-
);
561-
llvm::LLVMSetVolatile(dummy, llvm::TRUE);
562-
}
563-
}
564-
565553
// Step 0)
566554
unsafe {
567555
llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, builder.llfn());

‎tests/codegen-llvm/gpu_offload/gpu_host.rs‎

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,7 @@ pub fn _kernel_1(x: &mut [f32; 256]) {
5555
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
5656
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
5757
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
58-
// CHECK: %dummy = load volatile ptr, ptr @.offload_sizes.[[K]], align 8
59-
// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry.[[K]], align 8
60-
// CHECK-NEXT: call void @__tgt_init_all_rtls()
58+
// CHECK: call void @__tgt_init_all_rtls()
6159
// CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8
6260
// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
6361
// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8

0 commit comments

Comments
 (0)