-
-
Notifications
You must be signed in to change notification settings - Fork 14.2k
Open
Labels
- A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
- A-autovectorization — Area: Autovectorization, which can impact perf or code size.
- C-bug — Category: This is a bug.
- E-needs-test — Call for participation: An issue has been fixed and does not reproduce, but no test has been added.
- I-slow — Issue: Problems and improvements with respect to performance of generated code.
Description
I tried this code on godbolt.org :
// Element type for the reproducer: eight independent `u8` fields.
// `Default` is derived so an array of these can be built from
// `Data::default()`; `Clone`/`Copy` are needed by the `[value; N]`
// array-repeat expression used to construct that array.
#[derive(Default, Clone, Copy)]
struct Data {
a: u8,
b: u8,
c: u8,
d: u8,
e: u8,
f: u8,
g: u8,
h: u8,
}
// Reproducer variant in which the update loop lives INSIDE a closure.
// `#[inline(never)]` stops this function from being inlined into its caller.
#[inline(never)]
fn for_in_closure() {
// Local array of 5000 default-initialized elements.
let mut v = [Data::default(); 5000];
// The closure borrows `v` mutably and performs the whole loop itself.
let mut closure = || {
for item in &mut v {
// Add 1 to each of the eight byte fields of the current element.
item.a += 1;
item.b += 1;
item.c += 1;
item.d += 1;
item.e += 1;
item.f += 1;
item.g += 1;
item.h += 1;
}
};
// Run the loop exactly once by invoking the closure.
closure();
}

The generated assembly code is as follows. The additions in the for loop are compiled into four `inc` instructions and one `psubb` instruction. Is there any particular reason why these additions cannot be compiled into a single SSE addition?
# Compiler output (Intel syntax) for the variant with the loop inside the closure.
# rbx = element index (0..5000); the array lives at [rsp], 8 bytes per element.
example::for_in_closure:
# rbx is used as the loop counter below, so it is saved here and restored at exit.
push rbx
# r11 = rsp - 36864 (9 * 4096): lower bound for the page-stepping loop.
mov r11, rsp
sub r11, 36864
.LBB0_1:
# Grow the stack 4096 bytes at a time, storing to each new step
# (stack-probe pattern) until rsp reaches r11.
sub rsp, 4096
mov qword ptr [rsp], 0
cmp rsp, r11
jne .LBB0_1
# Remaining 3136 bytes: 36864 + 3136 = 40000 = 5000 elements * 8 bytes.
sub rsp, 3136
# memset(rsp, 0, 40000): zero the whole array. rbx is cleared alongside the
# argument setup so the index starts at 0.
mov rdi, rsp
xor ebx, ebx
mov edx, 40000
xor esi, esi
call qword ptr [rip + memset@GOTPCREL]
# xmm0 = all ones, i.e. every byte is -1; subtracting it adds 1 to each byte.
pcmpeqd xmm0, xmm0
.LBB0_3:
# Byte at offset 0 of the element: scalar increment.
inc byte ptr [rsp + 8*rbx]
# Bytes at offsets 1..4: loaded as one dword, incremented together via
# psubb, and stored back.
movd xmm1, dword ptr [rsp + 8*rbx + 1]
psubb xmm1, xmm0
movd dword ptr [rsp + 8*rbx + 1], xmm1
# Bytes at offsets 5, 6 and 7: three more scalar increments.
inc byte ptr [rsp + 8*rbx + 5]
inc byte ptr [rsp + 8*rbx + 6]
inc byte ptr [rsp + 8*rbx + 7]
# index += 1; loop until all 5000 elements have been processed.
lea rax, [rbx + 1]
mov rbx, rax
cmp rax, 5000
jne .LBB0_3
# Release the 40000-byte array and restore the saved rbx.
add rsp, 40000
pop rbx
ret

Instead, if you move the for loop outside the closure, the for loop will be unrolled into five `psubb` instructions.
# Compiler output (Intel syntax) for the variant with the loop outside the closure.
# The array base is rsp + 8; rax is the index of the LAST of the five elements
# handled per iteration, so [rsp + 8*rax + d] addresses element rax + (d - 8)/8.
example::for_out_closure:
# r11 = rsp - 36864 (9 * 4096): lower bound for the page-stepping loop.
mov r11, rsp
sub r11, 36864
.LBB1_1:
# Grow the stack 4096 bytes at a time, storing to each new step
# (stack-probe pattern) until rsp reaches r11.
sub rsp, 4096
mov qword ptr [rsp], 0
cmp rsp, r11
jne .LBB1_1
# Remaining 3144 bytes: 36864 + 3144 = 40008 = 40000-byte array + 8 extra bytes.
sub rsp, 3144
# memset(rsp + 8, 0, 40000): zero the whole array.
lea rdi, [rsp + 8]
mov edx, 40000
xor esi, esi
call qword ptr [rip + memset@GOTPCREL]
# Start at index 4 so the first iteration covers elements 0..4.
mov eax, 4
# xmm0 = all ones, i.e. every byte is -1; subtracting it adds 1 to each byte.
pcmpeqd xmm0, xmm0
.LBB1_3:
# Elements rax-4 and rax-3: each 8-byte element is loaded and incremented
# with one psubb, then the two are packed and stored as a single 16-byte write.
movq xmm1, qword ptr [rsp + 8*rax - 24]
psubb xmm1, xmm0
movq xmm2, qword ptr [rsp + 8*rax - 16]
psubb xmm2, xmm0
punpcklqdq xmm1, xmm2
movdqu xmmword ptr [rsp + 8*rax - 24], xmm1
# Elements rax-2 and rax-1: same load / psubb / pack / 16-byte store sequence.
movq xmm1, qword ptr [rsp + 8*rax - 8]
psubb xmm1, xmm0
movq xmm2, qword ptr [rsp + 8*rax]
psubb xmm2, xmm0
punpcklqdq xmm1, xmm2
movdqu xmmword ptr [rsp + 8*rax - 8], xmm1
# Element rax: the fifth element is handled alone with an 8-byte store.
movq xmm1, qword ptr [rsp + 8*rax + 8]
psubb xmm1, xmm0
movq qword ptr [rsp + 8*rax + 8], xmm1
# Advance by five elements; rax runs 4, 9, ..., 4999 and the loop exits at 5004
# (1000 iterations * 5 elements = 5000).
add rax, 5
cmp rax, 5004
jne .LBB1_3
# Release the 40008-byte frame.
add rsp, 40008
ret

The complete test code is available here: https://godbolt.org/z/YoMaWWzW7
Metadata
Metadata
Assignees
Labels
- A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
- A-autovectorization — Area: Autovectorization, which can impact perf or code size.
- C-bug — Category: This is a bug.
- E-needs-test — Call for participation: An issue has been fixed and does not reproduce, but no test has been added.
- I-slow — Issue: Problems and improvements with respect to performance of generated code.