From 8c943eb054d6c641996920e73065a35178ba2a2d Mon Sep 17 00:00:00 2001 From: gingerBill Date: Tue, 1 Jun 2021 10:51:54 +0100 Subject: [PATCH] Make inline array arithmetic use `load+extractvalue` rather than `getelementptr+load` to give the optimizer a better hint for vectorization --- src/llvm_backend.cpp | 49 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp index 431e1429c7c..eaa621291b7 100644 --- a/src/llvm_backend.cpp +++ b/src/llvm_backend.cpp @@ -6863,20 +6863,46 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r lhs = lb_emit_conv(p, lhs, type); rhs = lb_emit_conv(p, rhs, type); - lbValue x = lb_address_from_load_or_generate_local(p, lhs); - lbValue y = lb_address_from_load_or_generate_local(p, rhs); - GB_ASSERT(is_type_array(type)); Type *elem_type = base_array_type(type); - lbAddr res = lb_add_local_generated(p, type, false); - i64 count = base_type(type)->Array.count; bool inline_array_arith = type_size_of(type) <= build_context.max_align; if (inline_array_arith) { #if 1 + #if 1 + unsigned n = cast(unsigned)count; + auto dst_ptrs = array_make(temporary_allocator(), count); + + auto a_loads = array_make(temporary_allocator(), count); + auto b_loads = array_make(temporary_allocator(), count); + auto c_ops = array_make(temporary_allocator(), count); + + for (unsigned i = 0; i < n; i++) { + a_loads[i].value = LLVMBuildExtractValue(p->builder, lhs.value, i, ""); + a_loads[i].type = elem_type; + } + for (unsigned i = 0; i < n; i++) { + b_loads[i].value = LLVMBuildExtractValue(p->builder, rhs.value, i, ""); + b_loads[i].type = elem_type; + } + for (unsigned i = 0; i < n; i++) { + c_ops[i] = lb_emit_arith(p, op, a_loads[i], b_loads[i], elem_type); + } + + lbAddr res = lb_add_local_generated(p, type, false); + for (unsigned i = 0; i < n; i++) { + dst_ptrs[i] = lb_emit_array_epi(p, res.addr, i); + } + for (unsigned i = 0; i < n; i++) { + lb_emit_store(p, dst_ptrs[i], c_ops[i]); + } + #else + lbValue x = lb_address_from_load_or_generate_local(p, lhs); + lbValue y = lb_address_from_load_or_generate_local(p, rhs); + auto a_ptrs = array_make(temporary_allocator(), count); auto b_ptrs = array_make(temporary_allocator(), count); auto dst_ptrs = array_make(temporary_allocator(), count); @@ -6901,12 +6927,14 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r c_ops[i] = lb_emit_arith(p, op, a_loads[i], b_loads[i], elem_type); } + lbAddr res = lb_add_local_generated(p, type, false); for (i64 i = 0; i < count; i++) { dst_ptrs[i] = lb_emit_array_epi(p, res.addr, i); } for (i64 i = 0; i < count; i++) { lb_emit_store(p, dst_ptrs[i], c_ops[i]); } + #endif #else for (i64 i = 0; i < count; i++) { lbValue a_ptr = lb_emit_array_epi(p, x, i); @@ -6919,7 +6947,14 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r lb_emit_store(p, dst_ptr, c); } #endif + + return lb_addr_load(p, res); } else { + lbValue x = lb_address_from_load_or_generate_local(p, lhs); + lbValue y = lb_address_from_load_or_generate_local(p, rhs); + + lbAddr res = lb_add_local_generated(p, type, false); + auto loop_data = lb_loop_start(p, count, t_i32); lbValue a_ptr = lb_emit_array_ep(p, x, loop_data.idx); @@ -6932,9 +6967,9 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r lb_emit_store(p, dst_ptr, c); lb_loop_end(p, loop_data); - } - return lb_addr_load(p, res); + return lb_addr_load(p, res); + } }