Skip to content

Commit

Permalink
optimize constant length memorynew intrinsic (take 2) (#56847)
Browse files Browse the repository at this point in the history
replaces #55913 (the rebase was
more annoying than starting from scratch)
This allows the compiler to better understand what's going on for
`memorynew` with compile-time constant length, allowing for LLVM level
escape analysis in some cases. There is more room to grow this
(currently this only optimizes for fairly small Memory since bigger ones
would require writing some more LLVM code, and we probably want a size
limit on putting Memory on the stack to avoid stackoverflow. For larger
ones, we could potentially inline the free so the Memory doesn't have to
be swept by the GC, etc.)
```
julia> function g()
           m = Memory{Int}(undef, 2)
           for i in 1:2
              m[i] = i
           end
           m[1]+m[2]
       end

julia> @btime g()
  9.735 ns (1 allocation: 48 bytes) #before
  1.719 ns (0 allocations: 0 bytes) #after
```
  • Loading branch information
oscardssmith authored Jan 6, 2025
1 parent 10e20f1 commit 7801351
Show file tree
Hide file tree
Showing 6 changed files with 182 additions and 47 deletions.
141 changes: 105 additions & 36 deletions src/cgutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3321,7 +3321,7 @@ static Value *emit_genericmemoryptr(jl_codectx_t &ctx, Value *mem, const jl_data
LoadInst *LI = ctx.builder.CreateAlignedLoad(PPT, addr, Align(sizeof(char*)));
LI->setOrdering(AtomicOrdering::NotAtomic);
LI->setMetadata(LLVMContext::MD_nonnull, MDNode::get(ctx.builder.getContext(), None));
jl_aliasinfo_t aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const);
jl_aliasinfo_t aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
aliasinfo.decorateInst(LI);
Value *ptr = LI;
if (AS) {
Expand All @@ -3347,7 +3347,7 @@ static Value *emit_genericmemoryowner(jl_codectx_t &ctx, Value *t)
return emit_guarded_test(ctx, foreign, t, [&] {
addr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_jlgenericmemory, m, 1);
LoadInst *owner = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, addr, Align(sizeof(void*)));
jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const);
jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
ai.decorateInst(owner);
return ctx.builder.CreateSelect(ctx.builder.CreateIsNull(owner), t, owner);
});
Expand Down Expand Up @@ -4432,6 +4432,105 @@ static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b)
}
#endif

// Emit a call to jl_alloc_genericmemory_unchecked_func: allocate a
// GenericMemory of `cg_nbytes` payload bytes for memory type `cg_typ`,
// with no size validation (callers must have checked for overflow already).
// Returns the CallInst that produces the new memory object.
static auto *emit_genericmemory_unchecked(jl_codectx_t &ctx, Value *cg_nbytes, Value *cg_typ)
{
    auto ptls = get_current_ptls(ctx);
    auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
    auto *alloc = ctx.builder.CreateCall(call, { ptls, cg_nbytes, cg_typ});
    // Mirror the callee's attributes onto this call site, and additionally
    // promise the result is heap-aligned so LLVM can reason about the pointer.
    alloc->setAttributes(call->getAttributes());
    alloc->addRetAttr(Attribute::getWithAlignment(alloc->getContext(), Align(JL_HEAP_ALIGNMENT)));
    // NOTE(review): this decorates the *callee* (`call`) rather than the call
    // instruction `alloc`, unlike the alignment attribute on the previous
    // line — presumably intentional (the header is always dereferenceable at
    // every call site), but confirm it shouldn't be `alloc->addRetAttr`.
    call->addRetAttr(Attribute::getWithDereferenceableBytes(call->getContext(), sizeof(jl_genericmemory_t)));
    return alloc;
}

// Shared tail of Memory allocation codegen: give the allocation a debug name,
// store the element count into the GenericMemory header, and zero-initialize
// the data region when the element type requires it.
//   typ    - the Memory{T} datatype being allocated
//   alloc  - the freshly allocated GenericMemory value
//   nbytes - payload size in bytes (including union selector bytes)
//   nel    - element count to store into the length field
//   zi     - nonzero when elements must be zero-initialized (boxed pointers
//            and isbits-union selector bytes must not contain garbage)
static void emit_memory_zeroinit_and_stores(jl_codectx_t &ctx, jl_datatype_t *typ, Value* alloc, Value* nbytes, Value* nel, int zi)
{
    // Build a human-readable IR name like "Memory{Int}[]" for debugging.
    auto arg_typename = [&] JL_NOTSAFEPOINT {
        std::string type_str;
        auto eltype = jl_tparam1(typ);
        if (jl_is_datatype(eltype))
            type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
        else if (jl_is_uniontype(eltype))
            type_str = "Union";
        else
            type_str = "<unknown type>";
        return "Memory{" + type_str + "}[]";
    };
    setName(ctx.emission_context, alloc, arg_typename);
    // set length (jl_alloc_genericmemory_unchecked_func doesn't have it)
    Value *decay_alloc = decay_derived(ctx, alloc);
    Value *len_field = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 0);
    auto len_store = ctx.builder.CreateAlignedStore(nel, len_field, Align(sizeof(void*)));
    auto aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
    aliasinfo.decorateInst(len_store);
    // This fence keeps the length store from being deleted, which would be illegal
    ctx.builder.CreateFence(AtomicOrdering::Release, SyncScope::SingleThread);
    // zeroinit pointers and unions
    if (zi) {
        Value *memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
        auto *load = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, memory_ptr, Align(sizeof(void*)));
        aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
        aliasinfo.decorateInst(load);
        auto int8t = getInt8Ty(ctx.builder.getContext());
        ctx.builder.CreateMemSet(load, ConstantInt::get(int8t, 0), nbytes, Align(sizeof(void*)));
    }
    // (removed a redundant trailing `return;` — this is a void function)
}


// Emit code for `memorynew` when the element count `nel` is a compile-time
// constant. Small allocations are emitted inline as GC-pool objects (header
// and data in one object), which exposes them to LLVM-level escape analysis;
// larger ones fall back to the generic runtime allocator. `inst` is the
// type's singleton empty Memory instance, used when nel == 0.
static jl_cgval_t emit_const_len_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, size_t nel, jl_genericmemory_t *inst)
{
    if (nel == 0) {
        // Zero-length Memory: just reference the preallocated empty instance.
        Value *empty_alloc = track_pjlvalue(ctx, literal_pointer_val(ctx, (jl_value_t*)inst));
        return mark_julia_type(ctx, empty_alloc, true, typ);
    }
    const jl_datatype_layout_t *layout = ((jl_datatype_t*)typ)->layout;
    assert(((jl_datatype_t*)typ)->has_concrete_subtype && layout != NULL);
    size_t elsz = layout->size;
    int isboxed = layout->flags.arrayelem_isboxed;
    int isunion = layout->flags.arrayelem_isunion;
    int zi = ((jl_datatype_t*)typ)->zeroinit;
    if (isboxed)
        elsz = sizeof(void*);
    size_t nbytes;
    bool overflow = __builtin_mul_overflow(nel, elsz, &nbytes);
    if (isunion) {
        // an extra byte for each isbits union memory element, stored at m->ptr + m->length
        overflow |= __builtin_add_overflow(nbytes, nel, &nbytes);
    }
    // overflow if signed size is too big or nel is too big (the latter matters iff elsz==0)
    ssize_t tmp=1;
    overflow |= __builtin_add_overflow(nel, 1, &tmp) || __builtin_add_overflow(nbytes, 1, &tmp);
    if (overflow)
        emit_error(ctx, prepare_call(jlargumenterror_func), "invalid GenericMemory size: the number of elements is either negative or too large for system address width");
    // NOTE(review): codegen continues after emit_error — presumably emit_error
    // terminates the live code path so the values below are dead; confirm.

    auto T_size = ctx.types().T_size;
    auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
    auto cg_nbytes = ConstantInt::get(T_size, nbytes);
    auto cg_nel = ConstantInt::get(T_size, nel);
    // Total object size: header (padded to small-byte alignment) + payload.
    size_t tot = nbytes + LLT_ALIGN(sizeof(jl_genericmemory_t),JL_SMALL_BYTE_ALIGNMENT);
    // if allocation fits within GC pools
    int pooled = tot <= GC_MAX_SZCLASS;
    Value *alloc, *decay_alloc, *memory_ptr;
    jl_aliasinfo_t aliasinfo;
    if (pooled) {
        // Inline pool allocation: header and data are one object, so the data
        // pointer is just the object address offset past the header.
        alloc = emit_allocobj(ctx, tot, cg_typ, false, JL_SMALL_BYTE_ALIGNMENT);
        decay_alloc = decay_derived(ctx, alloc);
        memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
        setName(ctx.emission_context, memory_ptr, "memory_ptr");
        auto objref = emit_pointer_from_objref(ctx, alloc);
        Value *memory_data = emit_ptrgep(ctx, objref, JL_SMALL_BYTE_ALIGNMENT);
        auto *store = ctx.builder.CreateAlignedStore(memory_data, memory_ptr, Align(sizeof(void*)));
        aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
        aliasinfo.decorateInst(store);
        setName(ctx.emission_context, memory_data, "memory_data");
    } else { // just use the dynamic length version since the malloc will be slow anyway
        alloc = emit_genericmemory_unchecked(ctx, cg_nbytes, cg_typ);
    }
    // Store the length field and zero-init the data if the eltype needs it.
    emit_memory_zeroinit_and_stores(ctx, typ, alloc, cg_nbytes, cg_nel, zi);
    return mark_julia_type(ctx, alloc, true, typ);
}

static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval_t nel, jl_genericmemory_t *inst)
{
emit_typecheck(ctx, nel, (jl_value_t*)jl_long_type, "memorynew");
Expand All @@ -4448,9 +4547,7 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
if (isboxed)
elsz = sizeof(void*);

auto ptls = get_current_ptls(ctx);
auto T_size = ctx.types().T_size;
auto int8t = getInt8Ty(ctx.builder.getContext());
BasicBlock *emptymemBB, *nonemptymemBB, *retvalBB;
emptymemBB = BasicBlock::Create(ctx.builder.getContext(), "emptymem");
nonemptymemBB = BasicBlock::Create(ctx.builder.getContext(), "nonemptymem");
Expand All @@ -4466,18 +4563,7 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
ctx.builder.CreateBr(retvalBB);
nonemptymemBB->insertInto(ctx.f);
ctx.builder.SetInsertPoint(nonemptymemBB);
// else actually allocate mem
auto arg_typename = [&] JL_NOTSAFEPOINT {
std::string type_str;
auto eltype = jl_tparam1(typ);
if (jl_is_datatype(eltype))
type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
else if (jl_is_uniontype(eltype))
type_str = "Union";
else
type_str = "<unknown type>";
return "Memory{" + type_str + "}[]";
};

auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
auto cg_elsz = ConstantInt::get(T_size, elsz);

Expand All @@ -4501,27 +4587,10 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
overflow = ctx.builder.CreateOr(overflow, tobignel);
Value *notoverflow = ctx.builder.CreateNot(overflow);
error_unless(ctx, prepare_call(jlargumenterror_func), notoverflow, "invalid GenericMemory size: the number of elements is either negative or too large for system address width");
// actually allocate
auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
Value *alloc = ctx.builder.CreateCall(call, { ptls, nbytes, cg_typ});
// set length (jl_alloc_genericmemory_unchecked_func doesn't have it)
Value *decay_alloc = decay_derived(ctx, alloc);
Value *len_field = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 0);
auto len_store = ctx.builder.CreateAlignedStore(nel_unboxed, len_field, Align(sizeof(void*)));
auto aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
aliasinfo.decorateInst(len_store);
//This avoids the length store from being deleted which is illegal
ctx.builder.CreateFence(AtomicOrdering::Release, SyncScope::SingleThread);
// zeroinit pointers and unions
if (zi) {
Value *memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
auto *load = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, memory_ptr, Align(sizeof(void*)));
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
aliasinfo.decorateInst(load);
ctx.builder.CreateMemSet(load, ConstantInt::get(int8t, 0), nbytes, Align(sizeof(void*)));
}
// actually allocate the memory

setName(ctx.emission_context, alloc, arg_typename);
Value *alloc = emit_genericmemory_unchecked(ctx, nbytes, cg_typ);
emit_memory_zeroinit_and_stores(ctx, typ, alloc, nbytes, nel_unboxed, zi);
ctx.builder.CreateBr(retvalBB);
nonemptymemBB = ctx.builder.GetInsertBlock();
// phi node to choose which side of branch
Expand Down
23 changes: 19 additions & 4 deletions src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -408,9 +408,9 @@ struct jl_tbaacache_t {
tbaa_arrayptr = tbaa_make_child(mbuilder, "jtbaa_arrayptr", tbaa_array_scalar).first;
tbaa_arraysize = tbaa_make_child(mbuilder, "jtbaa_arraysize", tbaa_array_scalar).first;
tbaa_arrayselbyte = tbaa_make_child(mbuilder, "jtbaa_arrayselbyte", tbaa_array_scalar).first;
tbaa_memoryptr = tbaa_make_child(mbuilder, "jtbaa_memoryptr", tbaa_array_scalar, true).first;
tbaa_memorylen = tbaa_make_child(mbuilder, "jtbaa_memorylen", tbaa_array_scalar, true).first;
tbaa_memoryown = tbaa_make_child(mbuilder, "jtbaa_memoryown", tbaa_array_scalar, true).first;
tbaa_memoryptr = tbaa_make_child(mbuilder, "jtbaa_memoryptr", tbaa_array_scalar).first;
tbaa_memorylen = tbaa_make_child(mbuilder, "jtbaa_memorylen", tbaa_array_scalar).first;
tbaa_memoryown = tbaa_make_child(mbuilder, "jtbaa_memoryown", tbaa_array_scalar).first;
tbaa_const = tbaa_make_child(mbuilder, "jtbaa_const", nullptr, true).first;
}
};
Expand Down Expand Up @@ -1179,6 +1179,7 @@ static const auto jl_alloc_genericmemory_unchecked_func = new JuliaFunction<Type
},
[](LLVMContext &C) {
auto FnAttrs = AttrBuilder(C);
FnAttrs.addAllocKindAttr(AllocFnKind::Alloc);
FnAttrs.addMemoryAttr(MemoryEffects::argMemOnly(ModRefInfo::Ref) | MemoryEffects::inaccessibleMemOnly(ModRefInfo::ModRef));
FnAttrs.addAttribute(Attribute::WillReturn);
FnAttrs.addAttribute(Attribute::NoUnwind);
Expand Down Expand Up @@ -1499,6 +1500,10 @@ static const auto pointer_from_objref_func = new JuliaFunction<>{
AttrBuilder FnAttrs(C);
FnAttrs.addMemoryAttr(MemoryEffects::none());
FnAttrs.addAttribute(Attribute::NoUnwind);
FnAttrs.addAttribute(Attribute::Speculatable);
FnAttrs.addAttribute(Attribute::WillReturn);
FnAttrs.addAttribute(Attribute::NoRecurse);
FnAttrs.addAttribute(Attribute::NoSync);
return AttributeList::get(C,
AttributeSet::get(C, FnAttrs),
Attributes(C, {Attribute::NonNull}),
Expand Down Expand Up @@ -4470,7 +4475,17 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
jl_genericmemory_t *inst = (jl_genericmemory_t*)((jl_datatype_t*)typ)->instance;
if (inst == NULL)
return false;
*ret = emit_memorynew(ctx, typ, argv[2], inst);
if (argv[2].constant) {
if (!jl_is_long(argv[2].constant))
return false;
size_t nel = jl_unbox_long(argv[2].constant);
if (nel < 0)
return false;
*ret = emit_const_len_memorynew(ctx, typ, nel, inst);
}
else {
*ret = emit_memorynew(ctx, typ, argv[2], inst);
}
return true;
}

Expand Down
4 changes: 2 additions & 2 deletions src/llvm-alloc-helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ void jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs r
return true;
}
if (required.pass.gc_loaded_func == callee) {
required.use_info.haspreserve = true;
required.use_info.hasload = true;
// TODO add manual load->store forwarding
push_inst(inst);
return true;
}
if (required.pass.typeof_func == callee) {
Expand Down
10 changes: 5 additions & 5 deletions src/llvm-alloc-opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -752,11 +752,11 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
call->eraseFromParent();
return;
}
//if (pass.gc_loaded_func == callee) {
// call->replaceAllUsesWith(new_i);
// call->eraseFromParent();
// return;
//}
if (pass.gc_loaded_func == callee) {
// TODO: handle data pointer forwarding, length forwarding, and fence removal
user->replaceUsesOfWith(orig_i, Constant::getNullValue(orig_i->getType()));
return;
}
if (pass.typeof_func == callee) {
++RemovedTypeofs;
call->replaceAllUsesWith(tag);
Expand Down
1 change: 1 addition & 0 deletions src/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,7 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder *
JULIA_PASS(FPM.addPass(LateLowerGCPass()));
JULIA_PASS(FPM.addPass(FinalLowerGCPass()));
if (O.getSpeedupLevel() >= 2) {
FPM.addPass(DSEPass());
FPM.addPass(GVNPass());
FPM.addPass(SCCPPass());
FPM.addPass(DCEPass());
Expand Down
50 changes: 50 additions & 0 deletions test/boundscheck_exec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -297,4 +297,54 @@ end
typeintersect(Int, Integer)
end |> only === Type{Int}

if bc_opt == bc_default
    @testset "Array/Memory escape analysis" begin
        # Small fixed-size Memory/Vector allocations should be fully elided
        # by the constant-length memorynew optimization + LLVM escape analysis.
        function no_allocate(T::Type{<:Union{Memory, Vector}})
            v = T(undef, 2)
            v[1] = 2
            v[2] = 3
            return v[1] + v[2]
        end
        function test_alloc(::Type{T}; broken=false) where T
            @test (@allocated no_allocate(T)) == 0 broken=broken
        end
        @testset "$T" for T in [Memory, Vector]
            @testset "$ET" for ET in [Int, Float32, Union{Int, Float64}]
                no_allocate(T{ET}) #compile
                # allocations aren't removed for Union eltypes which they theoretically could be eventually
                test_alloc(T{ET}, broken=(ET==Union{Int, Float64}))
            end
        end
        function f() # this was causing a bug on an in progress version of #55913.
            m = Memory{Float64}(undef, 4)
            m .= 1.0
            s = 0.0
            for x in m  # fixed: the `in` keyword was missing (`for x m` is a syntax error)
                s += x
            end
            s
        end
        @test f() === 4.0
        function confuse_alias_analysis()
            mem0 = Memory{Int}(undef, 1)
            mem1 = Memory{Int}(undef, 1)
            @inbounds mem0[1] = 3
            for width in 1:2
                @inbounds mem1[1] = mem0[1]
                mem0 = mem1
            end
            mem0[1]
        end
        @test confuse_alias_analysis() == 3
        @test (@allocated confuse_alias_analysis()) == 0
        function no_alias_prove(n)
            m1 = Memory{Int}(undef,n)
            m2 = Memory{Int}(undef,n)
            m1 === m2
        end
        no_alias_prove(1)
        @test_broken (@allocated no_alias_prove(5)) == 0
    end
end

end

0 comments on commit 7801351

Please sign in to comment.