From 9e14bf8fbffd1570d719dee1f23df9a1ac114ff7 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Thu, 7 Nov 2024 06:07:26 -0300 Subject: [PATCH] count bytes allocated through malloc more precisely (#55223) Should make the accounting for memory allocated through malloc a bit more accurate. Should also simplify the accounting code by eliminating the use of `jl_gc_count_freed` in `jl_genericmemory_to_string`. --- src/gc-common.c | 91 ++++++++++------------------- src/gc-common.h | 8 +++ src/gc-stock.c | 122 ++++++++++++++++++++++++--------------- src/genericmemory.c | 5 +- src/julia_internal.h | 2 +- test/compiler/codegen.jl | 2 +- 6 files changed, 117 insertions(+), 113 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index b552afb8228f0..c751b54f059f5 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -6,9 +6,6 @@ #include "julia_gcext.h" #include "julia_assert.h" #include "threading.h" -#ifdef __GLIBC__ -#include // for malloc_trim -#endif #ifdef __cplusplus extern "C" { @@ -120,6 +117,37 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); } +// =========================================================================== // +// malloc wrappers, aligned allocation +// =========================================================================== // + +#if defined(_OS_WINDOWS_) +// helper function based partly on wine msvcrt80+ heap.c +// but with several fixes to improve the correctness of the computation and remove unnecessary parameters +#define SAVED_PTR(x) ((void *)((DWORD_PTR)((char *)x - sizeof(void *)) & \ + ~(sizeof(void *) - 1))) +static size_t _aligned_msize(void *p) +{ + void *alloc_ptr = *(void**)SAVED_PTR(p); + return _msize(alloc_ptr) - ((char*)p - (char*)alloc_ptr); +} +#undef SAVED_PTR +#endif + +size_t memory_block_usable_size(void *p, int isaligned) JL_NOTSAFEPOINT +{ +#if defined(_OS_WINDOWS_) + if (isaligned) + return _aligned_msize(p); + else + return _msize(p); +#elif defined(_OS_DARWIN_) + return malloc_size(p); +#else + return malloc_usable_size(p); +#endif +} + // =========================================================================== // // Finalization // =========================================================================== // @@ -505,63 +533,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) return jl_gc_alloc(ptls, sz, NULL); } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. - -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - // allocator entry points JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) diff --git a/src/gc-common.h b/src/gc-common.h index 32b7470b13a58..3007151009f7d 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -12,6 +12,14 @@ #endif #endif +#include + +#if defined(_OS_DARWIN_) +#include +#else +#include // for malloc_trim +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/src/gc-stock.c b/src/gc-stock.c index 3a2027f9190a7..86dbea3b9a17a 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -9,9 +9,6 @@ #include "julia_atomics.h" #include "julia_gcext.h" #include "julia_assert.h" -#ifdef __GLIBC__ -#include // for malloc_trim -#endif #ifdef __cplusplus extern "C" { @@ -569,11 +566,6 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT jl_batch_accum_heap_size(ptls, sz); } -void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT -{ - jl_batch_accum_free_size(jl_current_task->ptls, sz); -} - // Only safe to update the heap inside the GC static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT { @@ -643,13 +635,15 @@ static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT jl_genericmemory_t *m = (jl_genericmemory_t*)v; assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2); char *d = (char*)m->ptr; + size_t freed_bytes = memory_block_usable_size(d, isaligned); + assert(freed_bytes != 0); if (isaligned) jl_free_aligned(d); else free(d); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, - jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_genericmemory_nbytes(m)); - gc_num.freed += jl_genericmemory_nbytes(m); + jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - freed_bytes); + gc_num.freed += freed_bytes; gc_num.freecall++; } @@ -3652,14 +3646,69 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) return max_total_memory; } -// allocation wrappers that track allocation and let collection run +// allocation wrappers that add to gc pressure + +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + return jl_gc_counted_malloc(sz); +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + return jl_gc_counted_calloc(nmsz, 1); +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + size_t sz = memory_block_usable_size(p, 0); + free(p); + jl_task_t *ct = jl_get_current_task(); + if (ct != NULL) + jl_batch_accum_free_size(ct->ptls, sz); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + size_t old = p ? memory_block_usable_size(p, 0) : 0; + void *data = realloc(p, sz); + jl_task_t *ct = jl_get_current_task(); + if (data != NULL && ct != NULL) { + sz = memory_block_usable_size(data, 0); + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + if (!(sz < old)) + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); + + int64_t diff = sz - old; + if (diff < 0) { + jl_batch_accum_free_size(ptls, -diff); + } + else { + jl_batch_accum_heap_size(ptls, diff); + } + } + return data; +} JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; void *data = malloc(sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_task_t *ct = jl_get_current_task(); + if (data != NULL && ct != NULL) { + sz = memory_block_usable_size(data, 0); jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, @@ -3673,54 +3722,29 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; void *data = calloc(nm, sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_task_t *ct = jl_get_current_task(); + if (data != NULL && ct != NULL) { + sz = memory_block_usable_size(data, 0); jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + nm*sz); + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); - jl_batch_accum_heap_size(ptls, sz * nm); + jl_batch_accum_heap_size(ptls, sz); } return data; } JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - free(p); - if (pgcstack != NULL && ct->world_age) { - jl_batch_accum_free_size(ct->ptls, sz); - } + return jl_free(p); } JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - void *data = realloc(p, sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { - jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); - if (!(sz < old)) - jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); - - int64_t diff = sz - old; - if (diff < 0) { - jl_batch_accum_free_size(ptls, -diff); - } - else { - jl_batch_accum_heap_size(ptls, diff); - } - } - return data; + return jl_realloc(p, sz); } // allocating blocks for Arrays and Strings @@ -3741,11 +3765,13 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); + size_t allocated_bytes = memory_block_usable_size(b, 1); + assert(allocated_bytes >= allocsz); jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocated_bytes); jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); - jl_batch_accum_heap_size(ptls, allocsz); + jl_batch_accum_heap_size(ptls, allocated_bytes); #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif diff --git a/src/genericmemory.c b/src/genericmemory.c index 0bf2cf46edaae..2b02f021ccdd0 100644 --- a/src/genericmemory.c +++ b/src/genericmemory.c @@ -165,7 +165,8 @@ JL_DLLEXPORT jl_genericmemory_t *jl_ptr_to_genericmemory(jl_value_t *mtype, void if (own_buffer) { int isaligned = 0; // TODO: allow passing memalign'd buffers jl_gc_track_malloced_genericmemory(ct->ptls, m, isaligned); - jl_gc_count_allocd(nel*elsz); + size_t allocated_bytes = memory_block_usable_size(data, isaligned); + jl_gc_count_allocd(allocated_bytes); } return m; } @@ -208,8 +209,6 @@ JL_DLLEXPORT jl_value_t *jl_genericmemory_to_string(jl_genericmemory_t *m, size_ JL_GC_PUSH1(&o); jl_value_t *str = jl_pchar_to_string((const char*)m->ptr, len); JL_GC_POP(); - if (how == 1) // TODO: we might like to early-call jl_gc_free_memory here instead actually, but hopefully `m` will die soon - jl_gc_count_freed(mlength); return str; } // n.b. how == 0 is always pool-allocated, so the freed bytes are computed from the pool not the object diff --git a/src/julia_internal.h b/src/julia_internal.h index c5bac37890042..5eb99be9e333f 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -615,8 +615,8 @@ jl_svec_t *jl_perm_symsvec(size_t n, ...); void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT; size_t jl_genericmemory_nbytes(jl_genericmemory_t *a) JL_NOTSAFEPOINT; +size_t memory_block_usable_size(void *mem, int isaligned) JL_NOTSAFEPOINT; void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT; -void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT; void jl_gc_run_all_finalizers(jl_task_t *ct); void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task); void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT; diff --git a/test/compiler/codegen.jl b/test/compiler/codegen.jl index fcb3beb87b5a5..83f4001e616e7 100644 --- a/test/compiler/codegen.jl +++ b/test/compiler/codegen.jl @@ -407,7 +407,7 @@ function g_dict_hash_alloc() end # Warm up f_dict_hash_alloc(); g_dict_hash_alloc(); -@test (@allocated f_dict_hash_alloc()) == (@allocated g_dict_hash_alloc()) +@test abs((@allocated f_dict_hash_alloc()) / (@allocated g_dict_hash_alloc()) - 1) < 0.1 # less that 10% difference # returning an argument shouldn't alloc a new box @noinline f33829(x) = (global called33829 = true; x)