Skip to content

Commit

Permalink
i#3544 RV64: Optimize private memcpy and memset (#6800)
Browse files Browse the repository at this point in the history
1. Optimize private memcpy and memset for RV64.
2. Add test to compare private and libc memset.
3. Compare private memcpy with libc memcpy on more small sizes.
4. Fix a bug in core/CMakeLists.txt: so that unit_tests can compare the
private memcpy against the libc memcpy, link unit_tests against
drmemfuncs instead of linking it directly against libc.

The benchmarks below compare the original private memcpy & memset, the
optimized private memcpy & memset, and the glibc memcpy & memset.

Test command:
```
./bin64/unit_tests
```

With the original memcpy and memset, the output is:
```
our_memcpy_time: size=1 time=0
libc_memcpy_time: size=1 time=2
our_memcpy_time: size=4 time=2
libc_memcpy_time: size=4 time=2
our_memcpy_time: size=128 time=16
libc_memcpy_time: size=128 time=4
our_memcpy_time: size=512 time=57
libc_memcpy_time: size=512 time=7
our_memcpy_time: size=8192 time=824
libc_memcpy_time: size=8192 time=79
our_memcpy_time: size=20480 time=2080
libc_memcpy_time: size=20480 time=183
our_memset_time: 4129
libc_memset_time: 292
io all done
testing string
done testing string
```

With the optimized memcpy and memset, the output is:
```
our_memcpy_time: size=1 time=1
libc_memcpy_time: size=1 time=2
our_memcpy_time: size=4 time=1
libc_memcpy_time: size=4 time=3
our_memcpy_time: size=128 time=2
libc_memcpy_time: size=128 time=3
our_memcpy_time: size=512 time=7
libc_memcpy_time: size=512 time=7
our_memcpy_time: size=8192 time=72
libc_memcpy_time: size=8192 time=69
our_memcpy_time: size=20480 time=184
libc_memcpy_time: size=20480 time=175
our_memset_time: 307
libc_memset_time: 302
io all done
testing string
done testing string
```

Issue: #3544
  • Loading branch information
chenhy0106 authored May 10, 2024
1 parent 73b6ca0 commit ef1cd6f
Show file tree
Hide file tree
Showing 3 changed files with 234 additions and 35 deletions.
3 changes: 2 additions & 1 deletion core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1149,8 +1149,9 @@ if (BUILD_TESTS AND
append_property_string(TARGET unit_tests LINK_FLAGS "-Wl,${ld_entry_flag},_start")
endif ()
if (NOT ANDROID) # everything is inside Bionic on Android
target_link_libraries(unit_tests c dl m pthread)
target_link_libraries(unit_tests dl m pthread)
endif ()
target_link_libraries(unit_tests drmemfuncs)
set_preferred_base_start_and_end(unit_tests ${preferred_base} ON)
else (UNIX)
# Just like drinjectlib (see above) we need libc before ntdll
Expand Down
187 changes: 169 additions & 18 deletions core/arch/riscv64/memfuncs.asm
Original file line number Diff line number Diff line change
Expand Up @@ -41,33 +41,184 @@ START_FILE
#ifdef UNIX

/* Private memcpy: void *memcpy(void *dst, const void *src, size_t n).
 * ABI: RISC-V LP64; ARG1 = dst, ARG2 = src, ARG3 = n; returns dst in a0.
 * Optimized with loop unrolling for bulk copies and branchless,
 * possibly-overlapping loads/stores for the small tail sizes.
 * Assumes dst and src do not overlap (standard memcpy contract).
 */
DECLARE_FUNC(memcpy)
GLOBAL_LABEL(memcpy:)
        li       t6, 32
        mv       t0, ARG1 /* Save original dst: it is the return value. */
copy32_:
        /* While size >= 32, copy 4*8 = 32 bytes per iteration using four
         * ld/sd pairs.  When fewer than 32 bytes remain, fall through to
         * copy_remain, which uses branch-minimizing copy sequences.
         */
        blt      ARG3, t6, copy_remain
        ld       t1, 0(ARG2)
        ld       t2, 8(ARG2)
        ld       t3, 16(ARG2)
        ld       t4, 24(ARG2)
        sd       t1, 0(ARG1)
        sd       t2, 8(ARG1)
        sd       t3, 16(ARG1)
        sd       t4, 24(ARG1)
        addi     ARG3, ARG3, -32
        addi     ARG1, ARG1, 32
        addi     ARG2, ARG2, 32
        j        copy32_
copy_remain:
        /* Dispatch on the remaining size: [8,32), [4,8), (0,4), or 0. */
        add      a6, ARG2, ARG3 /* a6 = src + size (one past the end). */
        add      a7, ARG1, ARG3 /* a7 = dst + size (one past the end). */
        li       t6, 8
        bge      ARG3, t6, copy8_32
        li       t6, 4
        bge      ARG3, t6, copy4_8
        bgtz     ARG3, copy0_4
        j        copyexit
copy0_4:
        /* 0 < size < 4:
         * Copy the first byte, the last byte, and the middle byte
         * (src[size/2]).  For size 1 or 2 some of these copies are
         * redundant, but doing them unconditionally avoids branches.
         */
        srli     t4, ARG3, 1
        add      t5, t4, ARG1 /* t5 = &dst[size/2] */
        add      t4, t4, ARG2 /* t4 = &src[size/2] */
        lbu      t1, 0(ARG2)
        lbu      t2, -1(a6)
        lbu      t3, 0(t4)
        sb       t1, 0(ARG1)
        sb       t2, -1(a7)
        sb       t3, 0(t5)
        j        copyexit
copy4_8:
        /* 4 <= size < 8:
         * Copy the first 4 bytes and the last 4 bytes.  The two copies
         * overlap by at least 1 byte; we accept the redundant work to
         * avoid further branches.
         */
        lwu      t1, 0(ARG2)
        lwu      t2, -4(a6)
        sw       t1, 0(ARG1)
        sw       t2, -4(a7)
        j        copyexit
copy8_32:
        /* 8 <= size < 32:
         * Copy the first 8 bytes and the last 8 bytes; they overlap when
         * size < 16.
         */
        ld       t1, 0(ARG2)
        ld       t2, -8(a6)
        sd       t1, 0(ARG1)
        sd       t2, -8(a7)
        /* If size > 16, the intermediate bytes src[8:size-9] have not been
         * copied yet.
         */
        li       t6, 16
        ble      ARG3, t6, copyexit
        ld       t1, 8(ARG2)
        sd       t1, 8(ARG1)
        /* If size > 24, the intermediate bytes src[16:size-9] have not been
         * copied yet.
         */
        li       t6, 24
        ble      ARG3, t6, copyexit
        ld       t1, 16(ARG2)
        sd       t1, 16(ARG1)
copyexit:
        mv       a0, t0 /* Return the original dst. */
        ret
        END_FUNC(memcpy)

/* Private memset: void *memset(void *dst, int c, size_t n).
 * ABI: RISC-V LP64; ARG1 = dst, ARG2 = c, ARG3 = n; returns dst in a0.
 * Optimized with loop unrolling for bulk writes and branchless,
 * possibly-overlapping stores for the small tail sizes.
 */
DECLARE_FUNC(memset)
GLOBAL_LABEL(memset:)
        li       t6, 32
        mv       t0, ARG1 /* Save original dst: it is the return value. */

        /* Broadcast the fill byte into all 8 bytes of t1.
         * Seven shift/or pairs: after each pair one more byte lane of t1
         * holds the value, filling lanes 0 through 7.
         */
        andi     ARG2, ARG2, 0xff
        mv       t1, ARG2
        slli     ARG2, ARG2, 8
        or       t1, t1, ARG2
        slli     ARG2, ARG2, 8
        or       t1, t1, ARG2
        slli     ARG2, ARG2, 8
        or       t1, t1, ARG2
        slli     ARG2, ARG2, 8
        or       t1, t1, ARG2
        slli     ARG2, ARG2, 8
        or       t1, t1, ARG2
        slli     ARG2, ARG2, 8
        or       t1, t1, ARG2
        slli     ARG2, ARG2, 8
        or       t1, t1, ARG2
set32_:
        /* While size >= 32, write 4*8 = 32 bytes per iteration using four
         * sd instructions.  When fewer than 32 bytes remain, fall through
         * to set_remain, which uses branch-minimizing store sequences.
         */
        blt      ARG3, t6, set_remain
        sd       t1, 0(ARG1)
        sd       t1, 8(ARG1)
        sd       t1, 16(ARG1)
        sd       t1, 24(ARG1)
        addi     ARG3, ARG3, -32
        addi     ARG1, ARG1, 32
        j        set32_
set_remain:
        /* Dispatch on the remaining size: [8,32), [4,8), (0,4), or 0. */
        add      a6, ARG1, ARG3 /* a6 = dst + size (one past the end). */
        li       t6, 8
        bge      ARG3, t6, set8_32
        li       t6, 4
        bge      ARG3, t6, set4_8
        bgtz     ARG3, set0_4
        j        setexit
set0_4:
        /* 0 < size < 4:
         * Write the first byte, the last byte, and the middle byte
         * (dst[size/2]).  For size 1 or 2 some of these writes are
         * redundant, but doing them unconditionally avoids branches.
         */
        srli     t4, ARG3, 1
        add      t4, t4, ARG1 /* t4 = &dst[size/2] */
        sb       t1, 0(ARG1)
        sb       t1, -1(a6)
        sb       t1, 0(t4)
        j        setexit
set4_8:
        /* 4 <= size < 8:
         * Write the first 4 bytes and the last 4 bytes.  The two writes
         * overlap by at least 1 byte; we accept the redundant work to
         * avoid further branches.
         */
        sw       t1, 0(ARG1)
        sw       t1, -4(a6)
        j        setexit
set8_32:
        /* 8 <= size < 32:
         * Write the first 8 bytes and the last 8 bytes; they overlap when
         * size < 16.
         */
        sd       t1, 0(ARG1)
        sd       t1, -8(a6)
        /* If size > 16, the intermediate bytes dst[8:size-9] have not been
         * written yet.
         */
        li       t6, 16
        ble      ARG3, t6, setexit
        sd       t1, 8(ARG1)
        /* If size > 24, the intermediate bytes dst[16:size-9] have not been
         * written yet.
         */
        li       t6, 24
        ble      ARG3, t6, setexit
        sd       t1, 16(ARG1)
setexit:
        mv       a0, t0 /* Return the original dst. */
        ret
        END_FUNC(memset)

/* See x86.asm notes about needing these to avoid gcc invoking *_chk */
Expand Down
79 changes: 63 additions & 16 deletions core/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,7 @@ test_sscanf_all_specs(void)
# endif

typedef void (*memcpy_t)(void *dst, const void *src, size_t n);
typedef void (*memset_t)(void *dst, int src, size_t n);

static void
test_memcpy_offset_size(size_t src_offset, size_t dst_offset, size_t size)
Expand Down Expand Up @@ -1004,9 +1005,7 @@ test_our_memset(void)
static void
our_memcpy_vs_libc(void)
{
/* Compare our memcpy with libc memcpy.
* XXX: Should compare on more sizes, especially small ones.
*/
/* Compare our memcpy with libc memcpy. */
size_t alloc_size = 20 * 1024;
int loop_count = 100 * 1000;
void *src = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
Expand All @@ -1018,27 +1017,74 @@ our_memcpy_vs_libc(void)
memset(src, -1, alloc_size);
memset(dst, 0, alloc_size);

our_memcpy_start = query_time_millis();
int tests_size[] = { 1, 4, 128, 512, 8192, alloc_size };
int j;
for (j = 0; j < sizeof(tests_size) / sizeof(int); j++) {
our_memcpy_start = query_time_millis();
for (i = 0; i < loop_count; i++) {
memcpy(src, dst, tests_size[j]);
}
our_memcpy_end = query_time_millis();

libc_memcpy_start = query_time_millis();
for (i = 0; i < loop_count; i++) {
glibc_memcpy(src, dst, tests_size[j]);
}
libc_memcpy_end = query_time_millis();

our_memcpy_time = our_memcpy_end - our_memcpy_start;
libc_memcpy_time = libc_memcpy_end - libc_memcpy_start;
print_file(STDERR,
"our_memcpy_time: size=" UINT64_FORMAT_STRING
" time=" UINT64_FORMAT_STRING "\n",
tests_size[j], our_memcpy_time);
print_file(STDERR,
"libc_memcpy_time: size=" UINT64_FORMAT_STRING
" time=" UINT64_FORMAT_STRING "\n",
tests_size[j], libc_memcpy_time);
}
/* We could assert that we're not too much slower, but that's a recipe for
* flaky failures when the suite is run on shared VMs or in parallel.
*/

global_heap_free(src, alloc_size HEAPACCT(ACCT_OTHER));
global_heap_free(dst, alloc_size HEAPACCT(ACCT_OTHER));
}

static void
our_memset_vs_libc(void)
{
    /* Compare the timing of our private memset against the libc memset,
     * resolved dynamically via RTLD_NEXT so we bypass our own definition.
     * Prints the elapsed milliseconds for each; we deliberately do not
     * assert a speed ratio, as that is a recipe for flaky failures when
     * the suite runs on shared VMs or in parallel.
     */
    size_t alloc_size = 20 * 1024;
    int loop_count = 100 * 1000;
    void *src = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
    void *dst = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
    int i;
    memset_t glibc_memset = (memset_t)dlsym(RTLD_NEXT, "memset");
    uint64 our_memset_start, our_memset_end, our_memset_time;
    uint64 libc_memset_start, libc_memset_end, libc_memset_time;

    /* Time our private memset over both buffers. */
    our_memset_start = query_time_millis();
    for (i = 0; i < loop_count; i++) {
        memset(src, -1, alloc_size);
        memset(dst, 0, alloc_size);
    }
    our_memset_end = query_time_millis();

    /* Time the libc memset over the same buffers and loop count. */
    libc_memset_start = query_time_millis();
    for (i = 0; i < loop_count; i++) {
        glibc_memset(src, -1, alloc_size);
        glibc_memset(dst, 0, alloc_size);
    }
    libc_memset_end = query_time_millis();

    global_heap_free(src, alloc_size HEAPACCT(ACCT_OTHER));
    global_heap_free(dst, alloc_size HEAPACCT(ACCT_OTHER));

    our_memset_time = our_memset_end - our_memset_start;
    libc_memset_time = libc_memset_end - libc_memset_start;
    print_file(STDERR, "our_memset_time: " UINT64_FORMAT_STRING "\n", our_memset_time);
    print_file(STDERR, "libc_memset_time: " UINT64_FORMAT_STRING "\n", libc_memset_time);
}
# endif /* UNIX */

Expand Down Expand Up @@ -1187,6 +1233,7 @@ unit_test_io(void)

/* memset tests */
test_our_memset();
our_memset_vs_libc();
# endif /* UNIX */

/* XXX: add more tests */
Expand Down

0 comments on commit ef1cd6f

Please sign in to comment.