From 5cb0c6144335b4c6f532cdada672fc5812443802 Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Mon, 5 Feb 2024 10:17:02 -0500 Subject: [PATCH] i#6486 kernel tracing: Include BPF JIT code in kcore dump (#6619) Fixes drmemtrace kernel trace libipt post-processing failures caused by missing instruction encodings for some kernel code execution captured using Intel-PT. The root-cause seems to be that JIT code executed by the kernel, BPF code in this case, does not have entries in `/proc/modules`. So, our kcore dump logic did not include them. This fix looks for BPF related symbols in `/proc/kallsyms` and includes them in the copied regions from `/proc/kcore`. Note that BPF JIT symbols are not included in `/proc/kallsyms` by default. One needs to set `/proc/sys/net/core/bpf_jit_harden` and `/proc/sys/net/core/bpf_jit_kallsyms` appropriately (see https://docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options for more details). Added this suggestion to documentation. It may be better to not automatically make this possibly-too-intrusive change to the user's machine in cmake. This is probably fine because the issue is not widespread (not reproduced on public Linux distributions). Tested PT tracing related tests locally on a machine that supports Intel-PT: ``` $ ctest -R 'drpttracer|drcacheoff.kernel' ... Start 213: code_api|client.drpttracer_SUDO-test [sudo] password for sharmaabhinav: 1/5 Test #213: code_api|client.drpttracer_SUDO-test ..................... Passed 4.29 sec Start 412: code_api|tool.drcacheoff.kernel.simple_SUDO 2/5 Test #412: code_api|tool.drcacheoff.kernel.simple_SUDO .............. Passed 4.66 sec Start 413: code_api|tool.drcacheoff.kernel.opcode-mix_SUDO 3/5 Test #413: code_api|tool.drcacheoff.kernel.opcode-mix_SUDO .......... Passed 4.71 sec Start 414: code_api|tool.drcacheoff.kernel.syscall-mix_SUDO 4/5 Test #414: code_api|tool.drcacheoff.kernel.syscall-mix_SUDO ......... Passed 4.59 sec Start 415: code_api|tool.drcacheoff.kernel.invariant-checker_SUDO 5/5 Test #415: code_api|tool.drcacheoff.kernel.invariant-checker_SUDO ... Passed 5.75 sec 100% tests passed, 0 tests failed out of 5 ``` Unfortunately the decode errors do not go away completely even after this fix, but they have become very less frequent now (tool.kernel.simple in release build failed after 40 successful runs with this fix, which failed every run before). Issue: #6486 --- clients/drcachesim/tracer/kcore_copy.cpp | 75 +++++++++++++++++++++++- ext/drpttracer/drpttracer.dox | 44 +++++++++++++- 2 files changed, 113 insertions(+), 6 deletions(-) diff --git a/clients/drcachesim/tracer/kcore_copy.cpp b/clients/drcachesim/tracer/kcore_copy.cpp index 962ab73f700..0b6f7a08543 100644 --- a/clients/drcachesim/tracer/kcore_copy.cpp +++ b/clients/drcachesim/tracer/kcore_copy.cpp @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2022-2023 Google, Inc. All rights reserved. + * Copyright (c) 2022-2024 Google, Inc. All rights reserved. * **********************************************************/ /* @@ -38,6 +38,7 @@ #include #include +#include #include #include "dr_api.h" @@ -374,6 +375,14 @@ kcore_copy_t::read_modules() return true; } +static bool +is_function_symbol(char type) +{ + // From man nm, "t"/"T" are symbols from the code section, + // and "w"/"W" are weak symbols. + return toupper(type) == 'T' || toupper(type) == 'W'; +} + bool kcore_copy_t::read_kallsyms() { @@ -384,12 +393,32 @@ kcore_copy_t::read_kallsyms() } proc_module_t *kernel_module = nullptr; std::string line; + + /* i#6486: Kernel JIT code like eBPF is not included in /proc/modules, but they have + * entries in /proc/kallsyms if /proc/sys/net/core/bpf_jit_harden and + * /proc/sys/net/core/bpf_jit_kallsyms are set appropriately (see + * docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options + * for more details). + * Perf's kcore copy logic does not copy JIT code but somehow includes JIT encodings + * and symbols in perf.data/data itself (not sure how yet). However, we use a + * different approach and copy the BPF JIT code to our kcore dump. If we find that + * the kernel executes other JIT code (indicated by "no memory mapped at this + * address" errors during libipt decoding), we would need to extend this logic to + * somehow identify those other /proc/kcore JIT regions. + */ + std::set bpf_jit_symbols; +#define BPF_JIT_MODULE_NAME "[bpf]" + while (std::getline(f, line)) { char name[KERNEL_SYMBOL_MAX_LEN]; + char module[KERNEL_SYMBOL_MAX_LEN]; + char type; uint64_t addr; - if (dr_sscanf(line.c_str(), HEX64_FORMAT_STRING " %*1c %299s [%*99s", &addr, - name) < 2) + int n_read = dr_sscanf(line.c_str(), HEX64_FORMAT_STRING " %c %299s %299s", &addr, + &type, name, module); + if (n_read < 3) continue; + bool has_module = n_read > 3; if (strcmp(name, "_stext") == 0) { if (kernel_module != nullptr) { ASSERT(false, "multiple kernel modules found"); @@ -409,9 +438,49 @@ kcore_copy_t::read_kallsyms() kcore_code_segments_num_++; modules_ = kernel_module; kernel_module = nullptr; + } else if (has_module && strcmp(module, BPF_JIT_MODULE_NAME) == 0 && + is_function_symbol(type)) { + bpf_jit_symbols.insert(addr); } } ASSERT(kernel_module == nullptr, "failed to find kernel module"); + + if (!bpf_jit_symbols.empty()) { + /* We copy a page size worth of contents after each bpf-related function symbol + * in an effort to make sure that the complete function is copied. This is + * similar to perf adding page size to the highest kernel symbol in its own + * kcore copy logic. + */ + size_t page_size = dr_page_size(); + proc_module_t *bpf_module = nullptr; + for (auto it = bpf_jit_symbols.begin(); it != bpf_jit_symbols.end();) { + uint64_t addr = *it; + if (bpf_module == nullptr) { + bpf_module = (proc_module_t *)dr_global_alloc(sizeof(proc_module_t)); + bpf_module->start = ALIGN_BACKWARD(addr, page_size); + bpf_module->end = ALIGN_FORWARD(addr + page_size, page_size); + ++it; + continue; + } + if (bpf_module->end >= addr) { + /* Just extend the last module region if the new addr falls within + * the last recorded range. + */ + bpf_module->end = ALIGN_FORWARD(addr + page_size, page_size); + ++it; + } else { + bpf_module->next = modules_; + kcore_code_segments_num_++; + modules_ = bpf_module; + /* Create a new module region for `addr` in the next iteration. */ + bpf_module = nullptr; + } + } + ASSERT(bpf_module != nullptr, "Did not expect nullptr"); + bpf_module->next = modules_; + kcore_code_segments_num_++; + modules_ = bpf_module; + } f.close(); return true; } diff --git a/ext/drpttracer/drpttracer.dox b/ext/drpttracer/drpttracer.dox index 08ec8273b79..bba5bc832f9 100644 --- a/ext/drpttracer/drpttracer.dox +++ b/ext/drpttracer/drpttracer.dox @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2023 Google, Inc. All rights reserved. + * Copyright (c) 2023-2024 Google, Inc. All rights reserved. * **********************************************************/ /* @@ -82,8 +82,8 @@ The create function lets the client specify the following parameters: - pt_size_shift: The size shift of PT trace's ring buffer. It must be greater than 0, and the buffer size is 2^pt_size_shift * PAGE_SIZE. - - sideband_size_shift: The size shift of PT sideband data's ring buffer. It must be greater -than 0, and the buffer size is 2^sideband_size_shift * PAGE_SIZE. + - sideband_size_shift: The size shift of PT sideband data's ring buffer. It must be +greater than 0, and the buffer size is 2^sideband_size_shift * PAGE_SIZE. \note Linux perf sets the buffer size to 4MiB by default. Therefore, it is best for clients to set trace and sideband buffers larger than 4Mib. @@ -141,4 +141,42 @@ flag to drpttracer_start_tracing(). data from \p drpttracer doesn't contain sideband data; it only contains the trace data and metadata. +\section sec_unit_tests Unit Tests + +\p We have some unit tests that verify the kernel tracing feature in drmemtrace, which +also uses the `drpttracer` DynamoRIO extension. These tests are not +built and run by default because they require superuser permission. They are also +disabled automatically if the host system does not support the Intel-PT feature. + +To run these tests, pass `-DRUN_SUDO_TESTS=ON` to cmake when building DynamoRIO. E.g., + +``` +$ cmake -DRUN_SUDO_TESTS=ON -DBUILD_TESTS=ON +$ make -j +$ ctest -R 'drpttracer|drcacheoff.kernel' +``` + +On some systems, one may see errors like the following: + +``` +408: *** postcmd failed (1): drpt2ir: [28430, IP:ffffffffc11dd000] get next +408: instruction error: no memory mapped at this address +``` + +This is because our kcore logic copy may have missed copying some instructions from +`/proc/kcore`. We rely on `/proc/modules` and `/proc/kallsyms` to point to relevant +kernel code regions. Symbols for JIT code like eBPF are not included by default. The +following workaround may help in cases where the missing memory region belongs to +BPF JIT code. They make the BPF JIT code symbols visible in `/proc/kallsyms`. + +``` +$ sudo bash -c "echo 0 > /proc/sys/net/core/bpf_jit_harden" +$ sudo bash -c "echo 1 > /proc/sys/net/core/bpf_jit_kallsyms" +``` + +You may want to record the existing values in these configs so you can revert them +after running the tests. See +https://docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options +for more details. + */