From 5d0575a911e976fcc238cb260688b0ce6bfea772 Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Thu, 1 Feb 2024 16:38:36 -0500 Subject: [PATCH 1/3] i#6486 kernel tracing: Include BPF JIT code in kcore dump Fixes missing instruction encodings for some kernel code execution captured using Intel-PT. The root-cause seemed to be that JIT code executed by the kernel, eBPF code in this case, does not have entries in /proc/kallsyms, so our kcore dump logic did not include them. This fix looks for BPF related symbols in /proc/kallsyms and includes them in the copied regions from /proc/kcore. Note that BPF JIT symbols are not included in /proc/kallsyms by default. One needs to set /proc/sys/net/core/bpf_jit_harden and /proc/sys/net/core/bpf_jit_kallsyms appropriately (see https://docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options for more details). Added this suggestion to documentation. Tested PT tracing related tests locally on a machine that supports Intel-PT: $ ctest -R 'drpttracer|drcacheoff.kernel' ... Start 213: code_api|client.drpttracer_SUDO-test [sudo] password for sharmaabhinav: 1/5 Test #213: code_api|client.drpttracer_SUDO-test ..................... Passed 4.29 sec Start 412: code_api|tool.drcacheoff.kernel.simple_SUDO 2/5 Test #412: code_api|tool.drcacheoff.kernel.simple_SUDO .............. Passed 4.66 sec Start 413: code_api|tool.drcacheoff.kernel.opcode-mix_SUDO 3/5 Test #413: code_api|tool.drcacheoff.kernel.opcode-mix_SUDO .......... Passed 4.71 sec Start 414: code_api|tool.drcacheoff.kernel.syscall-mix_SUDO 4/5 Test #414: code_api|tool.drcacheoff.kernel.syscall-mix_SUDO ......... Passed 4.59 sec Start 415: code_api|tool.drcacheoff.kernel.invariant-checker_SUDO 5/5 Test #415: code_api|tool.drcacheoff.kernel.invariant-checker_SUDO ... Passed 5.75 sec 100% tests passed, 0 tests failed out of 5 Issue: #6486 --- clients/drcachesim/tracer/kcore_copy.cpp | 73 +++++++++++++++++++++++- ext/drpttracer/drpttracer.dox | 44 +++++++++++++- 2 files changed, 111 insertions(+), 6 deletions(-) diff --git a/clients/drcachesim/tracer/kcore_copy.cpp b/clients/drcachesim/tracer/kcore_copy.cpp index 962ab73f700..f81c8e84f22 100644 --- a/clients/drcachesim/tracer/kcore_copy.cpp +++ b/clients/drcachesim/tracer/kcore_copy.cpp @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2022-2023 Google, Inc. All rights reserved. + * Copyright (c) 2022-2024 Google, Inc. All rights reserved. * **********************************************************/ /* @@ -38,6 +38,7 @@ #include #include +#include #include #include "dr_api.h" @@ -374,6 +375,14 @@ kcore_copy_t::read_modules() return true; } +static bool +is_function_symbol(char type) +{ + // From man nm, "t"/"T" are symbols from the code section, + // and "w"/"W" are weak symbols. + return toupper(type) == 'T' || toupper(type) == 'W'; +} + bool kcore_copy_t::read_kallsyms() { @@ -384,12 +393,32 @@ kcore_copy_t::read_kallsyms() } proc_module_t *kernel_module = nullptr; std::string line; + + /* i#6486: Kernel JIT code like eBPF is not included in /proc/modules, but they have + * entries in /proc/kallsyms if /proc/sys/net/core/bpf_jit_harden and + * /proc/sys/net/core/bpf_jit_kallsyms are set appropriately (see + * docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options + * for more details). + * Perf's kcore copy logic does not copy JIT code but somehow includes JIT encodings + * and symbols in perf.data/data itself (not sure how yet). However, we use a + * different approach and copy the BPF JIT code to our kcore dump. If we find the + * kernel to execute other JIT code (indicated by "no memory mapped at this address" + * errors during libipt decoding), we would need to extend this logic to somehow + * identify those other /proc/kcore JIT regions. + */ + std::set bpf_jit_symbols; +#define BPF_JIT_MODULE_NAME "[bpf]" + while (std::getline(f, line)) { char name[KERNEL_SYMBOL_MAX_LEN]; + char module[KERNEL_SYMBOL_MAX_LEN]; + char type; uint64_t addr; - if (dr_sscanf(line.c_str(), HEX64_FORMAT_STRING " %*1c %299s [%*99s", &addr, - name) < 2) + int n_read = dr_sscanf(line.c_str(), HEX64_FORMAT_STRING " %c %299s %299s", &addr, + &type, name, module); + if (n_read < 3) continue; + bool has_module = n_read > 3; if (strcmp(name, "_stext") == 0) { if (kernel_module != nullptr) { ASSERT(false, "multiple kernel modules found"); @@ -409,9 +438,47 @@ kcore_copy_t::read_kallsyms() kcore_code_segments_num_++; modules_ = kernel_module; kernel_module = nullptr; + } else if (has_module && strcmp(module, BPF_JIT_MODULE_NAME) == 0 && + is_function_symbol(type)) { + bpf_jit_symbols.insert(addr); } } ASSERT(kernel_module == nullptr, "failed to find kernel module"); + + if (!bpf_jit_symbols.empty()) { + /* We copy a page size worth of contents after each bpf-related function symbol + * in an effort to make sure that the complete function is copied. This is + * similar to perf adding page size to the highest kernel symbol in its own + * kcore copy logic. + */ +#define PAGE_SIZE 4096 + proc_module_t *bpf_module = nullptr; + for (auto it = bpf_jit_symbols.begin(); it != bpf_jit_symbols.end();) { + uint64_t addr = *it; + if (bpf_module == nullptr) { + bpf_module = (proc_module_t *)dr_global_alloc(sizeof(proc_module_t)); + bpf_module->start = addr; + bpf_module->end = addr + PAGE_SIZE; + ++it; + continue; + } + if (bpf_module->end < addr) { + /* Just extend the last module region. */ + bpf_module->end = addr + PAGE_SIZE; + ++it; + } else { + bpf_module->next = modules_; + kcore_code_segments_num_++; + modules_ = bpf_module; + /* Create a new module region for `addr` in the next iteration. */ + bpf_module = nullptr; + } + } + ASSERT(bpf_module != nullptr, "Did not expect nullptr"); + bpf_module->next = modules_; + kcore_code_segments_num_++; + modules_ = bpf_module; + } f.close(); return true; } diff --git a/ext/drpttracer/drpttracer.dox b/ext/drpttracer/drpttracer.dox index 08ec8273b79..bba5bc832f9 100644 --- a/ext/drpttracer/drpttracer.dox +++ b/ext/drpttracer/drpttracer.dox @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2023 Google, Inc. All rights reserved. + * Copyright (c) 2023-2024 Google, Inc. All rights reserved. * **********************************************************/ /* @@ -82,8 +82,8 @@ The create function lets the client specify the following parameters: - pt_size_shift: The size shift of PT trace's ring buffer. It must be greater than 0, and the buffer size is 2^pt_size_shift * PAGE_SIZE. - - sideband_size_shift: The size shift of PT sideband data's ring buffer. It must be greater -than 0, and the buffer size is 2^sideband_size_shift * PAGE_SIZE. + - sideband_size_shift: The size shift of PT sideband data's ring buffer. It must be +greater than 0, and the buffer size is 2^sideband_size_shift * PAGE_SIZE. \note Linux perf sets the buffer size to 4MiB by default. Therefore, it is best for clients to set trace and sideband buffers larger than 4Mib. @@ -141,4 +141,42 @@ flag to drpttracer_start_tracing(). data from \p drpttracer doesn't contain sideband data; it only contains the trace data and metadata. +\section sec_unit_tests Unit Tests + +\p We have some unit tests that verify the kernel tracing feature in drmemtrace, which +also uses the `drpttracer` DynamoRIO extension. These tests are not +built and run by default because they require superuser permission. They are also +disabled automatically if the host system does not support the Intel-PT feature. + +To run these tests, pass `-DRUN_SUDO_TESTS=ON` to cmake when building DynamoRIO. E.g., + +``` +$ cmake -DRUN_SUDO_TESTS=ON -DBUILD_TESTS=ON +$ make -j +$ ctest -R 'drpttracer|drcacheoff.kernel' +``` + +On some systems, one may see errors like the following: + +``` +408: *** postcmd failed (1): drpt2ir: [28430, IP:ffffffffc11dd000] get next +408: instruction error: no memory mapped at this address +``` + +This is because our kcore logic copy may have missed copying some instructions from +`/proc/kcore`. We rely on `/proc/modules` and `/proc/kallsyms` to point to relevant +kernel code regions. Symbols for JIT code like eBPF are not included by default. The +following workaround may help in cases where the missing memory region belongs to +BPF JIT code. They make the BPF JIT code symbols visible in `/proc/kallsyms`. + +``` +$ sudo bash -c "echo 0 > /proc/sys/net/core/bpf_jit_harden" +$ sudo bash -c "echo 1 > /proc/sys/net/core/bpf_jit_kallsyms" +``` + +You may want to record the existing values in these configs so you can revert them +after running the tests. See +https://docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options +for more details. + */ From 5bc6d745a13c982d7a07a03575af13bb17676a45 Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Fri, 2 Feb 2024 11:23:53 -0500 Subject: [PATCH 2/3] Address reviewer comments. --- clients/drcachesim/tracer/kcore_copy.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clients/drcachesim/tracer/kcore_copy.cpp b/clients/drcachesim/tracer/kcore_copy.cpp index f81c8e84f22..5d609cd2334 100644 --- a/clients/drcachesim/tracer/kcore_copy.cpp +++ b/clients/drcachesim/tracer/kcore_copy.cpp @@ -401,10 +401,10 @@ kcore_copy_t::read_kallsyms() * for more details). * Perf's kcore copy logic does not copy JIT code but somehow includes JIT encodings * and symbols in perf.data/data itself (not sure how yet). However, we use a - * different approach and copy the BPF JIT code to our kcore dump. If we find the - * kernel to execute other JIT code (indicated by "no memory mapped at this address" - * errors during libipt decoding), we would need to extend this logic to somehow - * identify those other /proc/kcore JIT regions. + * different approach and copy the BPF JIT code to our kcore dump. If we find that + * the kernel executes other JIT code (indicated by "no memory mapped at this + * address" errors during libipt decoding), we would need to extend this logic to + * somehow identify those other /proc/kcore JIT regions. */ std::set bpf_jit_symbols; #define BPF_JIT_MODULE_NAME "[bpf]" @@ -451,20 +451,20 @@ kcore_copy_t::read_kallsyms() * similar to perf adding page size to the highest kernel symbol in its own * kcore copy logic. */ -#define PAGE_SIZE 4096 + size_t page_size = dr_page_size(); proc_module_t *bpf_module = nullptr; for (auto it = bpf_jit_symbols.begin(); it != bpf_jit_symbols.end();) { uint64_t addr = *it; if (bpf_module == nullptr) { bpf_module = (proc_module_t *)dr_global_alloc(sizeof(proc_module_t)); - bpf_module->start = addr; - bpf_module->end = addr + PAGE_SIZE; + bpf_module->start = ALIGN_BACKWARD(addr, page_size); + bpf_module->end = ALIGN_FORWARD(addr + page_size, page_size); ++it; continue; } if (bpf_module->end < addr) { /* Just extend the last module region. */ - bpf_module->end = addr + PAGE_SIZE; + bpf_module->end = ALIGN_FORWARD(addr + page_size, page_size); ++it; } else { bpf_module->next = modules_; From c0eb5a7b3b0967915491fb84edb012206a33a28e Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Fri, 2 Feb 2024 14:45:16 -0500 Subject: [PATCH 3/3] Fix if condition bug --- clients/drcachesim/tracer/kcore_copy.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clients/drcachesim/tracer/kcore_copy.cpp b/clients/drcachesim/tracer/kcore_copy.cpp index 5d609cd2334..0b6f7a08543 100644 --- a/clients/drcachesim/tracer/kcore_copy.cpp +++ b/clients/drcachesim/tracer/kcore_copy.cpp @@ -462,8 +462,10 @@ kcore_copy_t::read_kallsyms() ++it; continue; } - if (bpf_module->end < addr) { - /* Just extend the last module region. */ + if (bpf_module->end >= addr) { + /* Just extend the last module region if the new addr falls within + * the last recorded range. + */ bpf_module->end = ALIGN_FORWARD(addr + page_size, page_size); ++it; } else {