DynamoRIO · abhinav92003 · Feb 5, 2024 · Feb 1, 2024 · Feb 2, 2024 · Feb 2, 2024
diff --git a/clients/drcachesim/tracer/kcore_copy.cpp b/clients/drcachesim/tracer/kcore_copy.cpp
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2022-2023 Google, Inc.  All rights reserved.
+ * Copyright (c) 2022-2024 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -38,6 +38,7 @@
 #include <sys/types.h>
 
 #include <fstream>
+#include <set>
 #include <string>
 
 #include "dr_api.h"
@@ -374,6 +375,14 @@ kcore_copy_t::read_modules()
     return true;
 }
 
+static bool
+is_function_symbol(char type)
+{
+    // Returns whether an nm-style symbol type is treated as a function: "t"/"T"
+    // (code section) or "w"/"W" (weak). Avoids toupper(): undefined on negative char.
+    return type == 't' || type == 'T' || type == 'w' || type == 'W';
+}
+
 bool
 kcore_copy_t::read_kallsyms()
 {
@@ -384,12 +393,32 @@ kcore_copy_t::read_kallsyms()
     }
     proc_module_t *kernel_module = nullptr;
     std::string line;
+
+    /* i#6486: Kernel JIT code like eBPF is not included in /proc/modules, but they have
+     * entries in /proc/kallsyms if /proc/sys/net/core/bpf_jit_harden and
+     * /proc/sys/net/core/bpf_jit_kallsyms are set appropriately (see
+     * docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options
+     * for more details).
+     * Perf's kcore copy logic does not copy JIT code but somehow includes JIT encodings
+     * and symbols in perf.data/data itself (not sure how yet). However, we use a
+     * different approach and copy the BPF JIT code to our kcore dump. If we find that
+     * the kernel executes other JIT code (indicated by "no memory mapped at this
+     * address" errors during libipt decoding), we would need to extend this logic to
+     * somehow identify those other /proc/kcore JIT regions.
+     */
+    std::set<uint64_t> bpf_jit_symbols;
+#define BPF_JIT_MODULE_NAME "[bpf]"
+
     while (std::getline(f, line)) {
         char name[KERNEL_SYMBOL_MAX_LEN];
+        char module[KERNEL_SYMBOL_MAX_LEN];
+        char type;
         uint64_t addr;
-        if (dr_sscanf(line.c_str(), HEX64_FORMAT_STRING " %*1c %299s [%*99s", &addr,
-                      name) < 2)
+        int n_read = dr_sscanf(line.c_str(), HEX64_FORMAT_STRING " %c %299s %299s", &addr,
+                               &type, name, module);
+        if (n_read < 3)
             continue;
+        bool has_module = n_read > 3;
         if (strcmp(name, "_stext") == 0) {
             if (kernel_module != nullptr) {
                 ASSERT(false, "multiple kernel modules found");
@@ -409,9 +438,49 @@ kcore_copy_t::read_kallsyms()
             kcore_code_segments_num_++;
             modules_ = kernel_module;
             kernel_module = nullptr;
+        } else if (has_module && strcmp(module, BPF_JIT_MODULE_NAME) == 0 &&
+                   is_function_symbol(type)) {
+            bpf_jit_symbols.insert(addr);
         }
     }
     ASSERT(kernel_module == nullptr, "failed to find kernel module");
+
+    if (!bpf_jit_symbols.empty()) {
+        /* We copy a page size worth of contents after each bpf-related function symbol
+         * in an effort to make sure that the complete function is copied. This is
+         * similar to perf adding page size to the highest kernel symbol in its own
+         * kcore copy logic.
+         */
+        size_t page_size = dr_page_size();
+        proc_module_t *bpf_module = nullptr;
+        for (auto it = bpf_jit_symbols.begin(); it != bpf_jit_symbols.end();) {
+            uint64_t addr = *it;
+            if (bpf_module == nullptr) {
+                bpf_module = (proc_module_t *)dr_global_alloc(sizeof(proc_module_t));
+                bpf_module->start = ALIGN_BACKWARD(addr, page_size);
+                bpf_module->end = ALIGN_FORWARD(addr + page_size, page_size);
+                ++it;
+                continue;
+            }
+            if (bpf_module->end >= addr) {
+                /* Just extend the last module region if the new addr falls within
+                 * the last recorded range.
+                 */
+                bpf_module->end = ALIGN_FORWARD(addr + page_size, page_size);
+                ++it;
+            } else {
+                bpf_module->next = modules_;
+                kcore_code_segments_num_++;
+                modules_ = bpf_module;
+                /* Create a new module region for `addr` in the next iteration. */
+                bpf_module = nullptr;
+            }
+        }
+        ASSERT(bpf_module != nullptr, "Did not expect nullptr");
+        bpf_module->next = modules_;
+        kcore_code_segments_num_++;
+        modules_ = bpf_module;
+    }
     f.close();
     return true;
 }

diff --git a/ext/drpttracer/drpttracer.dox b/ext/drpttracer/drpttracer.dox
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2023 Google, Inc.  All rights reserved.
+ * Copyright (c) 2023-2024 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -82,8 +82,8 @@ The create function lets the client specify the following parameters:
  - pt_size_shift: The size shift of PT trace's ring buffer. It must be greater than 0, and
 the buffer size is 2^pt_size_shift * PAGE_SIZE.
 
- - sideband_size_shift: The size shift of PT sideband data's ring buffer. It must be greater
-than 0, and the buffer size is 2^sideband_size_shift * PAGE_SIZE.
+ - sideband_size_shift: The size shift of PT sideband data's ring buffer. It must be
+greater than 0, and the buffer size is 2^sideband_size_shift * PAGE_SIZE.
 
 \note Linux perf sets the buffer size to 4MiB by default. Therefore, it is best for
 clients to set trace and sideband buffers larger than 4Mib.
@@ -141,4 +141,42 @@ flag to drpttracer_start_tracing().
 data from \p drpttracer doesn't contain sideband data; it only contains the trace
 data and metadata.
 
+\section sec_unit_tests Unit Tests
+
+We have some unit tests that verify the kernel tracing feature in drmemtrace, which
+also uses the `drpttracer` DynamoRIO extension. These tests are not
+built and run by default because they require superuser permission. They are also
+disabled automatically if the host system does not support the Intel-PT feature.
+
+To run these tests, pass `-DRUN_SUDO_TESTS=ON` to cmake when building DynamoRIO. E.g.,
+
+```
+$ cmake -DRUN_SUDO_TESTS=ON -DBUILD_TESTS=ON <dynamorio-src-dir>
+$ make -j
+$ ctest -R 'drpttracer|drcacheoff.kernel'
+```
+
+On some systems, one may see errors like the following:
+
+```
+408:   *** postcmd failed (1): drpt2ir: [28430, IP:ffffffffc11dd000] get next
+408:   instruction error: no memory mapped at this address
+```
+
+This is because our kcore copy logic may have missed copying some instructions from
+`/proc/kcore`. We rely on `/proc/modules` and `/proc/kallsyms` to point to relevant
+kernel code regions. Symbols for JIT code like eBPF are not included by default. The
+following workaround may help in cases where the missing memory region belongs to
+BPF JIT code. These settings make the BPF JIT code symbols visible in `/proc/kallsyms`.
+
+```
+$ sudo bash -c "echo 0 > /proc/sys/net/core/bpf_jit_harden"
+$ sudo bash -c "echo 1 > /proc/sys/net/core/bpf_jit_kallsyms"
+```
+
+You may want to record the existing values in these configs so you can revert them
+after running the tests. See
+https://docs.kernel.org/admin-guide/sysctl/net.html#proc-sys-net-core-network-core-options
+for more details.
+
 */