From c60e0d130f98fc929b78a4aec62a9659a4023438 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 21:18:51 +0530 Subject: [PATCH] criu: Support C/R of pidfds Process file descriptors (pidfds) were introduced to provide a stable handle on a process. They solve the problem of pid recycling. For a detailed explanation, see https://lwn.net/Articles/801319/ and http://www.corsix.org/content/what-is-a-pidfd Before Linux 6.9, anonymous inodes were used for the implementation of pidfds. So, we detect them in a fashion similiar to other fd types that use anonymous inodes by calling `readlink()`. After 6.9, pidfs (a file system for pidfds) was introduced. In 6.9 `S_ISREG()` returned true for pidfds, but this again changed with 6.10. (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285) After this change, pidfs inodes have no file type in st_mode in userspace. We use `PID_FS_MAGIC` to detect pidfds for kernel >= 6.9 Hence, check for pidfds occurs before the check for regular files. For pidfds that refer to dead processes, we lose the pid of the process as the Pid and NSpid fields in /proc//fdinfo/ change to -1. So, we create a temporary process for each unique inode and open pidfds that refer to this process. After all pidfds have been opened we kill this temporary process. This commit does not include support for pidfds that point to a specific thread, i.e pidfds opened with `PIDFD_THREAD` flag. Fixes: #2258 Signed-off-by: Bhavik Sachdev --- criu/Makefile.crtools | 1 + criu/cr-restore.c | 3 +- criu/files.c | 17 +++ criu/image-desc.c | 1 + criu/include/fs-magic.h | 4 + criu/include/image-desc.h | 1 + criu/include/magic.h | 1 + criu/include/pidfd.h | 16 ++ criu/include/protobuf-desc.h | 1 + criu/pidfd.c | 286 +++++++++++++++++++++++++++++++++++ criu/proc_parse.c | 29 ++++ criu/protobuf-desc.c | 1 + 12 files changed, 360 insertions(+), 1 deletion(-) create mode 100644 criu/include/pidfd.h create mode 100644 criu/pidfd.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 3ddf45cd70..ba6132d2f7 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4d4dfbe6fe..d5b6c8037a 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 3b653e24be..a57fb860fb 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1800,5 +1811,11 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + + if (init_dead_pidfd_hash()) { + pr_err("Could not initialise hash map for dead pidfds\n"); + return -1; + } + return collect_image(&files_cinfo); } diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c0986..2d87c73815 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f48915..ffc0455d5f 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be645..79e1ac1113 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/magic.h b/criu/include/magic.h index 0e8c37234e..6f0aff26d8 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 0000000000..4d2d71700e --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern int init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101f..c4241be557 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 0000000000..06bf30605e --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,286 @@ +#include "common/lock.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" +#include "rst-malloc.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "pidfd: " + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + PidfdEntry *pidfe; + struct file_desc d; +}; + +struct dead_pidfd { + unsigned int ino; + int pid; + size_t count; + mutex_t pidfd_lock; + struct hlist_node hash; +}; + +#define DEAD_PIDFD_HASH_SIZE 32 +static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; +static mutex_t *dead_pidfd_hash_lock; + +int init_dead_pidfd_hash(void) +{ + for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); + + dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); + if (!dead_pidfd_hash_lock) + return -1; + + mutex_init(dead_pidfd_hash_lock); + + return 0; +} + +static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) +{ + struct dead_pidfd *dead; + struct hlist_head *chain; + + mutex_lock(dead_pidfd_hash_lock); + chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; + hlist_for_each_entry(dead, chain, hash) { + if (dead->ino == ino) { + mutex_unlock(dead_pidfd_hash_lock); + return dead; + } + } + mutex_unlock(dead_pidfd_hash_lock); + + return NULL; +} + +int is_pidfd_link(char *link) +{ + /* + * pidfs was introduced in Linux 6.9 + * before which anonymous-inodes were used + */ + return is_anon_link_type(link, "[pidfd]"); +} + +static void pr_info_pidfd(char *action, PidfdEntry *pidfe) +{ + pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", + action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino + ); +} + +static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) +{ + struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) + return -1; + + if (p->flags & PIDFD_THREAD) { + pr_err("PIDFD_THREAD flag is currently not supported\n"); + return -1; + } + + /* + * Check if the pid pidfd refers to is part of process tree + * This ensures the process will exist on restore. + */ + if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { + pr_err("pidfd pid %d is not a part of process tree..\n", + pidfd_info.pid); + return -1; + } + + pidfd_info.pidfe.id = id; + pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); + pidfd_info.pidfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIDFD; + fe.id = pidfd_info.pidfe.id; + fe.pidfd = &pidfd_info.pidfe; + + pr_info_pidfd("Dumping", &pidfd_info.pidfe); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops pidfd_dump_ops = { + .type = FD_TYPES__PIDFD, + .dump = dump_one_pidfd, +}; + +static int pidfd_open(pid_t pid, int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int create_tmp_process(void) +{ + int tmp_process; + tmp_process = fork(); + if (tmp_process < 0) { + pr_perror("Could not fork"); + return -1; + } else if (tmp_process == 0) { + while(1) + sleep(1); + } + return tmp_process; +} + +static int free_dead_pidfd(struct dead_pidfd *dead) +{ + int status; + + if (kill(dead->pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", + dead->pid); + goto err_close; + } + + if (waitpid(dead->pid, &status, 0) != dead->pid) { + pr_perror("Could not wait on temporary process with pid: %d", + dead->pid); + goto err_close; + } + + if (!WIFSIGNALED(status)) { + pr_err("Expected temporary process to be terminated by a signal\n"); + goto err_close; + } + + if (WTERMSIG(status) != SIGKILL) { + pr_err("Expected temporary process to be terminated by SIGKILL\n"); + goto err_close; + } + + mutex_lock(dead_pidfd_hash_lock); + hlist_del(&dead->hash); + mutex_unlock(dead_pidfd_hash_lock); + return 0; +err_close: + return -1; +} + +static int open_one_pidfd(struct file_desc *d, int *new_fd) +{ + struct pidfd_info *info; + struct dead_pidfd *dead = NULL; + int pidfd; + + info = container_of(d, struct pidfd_info, d); + if (info->pidfe->nspid != -1) { + pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + goto out; + } + + dead = lookup_dead_pidfd(info->pidfe->ino); + BUG_ON(!dead); + + mutex_lock(&dead->pidfd_lock); + BUG_ON(dead->count == 0); + dead->count--; + if (dead->pid == -1) { + dead->pid = create_tmp_process(); + if (dead->pid < 0) { + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + } + + pidfd = pidfd_open(dead->pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + + if (dead->count == 0) { + if (free_dead_pidfd(dead)) { + pr_err("Failed to delete dead_pidfd struct\n"); + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + } + mutex_unlock(&dead->pidfd_lock); + +out: + if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { + goto err_close; + } + + *new_fd = pidfd; + return 0; +err_close: + pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n", + info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); + return -1; +} + +static struct file_desc_ops pidfd_desc_ops = { + .type = FD_TYPES__PIDFD, + .open = open_one_pidfd +}; + +static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct dead_pidfd *dead; + struct pidfd_info *info = obj; + + info->pidfe = pb_msg(msg, PidfdEntry); + pr_info_pidfd("Collected ", info->pidfe); + + if (info->pidfe->nspid != -1) + goto out; + + dead = lookup_dead_pidfd(info->pidfe->ino); + if (dead) { + mutex_lock(&dead->pidfd_lock); + dead->count++; + mutex_unlock(&dead->pidfd_lock); + goto out; + } + + dead = shmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate shared memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->ino = info->pidfe->ino; + dead->count = 1; + dead->pid = -1; + mutex_init(&dead->pidfd_lock); + + mutex_lock(dead_pidfd_hash_lock); + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); + mutex_unlock(dead_pidfd_hash_lock); +out: + return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); +} + +struct collect_image_info pidfd_cinfo = { + .fd_type = CR_FD_PIDFD, + .pb_type = PB_PIDFD, + .priv_size = sizeof(struct pidfd_info), + .collect = collect_one_pidfd, +}; diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 55aefac7d7..95ebe3a411 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,10 +42,12 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "pidfd.pb-c.h" #include "plugin.h" #include @@ -2165,6 +2167,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { + struct pidfd_dump_info *pidfd_info = arg; + + if (type != FD_TYPES__PIDFD) + continue; + + if (fdinfo_field(str, "ino")) { + ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "Pid")) { + ret = sscanf(str, "%*s %d", &pidfd_info->pid); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "NSpid")) { + char *last; + + last = strrchr(str, '\t'); + if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto parse_err; + } + } + entry_met = true; continue; } diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5be..e0dbfccc21 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX];