diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 3ddf45cd70..ba6132d2f7 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b95d4f134b..8d2cdcd49d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 3b653e24be..ee962cfb3f 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -582,6 +585,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + /* * For debug purpose -- at least show the link * file pointing to when reporting unsupported file. @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1800,5 +1811,6 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c0986..2d87c73815 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f48915..ffc0455d5f 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be645..79e1ac1113 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/magic.h b/criu/include/magic.h index 0e8c37234e..6f0aff26d8 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 0000000000..bcc0fb45ab --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern void init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101f..c4241be557 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 0000000000..015d3ad699 --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,228 @@ +#include "files.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "pidfd: " + +struct pidfd_info { + PidfdEntry *pidfe; + struct file_desc d; +}; + +struct dead_pidfd { + unsigned int ino; + int pid; + size_t count; + struct hlist_node hash; +}; + +#define DEAD_PIDFD_HASH_SIZE 32 +static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; + +void init_dead_pidfd_hash(void) +{ + for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); +} + +static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) +{ + struct dead_pidfd *dead; + struct hlist_head *chain; + + chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; + hlist_for_each_entry(dead, chain, hash) { + if (dead->ino == ino) + return dead; + } + return NULL; +} + +static void free_dead_pidfd_hash(void) +{ + int i; + struct dead_pidfd *dead; + struct hlist_node *tmp; + + for (i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) { + hlist_for_each_entry_safe(dead, tmp, &dead_pidfd_hash[i], hash) { + xfree(dead); + } + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); + } +} + +int is_pidfd_link(char *link) +{ + /* + * pidfs was introduced in Linux 6.9 + * before which anonymous-inodes were used + */ + return is_anon_link_type(link, "[pidfd]"); +} + +static void pr_info_pidfd(char *action, PidfdEntry *pidfe) +{ + pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", + action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino + ); +} + +static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) +{ + struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) + return -1; + + /* + * Check if the pid pidfd refers to is part of process tree + * This ensures the process will exist on restore. + */ + if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { + pr_err("pidfd pid %d is not a part of process tree..", + pidfd_info.pid); + return -1; + } + + pidfd_info.pidfe.id = id; + pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); + pidfd_info.pidfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIDFD; + fe.id = pidfd_info.pidfe.id; + fe.pidfd = &pidfd_info.pidfe; + + pr_info_pidfd("Dumping", &pidfd_info.pidfe); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops pidfd_dump_ops = { + .type = FD_TYPES__PIDFD, + .dump = dump_one_pidfd, +}; + +static int pidfd_open(pid_t pid, int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int create_tmp_process(void) +{ + int tmp_process; + tmp_process = fork(); + if (tmp_process < 0) { + pr_perror("Could not fork"); + return -1; + } else if (tmp_process == 0) { + while(1); + } + return tmp_process; +} + +static int open_one_pidfd(struct file_desc *d, int *new_fd) +{ + struct pidfd_info *info; + struct dead_pidfd *dead; + int pidfd; + + info = container_of(d, struct pidfd_info, d); + + if (info->pidfe->nspid != -1) { + pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); + if (pidfd < 0) + goto err_close; + goto out; + } + + dead = lookup_dead_pidfd(info->pidfe->ino); + BUG_ON(!dead); + + dead->count--; + if (dead->pid == -1) { + dead->pid = create_tmp_process(); + if (dead->pid < 0) + goto err_close; + } + + pidfd = pidfd_open(dead->pid, info->pidfe->flags); + if (pidfd < 0) + goto err_close; + + if (dead->count == 0) { + int status; + + kill(dead->pid, SIGKILL); + waitpid(dead->pid, &status, 0); + if (WTERMSIG(status) != SIGKILL) + goto err_close; + hlist_del(&dead->hash); + xfree(dead); + } +out: + if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { + goto err_close; + } + + *new_fd = pidfd; + return 0; + +err_close: + free_dead_pidfd_hash(); + pr_perror("Can't create pidfd %#08x NSpid: %d flags: %u", + info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); + return -1; +} + +static struct file_desc_ops pidfd_desc_ops = { + .type = FD_TYPES__PIDFD, + .open = open_one_pidfd +}; + +static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct dead_pidfd *dead; + struct pidfd_info *info = obj; + + info->pidfe = pb_msg(msg, PidfdEntry); + pr_info_pidfd("Collected ", info->pidfe); + + if (info->pidfe->nspid != -1) + goto out; + + dead = lookup_dead_pidfd(info->pidfe->ino); + if (dead) { + dead->count++; + goto out; + } + + dead = xmalloc(sizeof(*dead)); + if (!dead) + return -1; + + INIT_HLIST_NODE(&dead->hash); + dead->ino = info->pidfe->ino; + dead->count = 1; + dead->pid = -1; + + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); +out: + return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); +} + +struct collect_image_info pidfd_cinfo = { + .fd_type = CR_FD_PIDFD, + .pb_type = PB_PIDFD, + .priv_size = sizeof(struct pidfd_info), + .collect = collect_one_pidfd, +}; diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 55aefac7d7..95ebe3a411 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,10 +42,12 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "pidfd.pb-c.h" #include "plugin.h" #include @@ -2165,6 +2167,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { + struct pidfd_dump_info *pidfd_info = arg; + + if (type != FD_TYPES__PIDFD) + continue; + + if (fdinfo_field(str, "ino")) { + ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "Pid")) { + ret = sscanf(str, "%*s %d", &pidfd_info->pid); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "NSpid")) { + char *last; + + last = strrchr(str, '\t'); + if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto parse_err; + } + } + entry_met = true; continue; } diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5be..e0dbfccc21 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX];