Skip to content

Commit

Permalink
criu: Support C/R of pidfds
Browse files Browse the repository at this point in the history
Process file descriptors (pidfds) were introduced to provide a stable
handle on a process. They solve the problem of pid recycling.

For a detailed explanation, see https://lwn.net/Articles/801319/ and
http://www.corsix.org/content/what-is-a-pidfd

Before Linux 6.9, anonymous inodes were used for the implementation of
pidfds. So, we detect them in a fashion similar to other fd types that
use anonymous inodes by calling `readlink()`.
After 6.9, pidfs (a file system for pidfds) was introduced.
In 6.9 `S_ISREG()` returned true for pidfds, but this again changed with
6.10.
(https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285)
After this change, pidfs inodes have no file type in st_mode in
userspace.
We use `PID_FS_MAGIC` to detect pidfds on kernels >= 6.9.
Hence, the check for pidfds occurs before the check for regular files.

For pidfds that refer to dead processes, we lose the pid of the process
as the Pid and NSpid fields in /proc/<pid>/fdinfo/<pidfd> change to -1.
So, we create a temporary process for each unique inode and open pidfds
that refer to this process. After all pidfds have been opened we kill
this temporary process.

This commit does not include support for pidfds that point to a specific
thread, i.e. pidfds opened with the `PIDFD_THREAD` flag.

Fixes: checkpoint-restore#2258 checkpoint-restore#2459

Signed-off-by: Bhavik Sachdev <[email protected]>
  • Loading branch information
bsach64 committed Aug 28, 2024
1 parent 42a4428 commit 829c11b
Show file tree
Hide file tree
Showing 12 changed files with 326 additions and 1 deletion.
1 change: 1 addition & 0 deletions criu/Makefile.crtools
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o
CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV)
obj-y += pidfd-store.o
obj-y += hugetlb.o
obj-y += pidfd.o

PROTOBUF_GEN := scripts/protobuf-gen.sh

Expand Down
3 changes: 2 additions & 1 deletion criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
#include "timens.h"
#include "bpfmap.h"
#include "apparmor.h"
#include "pidfd.h"

#include "parasite-syscall.h"
#include "files-reg.h"
Expand Down Expand Up @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = {
&unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo,
&netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo,
&tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo,
&fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo,
&fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo
};

/* These images are required to restore namespaces */
Expand Down
12 changes: 12 additions & 0 deletions criu/files.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include "kerndat.h"
#include "fdstore.h"
#include "bpfmap.h"
#include "pidfd.h"

#include "protobuf.h"
#include "util.h"
Expand Down Expand Up @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
ops = &signalfd_dump_ops;
else if (is_timerfd_link(link))
ops = &timerfd_dump_ops;
else if (is_pidfd_link(link))
ops = &pidfd_dump_ops;
#ifdef CONFIG_HAS_LIBBPF
else if (is_bpfmap_link(link))
ops = &bpfmap_dump_ops;
Expand All @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
return do_dump_gen_file(&p, lfd, ops, e);
}

if (p.fs_type == PID_FS_MAGIC) {
ops = &pidfd_dump_ops;
return do_dump_gen_file(&p, lfd, ops, e);
}

if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) {
if (fill_fdlink(lfd, &p, &link))
return -1;
Expand Down Expand Up @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i)
case FD_TYPES__MEMFD:
ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo);
break;
case FD_TYPES__PIDFD:
ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo);
break;
#ifdef CONFIG_HAS_LIBBPF
case FD_TYPES__BPFMAP:
ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo);
Expand All @@ -1800,5 +1811,6 @@ int prepare_files(void)
{
init_fdesc_hash();
init_sk_info_hash();
init_dead_pidfd_hash();
return collect_image(&files_cinfo);
}
1 change: 1 addition & 0 deletions criu/image-desc.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF),
FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF),
FD_ENTRY(APPARMOR, "apparmor"),
FD_ENTRY(PIDFD, "pidfd"),

[CR_FD_STATS] = {
.fmt = "stats-%s",
Expand Down
4 changes: 4 additions & 0 deletions criu/include/fs-magic.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,8 @@
#define OVERLAYFS_SUPER_MAGIC 0x794c7630
#endif

/*
 * Filesystem magic of pidfs, the pidfd filesystem introduced in Linux 6.9.
 * The bytes spell "PIDF" in ASCII. Defined here only when the system
 * headers do not already provide it.
 */
#ifndef PID_FS_MAGIC
#define PID_FS_MAGIC 0x50494446
#endif

#endif /* __CR_FS_MAGIC_H__ */
1 change: 1 addition & 0 deletions criu/include/image-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ enum {
CR_FD_PIPES,
CR_FD_TTY_FILES,
CR_FD_MEMFD_FILE,
CR_FD_PIDFD,

CR_FD_AUTOFS,

Expand Down
1 change: 1 addition & 0 deletions criu/include/magic.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
#define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */
#define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */
#define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */
#define PIDFD_MAGIC 0x54435556 /* Ufa */

#define IFADDR_MAGIC RAW_IMAGE_MAGIC
#define ROUTE_MAGIC RAW_IMAGE_MAGIC
Expand Down
16 changes: 16 additions & 0 deletions criu/include/pidfd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef __CR_PIDFD_H__
#define __CR_PIDFD_H__

#include "files.h"
#include "pidfd.pb-c.h"

extern const struct fdtype_ops pidfd_dump_ops;
extern struct collect_image_info pidfd_cinfo;
extern int is_pidfd_link(char *link);
extern void init_dead_pidfd_hash(void);
/*
 * Scratch data handed to parse_fdinfo() while dumping a pidfd.
 * NOTE(review): presumably parse_fdinfo() fills both members from
 * /proc/<pid>/fdinfo/<fd> -- confirm against fdinfo parsing code.
 */
struct pidfd_dump_info {
PidfdEntry pidfe; /* image entry that dump_one_pidfd() completes and writes out */
pid_t pid; /* pid the pidfd refers to; dump_one_pidfd() treats -1 as "process is dead" */
};

#endif /* __CR_PIDFD_H__ */
1 change: 1 addition & 0 deletions criu/include/protobuf-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ enum {
PB_BPFMAP_FILE,
PB_BPFMAP_DATA,
PB_APPARMOR,
PB_PIDFD,

/* PB_AUTOGEN_STOP */

Expand Down
257 changes: 257 additions & 0 deletions criu/pidfd.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
#include "imgset.h"
#include "pidfd.h"
#include "fdinfo.h"
#include "pidfd.pb-c.h"
#include "protobuf.h"
#include "pstree.h"
#include <stdlib.h>
#include <sys/wait.h>
#include <signal.h>
#include <unistd.h>
#include "common/bug.h"

#undef LOG_PREFIX
#define LOG_PREFIX "pidfd: "

/*
 * Fallback for build environments whose headers do not define PIDFD_THREAD.
 * dump_one_pidfd() tests this bit in the fd flags to reject thread pidfds.
 */
#ifndef PIDFD_THREAD
#define PIDFD_THREAD O_EXCL
#endif

/* Restore-side state for one pidfd image entry. */
struct pidfd_info {
PidfdEntry *pidfe; /* decoded image entry, set in collect_one_pidfd() */
struct file_desc d; /* generic file descriptor; open_one_pidfd() maps it back via container_of() */
};

/*
 * Bookkeeping for pidfds whose image entry has NSpid == -1 (the target
 * process was already dead). One record exists per distinct pidfd inode.
 */
struct dead_pidfd {
unsigned int ino; /* pidfd inode number; hash key */
int pid; /* temporary stand-in process, -1 until open_one_pidfd() forks one */
size_t count; /* collected pidfds with this inode that are still to be opened */
struct hlist_node hash; /* linkage inside a dead_pidfd_hash bucket */
};

/* Number of buckets; records are placed in bucket (ino % DEAD_PIDFD_HASH_SIZE). */
#define DEAD_PIDFD_HASH_SIZE 32
/* Hash table of struct dead_pidfd records, keyed by pidfd inode number. */
static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE];

void init_dead_pidfd_hash(void)
{
for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++)
INIT_HLIST_HEAD(&dead_pidfd_hash[i]);
}

/*
 * Find the dead-pidfd record for inode @ino.
 *
 * Returns the matching record, or NULL when none has been registered.
 */
static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino)
{
	struct hlist_head *bucket = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE];
	struct dead_pidfd *entry;

	hlist_for_each_entry(entry, bucket, hash) {
		if (entry->ino != ino)
			continue;
		return entry;
	}

	return NULL;
}

/*
 * Release every dead-pidfd record and leave all buckets empty.
 * Temporary processes referenced by the records are not touched.
 */
static void free_dead_pidfd_hash(void)
{
	struct hlist_head *bucket;
	struct hlist_node *next;
	struct dead_pidfd *entry;
	int idx = 0;

	while (idx < DEAD_PIDFD_HASH_SIZE) {
		bucket = &dead_pidfd_hash[idx++];

		/* _safe variant: the current entry is freed inside the loop */
		hlist_for_each_entry_safe(entry, next, bucket, hash) {
			xfree(entry);
		}

		INIT_HLIST_HEAD(bucket);
	}
}

/*
 * Tell whether @link names an anonymous-inode pidfd.
 *
 * Kernels before Linux 6.9 back pidfds with anonymous inodes; from 6.9 on
 * pidfs is used instead and this link-name test does not apply.
 */
int is_pidfd_link(char *link)
{
	int matched;

	matched = is_anon_link_type(link, "[pidfd]");
	return matched;
}

/* Log the id, flags, NSpid and inode of @pidfe, prefixed with @action. */
static void pr_info_pidfd(char *action, PidfdEntry *pidfe)
{
	pr_info("%s: id %#08x flags %u NSpid %d ino %u\n",
		action,
		pidfe->id,
		pidfe->flags,
		pidfe->nspid,
		pidfe->ino);
}

/*
 * Dump one pidfd into the files image.
 *
 * @pidfd: descriptor being dumped, passed to parse_fdinfo()
 * @id:    file id stored in the image entry
 * @p:     generic fd parameters (flags and fown are used)
 *
 * Returns the result of pb_write_one() on success; -1 when fdinfo parsing
 * fails, when the pidfd carries PIDFD_THREAD, or when it refers to a live
 * process that is not in the dumped process tree.
 */
static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p)
{
	struct pidfd_dump_info pidfd_info = { .pidfe = PIDFD_ENTRY__INIT };
	FileEntry fe = FILE_ENTRY__INIT;
	PidfdEntry *entry = &pidfd_info.pidfe;
	pid_t target;

	if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info))
		return -1;

	if (p->flags & PIDFD_THREAD) {
		pr_err("PIDFD_THREAD flag is currently not supported\n");
		return -1;
	}

	/*
	 * A live target must belong to the dumped process tree, otherwise
	 * it would not exist at restore time. -1 marks a dead target.
	 */
	target = pidfd_info.pid;
	if (target != -1) {
		if (!pstree_item_by_real(target)) {
			pr_err("pidfd pid %d is not a part of process tree..\n",
			       target);
			return -1;
		}
	}

	entry->id = id;
	entry->flags = (p->flags & ~O_RDWR);
	entry->fown = (FownEntry *)&p->fown;

	fe.type = FD_TYPES__PIDFD;
	fe.id = entry->id;
	fe.pidfd = entry;

	pr_info_pidfd("Dumping", entry);
	return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE);
}

/* Dump operations for pidfds; selected by dump_one_file() in files.c. */
const struct fdtype_ops pidfd_dump_ops = {
.type = FD_TYPES__PIDFD,
.dump = dump_one_pidfd,
};

/*
 * Thin wrapper around the pidfd_open system call.
 * Returns the raw syscall result narrowed to int.
 */
static int pidfd_open(pid_t pid, int flags)
{
	long ret;

	ret = syscall(__NR_pidfd_open, pid, flags);
	return ret;
}

/*
 * Fork a child that sleeps forever, to serve as the target of pidfds
 * whose original process was already dead.
 *
 * Returns the child's pid in the parent, or -1 if fork() fails.
 * The child never returns from this function.
 */
static int create_tmp_process(void)
{
	int child = fork();

	if (child == 0) {
		/* child: idle until the parent kills it */
		for (;;)
			sleep(1);
	}

	if (child < 0) {
		pr_perror("Could not fork");
		return -1;
	}

	return child;
}

/*
 * Restore one pidfd described by @d.
 *
 * Entries with NSpid != -1 are opened directly on that pid. Entries with
 * NSpid == -1 referred to a dead process: a temporary process is forked
 * for the first such pidfd of a given inode, every pidfd of that inode is
 * opened on it, and it is killed and reaped once the last one is opened.
 *
 * @d:      file descriptor embedded in a struct pidfd_info
 * @new_fd: receives the opened pidfd on success
 *
 * Returns 0 on success. On failure returns -1, closes the pidfd if it was
 * already opened, and frees the dead-pidfd hash table.
 */
static int open_one_pidfd(struct file_desc *d, int *new_fd)
{
	struct pidfd_info *info;
	struct dead_pidfd *dead;
	/* -1 lets the error path tell whether there is a descriptor to close */
	int pidfd = -1, status;

	info = container_of(d, struct pidfd_info, d);
	if (info->pidfe->nspid != -1) {
		pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags);
		if (pidfd < 0) {
			pr_perror("Could not open pidfd for %d", info->pidfe->nspid);
			goto err_close;
		}
		goto out;
	}

	dead = lookup_dead_pidfd(info->pidfe->ino);
	BUG_ON(!dead);

	dead->count--;
	if (dead->pid == -1) {
		dead->pid = create_tmp_process();
		if (dead->pid < 0)
			goto err_close;
	}

	pidfd = pidfd_open(dead->pid, info->pidfe->flags);
	if (pidfd < 0) {
		/* nspid is -1 on this path; report the pid that was actually tried */
		pr_perror("Could not open pidfd for %d", dead->pid);
		goto err_close;
	}

	/* More pidfds of this inode are pending: keep the process alive */
	if (dead->count > 0)
		goto out;

	if (kill(dead->pid, SIGKILL) < 0) {
		pr_perror("Could not kill temporary process with pid: %d",
			  dead->pid);
		goto err_close;
	}

	if (waitpid(dead->pid, &status, 0) != dead->pid) {
		pr_perror("Could not wait on temporary process with pid: %d",
			  dead->pid);
		goto err_close;
	}

	if (!WIFSIGNALED(status)) {
		pr_err("Expected temporary process to be terminated by a signal\n");
		goto err_close;
	}

	if (WTERMSIG(status) != SIGKILL) {
		pr_err("Expected temporary process to be terminated by SIGKILL\n");
		goto err_close;
	}

	hlist_del(&dead->hash);
	xfree(dead);
out:
	if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) {
		goto err_close;
	}

	*new_fd = pidfd;
	return 0;

err_close:
	pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n",
	       info->pidfe->id, info->pidfe->nspid, info->pidfe->flags);
	/* Do not leak the descriptor when a later step failed */
	if (pidfd >= 0)
		close(pidfd);
	free_dead_pidfd_hash();
	return -1;
}

/* Restore operations attached to every collected pidfd by collect_one_pidfd(). */
static struct file_desc_ops pidfd_desc_ops = {
.type = FD_TYPES__PIDFD,
.open = open_one_pidfd
};

/*
 * Collect one pidfd image entry.
 *
 * @obj: struct pidfd_info to fill
 * @msg: decoded PidfdEntry message
 * @i:   source image (unused)
 *
 * For entries with NSpid == -1, a dead_pidfd record for the entry's inode
 * is created, or its pending count is incremented if one already exists.
 *
 * Returns the result of file_desc_add(), or -1 if allocating a new
 * dead_pidfd record fails.
 */
static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i)
{
	struct pidfd_info *info = obj;
	struct dead_pidfd *dead;
	PidfdEntry *pidfe;

	pidfe = pb_msg(msg, PidfdEntry);
	info->pidfe = pidfe;
	pr_info_pidfd("Collected ", pidfe);

	if (pidfe->nspid == -1) {
		dead = lookup_dead_pidfd(pidfe->ino);
		if (dead) {
			/* another pidfd of an already known dead process */
			dead->count++;
		} else {
			dead = xmalloc(sizeof(*dead));
			if (!dead)
				return -1;

			INIT_HLIST_NODE(&dead->hash);
			dead->ino = pidfe->ino;
			dead->count = 1;
			dead->pid = -1;

			hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]);
		}
	}

	return file_desc_add(&info->d, pidfe->id, &pidfd_desc_ops);
}

/*
 * Collector description for pidfd image entries.
 * NOTE(review): presumably priv_size bytes are allocated per entry and
 * handed to .collect as its first argument -- confirm in the image
 * collection code.
 */
struct collect_image_info pidfd_cinfo = {
.fd_type = CR_FD_PIDFD,
.pb_type = PB_PIDFD,
.priv_size = sizeof(struct pidfd_info),
.collect = collect_one_pidfd,
};
Loading

0 comments on commit 829c11b

Please sign in to comment.