forked from checkpoint-restore/criu
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Process file descriptors (pidfds) were introduced to provide a stable handle on a process. They solve the problem of pid recycling. For a detailed explanation, see https://lwn.net/Articles/801319/ and http://www.corsix.org/content/what-is-a-pidfd Before Linux 6.9, anonymous inodes were used for the implementation of pidfds. So, we detect them in a fashion similar to other fd types that use anonymous inodes, by calling `readlink()`. In 6.9, pidfs (a file system for pidfds) was introduced. In 6.9 `S_ISREG()` returned true for pidfds, but this changed again with 6.10. (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285) After this change, pidfs inodes have no file type in st_mode in userspace. We use `PID_FS_MAGIC` to detect pidfds for kernels >= 6.9. Hence, the check for pidfds occurs before the check for regular files. For pidfds that refer to dead processes, we lose the pid of the process, as the Pid and NSpid fields in /proc/<pid>/fdinfo/<pidfd> change to -1. So, we create a temporary process for each unique inode and open pidfds that refer to this process. After all pidfds have been opened, we kill this temporary process. This commit does not include support for pidfds that point to a specific thread, i.e., pidfds opened with the `PIDFD_THREAD` flag. Fixes: checkpoint-restore#2258 checkpoint-restore#2459 Signed-off-by: Bhavik Sachdev <[email protected]>
- Loading branch information
Showing
12 changed files
with
326 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -113,6 +113,7 @@ enum { | |
CR_FD_PIPES, | ||
CR_FD_TTY_FILES, | ||
CR_FD_MEMFD_FILE, | ||
CR_FD_PIDFD, | ||
|
||
CR_FD_AUTOFS, | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#ifndef __CR_PIDFD_H__ | ||
#define __CR_PIDFD_H__ | ||
|
||
#include "files.h" | ||
#include "pidfd.pb-c.h" | ||
|
||
/* Dump-side fd type operations for pidfds (type FD_TYPES__PIDFD). */
extern const struct fdtype_ops pidfd_dump_ops; | ||
/* Restore-side image collection descriptor for pidfd entries. */
extern struct collect_image_info pidfd_cinfo; | ||
/* Tests whether an fd link string names an anonymous-inode "[pidfd]". */
extern int is_pidfd_link(char *link); | ||
/* Resets every bucket of the dead-pidfd hash table to empty. */
extern void init_dead_pidfd_hash(void); | ||
/*
 * Scratch state filled while dumping one pidfd: the image entry being
 * built plus the pid that the fdinfo parser reports for the descriptor.
 */
struct pidfd_dump_info { | ||
/* Image entry that is written out as part of the file entry. */
PidfdEntry pidfe; | ||
/* Pid the pidfd refers to; -1 skips the process-tree check on dump. */
pid_t pid; | ||
}; | ||
|
||
#endif /* __CR_PIDFD_H__ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -70,6 +70,7 @@ enum { | |
PB_BPFMAP_FILE, | ||
PB_BPFMAP_DATA, | ||
PB_APPARMOR, | ||
PB_PIDFD, | ||
|
||
/* PB_AUTOGEN_STOP */ | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,257 @@ | ||
#include "imgset.h" | ||
#include "pidfd.h" | ||
#include "fdinfo.h" | ||
#include "pidfd.pb-c.h" | ||
#include "protobuf.h" | ||
#include "pstree.h" | ||
#include <stdlib.h> | ||
#include <sys/wait.h> | ||
#include <signal.h> | ||
#include "common/bug.h" | ||
|
||
#undef LOG_PREFIX | ||
#define LOG_PREFIX "pidfd: " | ||
|
||
/*
 * Fallback definition for build environments whose headers do not
 * provide PIDFD_THREAD; the flag is defined here as O_EXCL.
 */
#ifndef PIDFD_THREAD | ||
#define PIDFD_THREAD O_EXCL | ||
#endif | ||
|
||
/* Restore-side state kept for every pidfd entry read from the image. */
struct pidfd_info { | ||
/* Decoded image entry; assigned in collect_one_pidfd(). */
PidfdEntry *pidfe; | ||
/* Embedded descriptor; open_one_pidfd() maps it back via container_of(). */
struct file_desc d; | ||
}; | ||
|
||
/*
 * Bookkeeping for pidfds whose image entry has nspid == -1.
 * One entry exists per distinct inode number seen during collection.
 */
struct dead_pidfd { | ||
/* Inode number from the image entry; used as the hash lookup key. */
unsigned int ino; | ||
/* Pid of the temporary process, or -1 while none has been created. */
int pid; | ||
/* Incremented per collected pidfd, decremented per opened pidfd. */
size_t count; | ||
/* Linkage into the dead_pidfd_hash bucket chain. */
struct hlist_node hash; | ||
}; | ||
|
||
/* Number of buckets in the dead-pidfd hash table. */
#define DEAD_PIDFD_HASH_SIZE 32 | ||
/* Hash table of struct dead_pidfd, bucket chosen by ino % DEAD_PIDFD_HASH_SIZE. */
static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; | ||
|
||
void init_dead_pidfd_hash(void) | ||
{ | ||
for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) | ||
INIT_HLIST_HEAD(&dead_pidfd_hash[i]); | ||
} | ||
|
||
static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) | ||
{ | ||
struct dead_pidfd *dead; | ||
struct hlist_head *chain; | ||
|
||
chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; | ||
hlist_for_each_entry(dead, chain, hash) { | ||
if (dead->ino == ino) | ||
return dead; | ||
} | ||
return NULL; | ||
} | ||
|
||
static void free_dead_pidfd_hash(void) | ||
{ | ||
int i; | ||
struct dead_pidfd *dead; | ||
struct hlist_node *tmp; | ||
|
||
for (i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) { | ||
hlist_for_each_entry_safe(dead, tmp, &dead_pidfd_hash[i], hash) { | ||
xfree(dead); | ||
} | ||
INIT_HLIST_HEAD(&dead_pidfd_hash[i]); | ||
} | ||
} | ||
|
||
/*
 * Report whether a descriptor's link target identifies a pidfd backed
 * by an anonymous inode.
 *
 * Anonymous inodes were used for pidfds before pidfs was introduced in
 * Linux 6.9, so this check covers the pre-pidfs representation.
 *
 * Returns the result of matching the link against the "[pidfd]" type.
 */
int is_pidfd_link(char *link)
{
	int matches;

	matches = is_anon_link_type(link, "[pidfd]");
	return matches;
}
|
||
/*
 * Log one pidfd image entry at info level.
 *
 * action: short label printed in front of the entry fields.
 * pidfe:  entry whose id, flags, nspid and ino are printed.
 */
static void pr_info_pidfd(char *action, PidfdEntry *pidfe)
{
	pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", action,
		pidfe->id,
		pidfe->flags,
		pidfe->nspid,
		pidfe->ino);
}
|
||
/*
 * Dump a single pidfd into the files image.
 *
 * pidfd: descriptor whose fdinfo is parsed.
 * id:    file id recorded in the image entry.
 * p:     descriptor parameters (flags and owner) collected by the caller.
 *
 * Fails when fdinfo cannot be parsed, when the descriptor carries the
 * PIDFD_THREAD flag, or when the reported pid is not -1 and does not
 * belong to the process tree being dumped.
 *
 * Returns -1 on those failures, otherwise the result of writing the
 * file entry to the image.
 */
static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p)
{
	struct pidfd_dump_info dump = { .pidfe = PIDFD_ENTRY__INIT };
	FileEntry fe = FILE_ENTRY__INIT;
	PidfdEntry *entry = &dump.pidfe;

	if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &dump))
		return -1;

	if (p->flags & PIDFD_THREAD) {
		pr_err("PIDFD_THREAD flag is currently not supported\n");
		return -1;
	}

	/*
	 * A pidfd with a known pid must refer to a task inside the
	 * dumped process tree, otherwise the target would not exist
	 * on restore.
	 */
	if (dump.pid != -1) {
		if (!pstree_item_by_real(dump.pid)) {
			pr_err("pidfd pid %d is not a part of process tree..\n",
			       dump.pid);
			return -1;
		}
	}

	entry->id = id;
	/* The O_RDWR bits are masked out of the recorded flags. */
	entry->flags = (p->flags & ~O_RDWR);
	entry->fown = (FownEntry *)&p->fown;

	fe.type = FD_TYPES__PIDFD;
	fe.id = entry->id;
	fe.pidfd = entry;

	pr_info_pidfd("Dumping", entry);
	return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE);
}
|
||
const struct fdtype_ops pidfd_dump_ops = { | ||
.type = FD_TYPES__PIDFD, | ||
.dump = dump_one_pidfd, | ||
}; | ||
|
||
/*
 * Thin wrapper that issues the pidfd_open system call directly.
 *
 * Returns the raw syscall() result narrowed to int.
 */
static int pidfd_open(pid_t pid, int flags)
{
	long ret;

	ret = syscall(__NR_pidfd_open, pid, flags);
	return ret;
}
|
||
/*
 * Fork a child that does nothing but sleep in a loop.
 *
 * Returns the child's pid in the parent, or -1 if fork() fails.
 * The child never returns from this function.
 */
static int create_tmp_process(void)
{
	int child = fork();

	if (child == 0) {
		/* Child: idle until the parent kills it. */
		for (;;)
			sleep(1);
	}

	if (child < 0) {
		pr_perror("Could not fork");
		return -1;
	}

	return child;
}
|
||
static int open_one_pidfd(struct file_desc *d, int *new_fd) | ||
{ | ||
struct pidfd_info *info; | ||
struct dead_pidfd *dead; | ||
int pidfd, status; | ||
|
||
info = container_of(d, struct pidfd_info, d); | ||
if (info->pidfe->nspid != -1) { | ||
pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); | ||
if (pidfd < 0) { | ||
pr_perror("Could not open pidfd for %d", info->pidfe->nspid); | ||
goto err_close; | ||
} | ||
goto out; | ||
} | ||
|
||
dead = lookup_dead_pidfd(info->pidfe->ino); | ||
BUG_ON(!dead); | ||
|
||
dead->count--; | ||
if (dead->pid == -1) { | ||
dead->pid = create_tmp_process(); | ||
if (dead->pid < 0) | ||
goto err_close; | ||
} | ||
|
||
pidfd = pidfd_open(dead->pid, info->pidfe->flags); | ||
if (pidfd < 0) { | ||
pr_perror("Could not open pidfd for %d", info->pidfe->nspid); | ||
goto err_close; | ||
} | ||
|
||
if (dead->count > 0) | ||
goto out; | ||
|
||
if (kill(dead->pid, SIGKILL) < 0) { | ||
pr_perror("Could not kill temporary process with pid: %d", | ||
dead->pid); | ||
goto err_close; | ||
} | ||
|
||
if (waitpid(dead->pid, &status, 0) != dead->pid) { | ||
pr_perror("Could not wait on temporary process with pid: %d", | ||
dead->pid); | ||
goto err_close; | ||
} | ||
|
||
if (!WIFSIGNALED(status)) { | ||
pr_err("Expected temporary process to be terminated by a signal\n"); | ||
goto err_close; | ||
} | ||
|
||
if (WTERMSIG(status) != SIGKILL) { | ||
pr_err("Expected temporary process to be terminated by SIGKILL\n"); | ||
goto err_close; | ||
} | ||
|
||
hlist_del(&dead->hash); | ||
xfree(dead); | ||
out: | ||
if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { | ||
goto err_close; | ||
} | ||
|
||
*new_fd = pidfd; | ||
return 0; | ||
|
||
err_close: | ||
pr_err("Can't create pidfd %#08x NSpid: %d flags: %u", | ||
info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); | ||
free_dead_pidfd_hash(); | ||
return -1; | ||
} | ||
|
||
static struct file_desc_ops pidfd_desc_ops = { | ||
.type = FD_TYPES__PIDFD, | ||
.open = open_one_pidfd | ||
}; | ||
|
||
/*
 * Collect one pidfd entry from the image.
 *
 * obj: storage for the struct pidfd_info belonging to this entry.
 * msg: decoded protobuf message holding the PidfdEntry.
 * i:   image the entry was read from (unused here).
 *
 * Entries whose nspid is -1 are also accounted in the dead-pidfd hash:
 * the record for their inode number is created on first sight and its
 * counter is bumped for each further entry with the same inode.
 *
 * Returns -1 if a new hash record cannot be allocated, otherwise the
 * result of registering the file descriptor.
 */
static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i)
{
	struct pidfd_info *info = obj;
	struct dead_pidfd *entry;

	info->pidfe = pb_msg(msg, PidfdEntry);
	pr_info_pidfd("Collected ", info->pidfe);

	if (info->pidfe->nspid == -1) {
		entry = lookup_dead_pidfd(info->pidfe->ino);
		if (entry) {
			entry->count++;
		} else {
			entry = xmalloc(sizeof(*entry));
			if (!entry)
				return -1;

			INIT_HLIST_NODE(&entry->hash);
			entry->ino = info->pidfe->ino;
			entry->count = 1;
			/* No temporary process exists yet for this inode. */
			entry->pid = -1;

			hlist_add_head(&entry->hash,
				       &dead_pidfd_hash[entry->ino % DEAD_PIDFD_HASH_SIZE]);
		}
	}

	return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops);
}
|
||
struct collect_image_info pidfd_cinfo = { | ||
.fd_type = CR_FD_PIDFD, | ||
.pb_type = PB_PIDFD, | ||
.priv_size = sizeof(struct pidfd_info), | ||
.collect = collect_one_pidfd, | ||
}; |
Oops, something went wrong.