Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

criu: Add support for pidfds #2449

Merged
merged 8 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/zdtm/static/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ TST_NOFILE := \
shm-mp \
ptrace_sig \
pidfd_self \
pidfd_dead \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this test fails in Fedora Rawhide:

======================= Run zdtm/static/pidfd_dead in h ========================
Start test
./pidfd_dead --pidfile=pidfd_dead.pid --outfile=pidfd_dead.out
Run criu dump
Run criu restore
=[log]=> dump/zdtm/static/pidfd_dead/168/1/restore.log
------------------------ grep Error ------------------------
b'(00.004951)    168: \t\t\tGoing to dup 1 into 2'
b'(00.004953)    168: \tReceive fd for 2'
b'(00.004961)    168: \t\tCreate fd for 3'
b'(00.005264)    168: \t\tCreate fd for 4'
b'(00.005486)    168: Error (criu/cr-restore.c:1261): 175 killed by signal 9: Killed'
b'(00.005504) Error (criu/cr-restore.c:2314): Restoring FAILED.'
------------------------ ERROR OVER ------------------------
############### Test zdtm/static/pidfd_dead FAIL at CRIU restore ###############
Test output: ================================

https://github.com/checkpoint-restore/criu/runs/31040419148

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will try to see why this happens!
Is there any way that I can replicate this test locally?
I did try running the tests on a VM with Fedora Rawhide 6.12.0-0.rc1.20241005git27cc6fdf7201.22.fc42.x86_64
where these tests end up passing!

Copy link
Member

@rst0git rst0git Oct 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bsach64 There is a race condition between kill() and waitpid() in free_dead_pidfd().

You can replicate the error with the following change:

diff --git a/criu/pidfd.c b/criu/pidfd.c
index fdf5dec60..23062ae02 100644
--- a/criu/pidfd.c
+++ b/criu/pidfd.c
@@ -151,7 +151,7 @@ static int free_dead_pidfd(struct dead_pidfd *dead)
                dead->pid);
                goto err;
        }
-
+       sleep(1);
        if (waitpid(dead->pid, &status, 0) != dead->pid) {
                pr_perror("Could not wait on temporary process with pid: %d",
                dead->pid);
$ sudo python3 zdtm.py run -t zdtm/static/pidfd_dead
userns is supported
=== Run 1/1 ================ zdtm/static/pidfd_dead
====================== Run zdtm/static/pidfd_dead in uns =======================
Start test
./pidfd_dead --pidfile=pidfd_dead.pid --outfile=pidfd_dead.out
Run criu dump
Run criu restore
=[log]=> dump/zdtm/static/pidfd_dead/64/1/restore.log
------------------------ grep Error ------------------------
b'(00.016134)      1: No ipcns-sem-11.img image'
b'(00.020782)      1: net: Try to restore a link 10:1:lo'
b'(00.020820)      1: net: Restoring link lo type 1'
b'(00.024435)      1: net: \tRunning ip addr restore'
b'Error: ipv4: Address already assigned.'
b'Error: ipv6: address already assigned.'
b'(00.141528)      4: \t\tCreate fd for 3'
b'(00.141715)      1: `- render 1 iovs (0x7f8433c61000:8192...)'
b'(00.141790)      1: Restore via sigreturn'
b'(00.142590)      4: \t\tCreate fd for 4'
b'(00.143514)      4: Error (criu/cr-restore.c:1261): 18 killed by signal 9: Killed'
b'(00.143579)      4: Error (criu/pidfd.c:156): pidfd: Could not wait on temporary process with pid: 18: No child processes'
b'(00.143591)      4: Error (criu/pidfd.c:218): pidfd: Failed to delete dead_pidfd struct'
b"(00.143605)      4: Error (criu/pidfd.c:234): pidfd: Can't create pidfd 0x00000d NSpid: -1 flags: 0"
b'(00.143615)      4: Error (criu/files.c:1221): Unable to open fd=5 id=0xd'
b'(00.147752) uns: calling exit_usernsd (-1, 1)'
b'(00.147902) uns: daemon calls 0x479de0 (91, -1, 1)'
b'(00.147939) uns: `- daemon exits w/ 0'
b'(00.150476) uns: daemon stopped'
b'(00.150514) Error (criu/cr-restore.c:2314): Restoring FAILED.'
------------------------ ERROR OVER ------------------------
############### Test zdtm/static/pidfd_dead FAIL at CRIU restore ###############
Test output: ================================

 <<< ================================
##################################### FAIL #####################################

We might need to add the temporary process PID (i.e., the processes created with create_tmp_process()) in the list of task helpers (ta->helpers). See collect_helper_pids() in cr-restore.c.

cc: @avagin

Copy link
Member Author

@bsach64 bsach64 Oct 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another solution might be to block SIGCHLD?
See sysctl.c line 271 https://github.com/checkpoint-restore/criu/blob/criu-dev/criu/sysctl.c#L271

diff --git a/criu/pidfd.c b/criu/pidfd.c
index fdf5dec60..a63a9a4b8 100644
--- a/criu/pidfd.c
+++ b/criu/pidfd.c
@@ -27,6 +27,7 @@ struct dead_pidfd {
 	unsigned int ino;
 	int pid;
 	size_t count;
+	sigset_t oldmask;
 	mutex_t pidfd_lock;
 	struct hlist_node hash;
 };
@@ -152,12 +153,14 @@ static int free_dead_pidfd(struct dead_pidfd *dead)
 		goto err;
 	}
 
+	sleep(1);
 	if (waitpid(dead->pid, &status, 0) != dead->pid) {
 		pr_perror("Could not wait on temporary process with pid: %d",
 		dead->pid);
 		goto err;
 	}
 
+	sigprocmask(SIG_SETMASK, &dead->oldmask, NULL);
 	if (!WIFSIGNALED(status)) {
 		pr_err("Expected temporary process to be terminated by a signal\n");
 		goto err;
@@ -180,6 +183,7 @@ static int open_one_pidfd(struct file_desc *d, int *new_fd)
 {
 	struct pidfd_info *info;
 	struct dead_pidfd *dead = NULL;
+	sigset_t blockmask;
 	int pidfd;
 
 	info = container_of(d, struct pidfd_info, d);
@@ -199,6 +203,9 @@ static int open_one_pidfd(struct file_desc *d, int *new_fd)
 	BUG_ON(dead->count == 0);
 	dead->count--;
 	if (dead->pid == -1) {
+		sigemptyset(&blockmask);
+		sigaddset(&blockmask, SIGCHLD);
+		sigprocmask(SIG_BLOCK, &blockmask, &dead->oldmask);
 		dead->pid = create_tmp_process();
 		if (dead->pid < 0) {
 			mutex_unlock(&dead->pidfd_lock);
@@ -270,6 +277,7 @@ static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i)
 	dead->ino = info->pidfe->ino;
 	dead->count = 1;
 	dead->pid = -1;
+	sigemptyset(&dead->oldmask);
 	mutex_init(&dead->pidfd_lock);
 
 	mutex_lock(dead_pidfd_hash_lock);

cc: @mihalicyn

pidfd_child \
pidfd_kill \
pipe00 \
Expand Down
244 changes: 244 additions & 0 deletions test/zdtm/static/pidfd_dead.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
#include <sys/statfs.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <sys/stat.h>

#include "zdtmtst.h"

const char *test_doc = "Check C/R of pidfds that point to dead processes\n";
const char *test_author = "Bhavik Sachdev <[email protected]>";

#ifndef PID_FS_MAGIC
#define PID_FS_MAGIC 0x50494446
#endif

/*
* main
* `- child
* `- grandchild
*
* main opens a pidfd for both child and grandchild.
* Before C/R we kill both child and grandchild.
* We end up with two unique dead pidfds.
*/

static long get_fs_type(int lfd)
{
struct statfs fst;

if (fstatfs(lfd, &fst)) {
return -1;
}
return fst.f_type;
}

static int pidfd_open(pid_t pid, unsigned int flags)
{
return syscall(__NR_pidfd_open, pid, flags);
}

static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags)
{
return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
}

static int open_pidfd_pair(int pidfd[2], int pid)
{
pidfd[0] = pidfd_open(pid, 0);
if (pidfd[0] < 0) {
pr_perror("pidfd_open() failed");
return 1;
}

pidfd[1] = pidfd_open(pid, 0);
if (pidfd[1] < 0) {
close(pidfd[0]);
pr_perror("pidfd_open() failed");
return 1;
}
return 0;
}

static int compare_pidfds(int pidfd[2])
{
/*
* After linux 6.9 we can compare inode numbers
* to determine if two pidfds point to the same process.
* While the inode number may change before and after C/R
* pidfds pointing to the same pid should have the same inode number.
*/
struct statx stats[2];
statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]);
statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]);
if (stats[0].stx_ino != stats[1].stx_ino)
return 1;
return 0;
}

static int check_for_pidfs(void)
{
long type;
int pidfd = pidfd_open(getpid(), 0);
if (pidfd < 0) {
pr_perror("pidfd open() failed");
return -1;
}
type = get_fs_type(pidfd);
close(pidfd);
return type == PID_FS_MAGIC;
}

int main(int argc, char* argv[])
{
#define READ 0
#define WRITE 1

int child, ret, gchild, p[2], status;
int cpidfd[2], gpidfd[2];
struct statx stats[2];

test_init(argc, argv);

ret = check_for_pidfs();
if (ret < 0)
return 1;

if (ret == 0) {
test_daemon();
test_waitsig();
skip("Test requires pidfs. skipping...");
pass();
return 0;
}

if (pipe(p)) {
pr_perror("pipe");
return 1;
}

child = test_fork();
if (child < 0) {
pr_perror("fork");
return 1;
} else if (child == 0) {
avagin marked this conversation as resolved.
Show resolved Hide resolved
int gchild = test_fork();
close(p[READ]);
if (gchild < 0) {
pr_perror("fork");
return 1;
} else if (gchild == 0) {
close(p[WRITE]);
while(1)
sleep(1000);
} else {
if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) {
pr_perror("write");
return 1;
}
close(p[WRITE]);
if (waitpid(gchild, &status, 0) != gchild) {
pr_perror("waitpid");
return 1;
}

if (!WIFSIGNALED(status)) {
fail("Expected grandchild to be terminated by a signal");
return 1;
}

if (WTERMSIG(status) != SIGKILL) {
fail("Expected grandchild to be terminated by SIGKILL");
return 1;
}

return 0;
}
}

ret = open_pidfd_pair(cpidfd, child);
if (ret)
return 1;

close(p[WRITE]);
if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) {
pr_perror("write");
return 1;
}
close(p[READ]);

ret = open_pidfd_pair(gpidfd, gchild);
if (ret)
return 1;

/*
* We kill grandchild and child processes only after opening pidfds.
*/
if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) {
pr_perror("pidfd_send_signal");
goto fail_close;
}

if (waitpid(child, &status, 0) != child) {
pr_perror("waitpid");
goto fail_close;
}

if (!WIFEXITED(status)) {
fail("Expected child to exit normally");
goto fail_close;
}

if (WEXITSTATUS(status) != 0) {
fail("Expected child to exit with 0");
goto fail_close;
}
usleep(1000);

if (kill(gchild, 0) != -1 && errno != ESRCH) {
fail("Expected grand child to not exist");
goto fail_close;
}

if (kill(child, 0) != -1 && errno != ESRCH) {
fail("Expected child to not exist");
goto fail_close;
}

test_daemon();
test_waitsig();

avagin marked this conversation as resolved.
Show resolved Hide resolved
ret = compare_pidfds(cpidfd);
if (ret) {
fail("inodes not same for same pid");
goto fail_close;
}

ret = compare_pidfds(gpidfd);
if (ret) {
fail("inodes not same for same pid");
goto fail_close;
}

statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]);
statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]);
if (stats[0].stx_ino == stats[1].stx_ino) {
fail("pidfds pointing to diff pids should have diff inodes");
goto fail_close;
}

pass();
close(cpidfd[0]);
close(cpidfd[1]);
close(gpidfd[0]);
close(gpidfd[1]);
return 0;

fail_close:
close(cpidfd[0]);
close(cpidfd[1]);
close(gpidfd[0]);
close(gpidfd[1]);
return 1;
}