demand paging for anonymous mmaps #607

Merged: 15 commits, Mar 19, 2019
4 changes: 3 additions & 1 deletion src/runtime/buffer.h
@@ -64,7 +64,8 @@ static inline void buffer_extend(buffer b, bytes len)
assert(!b->wrapped); /* wrapped buffers can't be extended */
int oldlen = b->length;
b->length = 2*((b->end-b->start)+len);
void *new = allocate(b->h, b->length);
assert(new != INVALID_ADDRESS);
runtime_memcpy(new, b->contents + b->start, (b->end-b->start));
deallocate(b->h, b->contents, oldlen);
b->end = b->end - b->start;
@@ -91,6 +92,7 @@ static inline buffer wrap_buffer(heap h,
bytes length)
{
buffer new = allocate(h, sizeof(struct buffer));
assert(new != INVALID_ADDRESS);
new->contents = body;
new->start = 0;
new->h = h;
2 changes: 1 addition & 1 deletion src/runtime/heap/heap.h
@@ -45,7 +45,7 @@ static inline int subdivide(int quantum, int per, int s, int o)
#define allocate_zero(__h, __b) ({\
u64 __len = __b;\
void *x = allocate(__h, __len); \
zero(x, __len); \
if (x != INVALID_ADDRESS) zero(x, __len); \
x; })

static inline void leak(heap h, u64 x, bytes length)
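The allocate_zero change above only zeroes the region when the underlying allocation succeeded, so callers still have to test the result themselves. A minimal sketch of that calling pattern, assuming the heap, allocate_zero, deallocate and INVALID_ADDRESS declarations shown in this diff (the helper name is hypothetical):

/* Illustrative only, not part of this patch: allocate a zeroed scratch
   region and report failure instead of touching INVALID_ADDRESS. */
static boolean make_scratch(heap h, bytes len)
{
    void *p = allocate_zero(h, len);    /* zeroed only on success */
    if (p == INVALID_ADDRESS)
        return false;                   /* propagate allocation failure */
    /* ... use p ... */
    deallocate(h, p, len);
    return true;
}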
9 changes: 3 additions & 6 deletions src/runtime/runtime.h
@@ -5,6 +5,9 @@ typedef u32 character;

#define true 1
#define false 0
#define infinity (-1ull)
#define INVALID_PHYSICAL ((u64)infinity)
#define INVALID_ADDRESS ((void *)infinity)

typedef u64 timestamp;

@@ -112,12 +115,6 @@ physical vtop(void *x);
physical physical_from_virtual(void *x);
#endif

#define infinity (-1ull)

#define INVALID_PHYSICAL ((u64)0xffffffffffffffff)

#define INVALID_ADDRESS ((void *)0xffffffffffffffffull)

heap zero_wrap(heap meta, heap parent);

boolean validate_virtual(void *base, u64 length);
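infinity, INVALID_PHYSICAL and INVALID_ADDRESS now sit near the top of runtime.h instead of after the paging declarations, presumably so inline code in headers pulled in earlier (such as the buffer.h and heap.h changes above) can refer to them, and so the two sentinels visibly share the same all-ones encoding. A short sketch of how each is checked in this patch, one for pointer-returning allocators and one for u64-returning ones:

/* Sketch of the failure checks as they appear elsewhere in this diff. */
vmap vm = allocate(h, sizeof(struct vmap));
if (vm == INVALID_ADDRESS)              /* pointer allocator failed */
    return -ENOMEM;

u64 paddr = allocate_u64(heap_physical(kh), PAGESIZE);
if (paddr == INVALID_PHYSICAL)          /* u64 (physical) allocator failed */
    return -ENOMEM;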
197 changes: 138 additions & 59 deletions src/unix/mmap.c
@@ -1,11 +1,16 @@
#include <unix_internal.h>

#define VMAP_FLAG_MMAP 1
#define VMAP_FLAG_ANONYMOUS 2

typedef struct vmap {
struct rmnode node;
/* oh, what we could do here */
heap vheap; /* presently either p->virtual or p->virtual32 */
u32 flags;
u32 prot;
} *vmap;

static inline vmap allocate_vmap(heap h, range r)
static vmap allocate_vmap(heap h, range r)
{
vmap vm = allocate(h, sizeof(struct vmap));
if (vm == INVALID_ADDRESS)
@@ -14,6 +19,70 @@ static inline vmap allocate_vmap(heap h, range r)
return vm;
}

/* Page faults may be caused by:

- user program instructions

- syscall top halves accessing unmapped anonymous pages

- syscall bottom halves accessing unmapped anonymous pages (while
in the interrupt handler!)

- can consider manually faulting in user pages in top half to
avoid faults from interrupt handler; this is debatable

Therefore:

- allocating a physical page must be fast and safe at interrupt
level

- as elsewhere in the kernel, if/when we move from the bifurcated
runqueue / interrupt processing scheme, we need to put the
proper locks in place

- we can easily build a free list of physical pages

- also note that, given that id deallocations don't need to
match their source allocations, we can take any size
deallocation and bust it up into single pages to cache

- map() needs to be safe at interrupt and non-interrupt levels

- the page fault handler runs on its own stack (set as IST0 in
TSS), given that the user stack may live on an anonymous mapping
and need to have pages faulted in on its own behalf - otherwise
we eventually wind up with a triple fault as the CPU cannot push
onto the stack when invoking the exception handler
*/

boolean unix_fault_page(u64 vaddr)
{
process p = current->p;
kernel_heaps kh = get_kernel_heaps();
vmap vm;

if ((vm = (vmap)rangemap_lookup(p->vmap, vaddr)) != INVALID_ADDRESS) {
u32 flags = VMAP_FLAG_MMAP | VMAP_FLAG_ANONYMOUS;
if ((vm->flags & flags) != flags) {
msg_err("vaddr 0x%P matched vmap with invalid flags (0x%P)\n", vaddr, vm->flags);
return false;
}

/* XXX make free list */
u64 paddr = allocate_u64(heap_physical(kh), PAGESIZE);
if (paddr == INVALID_PHYSICAL) {
msg_err("cannot get physical page; OOM\n");
return false;
}
u64 vaddr_aligned = vaddr & ~MASK(PAGELOG);
map(vaddr_aligned, paddr, PAGESIZE, heap_pages(kh));
zero(pointer_from_u64(vaddr_aligned), PAGESIZE);
return true;
}
msg_err("no vmap found for vaddr 0x%P\n", vaddr);
return false;
}

sysreturn mremap(void *old_address, u64 old_size, u64 new_size, int flags, void * new_address)
{
kernel_heaps kh = get_kernel_heaps();
@@ -139,72 +208,81 @@ static sysreturn mmap(void *target, u64 size, int prot, int flags, int fd, u64 o
process p = current->p;
kernel_heaps kh = get_kernel_heaps();
heap h = heap_general(kh);
heap pages = heap_pages(kh);
heap physical = heap_physical(kh);
// its really unclear whether this should be extended or truncated
u64 len = pad(size, PAGESIZE) & MASK(32);
u64 where = u64_from_pointer(target);
u64 end = where + size - 1;
boolean fixed = (flags & MAP_FIXED) != 0;
boolean mapped = false;
thread_log(current, "mmap: target %p, size %P, prot %P, flags %P, fd %d, offset %P",
target, size, prot, flags, fd, offset);

if (target) {
if (where) {
thread_log(current, " %s at %P", fixed ? "fixed" : "hint", where);

/* 32 bit mode is ignored if MAP_FIXED */
heap vh = p->virtual;
if (where < HUGE_PAGESIZE && end < HUGE_PAGESIZE) {
/* bound by kernel and zero page. */
if (where >= PROCESS_VIRTUAL_32_HEAP_START || end <= PROCESS_VIRTUAL_32_HEAP_END) {
/* Attempt to reserve low memory fixed mappings in
virtual32 to avoid collisions in any future low mem
allocation. Don't fail if we can't reserve or it's
already reserved. */
id_heap_reserve(p->virtual32, where, size);
} else if (fixed) {
thread_log(current, " map [%P - %P] outside of valid 32-bit range [%P - %P]",
where, end, PROCESS_VIRTUAL_32_HEAP_START, PROCESS_VIRTUAL_32_HEAP_END);
return -ENOMEM;
} else {
target = 0; /* allocate */
}
vmap vmap_start = (vmap)rangemap_lookup(p->vmap, where);
vmap vmap_end = (vmap)rangemap_lookup(p->vmap, end);
if (vmap_start != INVALID_ADDRESS &&
vmap_end == vmap_start &&
(vmap_start->flags & VMAP_FLAG_ANONYMOUS) == 0) {
mapped = true;
} else {
if (where < PROCESS_VIRTUAL_HEAP_START || end > PROCESS_VIRTUAL_HEAP_END) {
/* Try to allow outside our process virtual space, as
long as we can block it out in virtual_huge. */
vh = heap_virtual_huge(kh);
}

/* XXX range lookup in rtrie is broke, do manually until
fixed... note that this check could pass even if start and
end lie in two different mmapped areas. No matter, as we
just need to verify that this overlapping map lies in a
huge page that we're already using...the overlapping mmap
lawlessness is to be tolerated for the moment.

This is like a really crude start to vm tracking...
*/
if (rangemap_lookup(p->vmap, where) == INVALID_ADDRESS ||
rangemap_lookup(p->vmap, end) == INVALID_ADDRESS) {
u64 mapstart = where & ~(HUGE_PAGESIZE - 1);
u64 mapend = pad(end, HUGE_PAGESIZE);
u64 maplen = mapend - mapstart + 1;

if (id_heap_reserve(vh, mapstart, maplen)) {
vmap vm = allocate_vmap(h, irange(mapstart, mapstart + maplen));
if (vm == INVALID_ADDRESS) {
msg_err("failed to allocate vmap\n");
return -ENOMEM;
}
assert(rangemap_insert(p->vmap, &vm->node));
/* 32 bit mode is ignored if MAP_FIXED */
heap vh = p->virtual;
if (where < HUGE_PAGESIZE && end < HUGE_PAGESIZE) {
/* bound by kernel and zero page. */
if (where >= PROCESS_VIRTUAL_32_HEAP_START || end <= PROCESS_VIRTUAL_32_HEAP_END) {
/* Attempt to reserve low memory fixed mappings in
virtual32 to avoid collisions in any future low mem
allocation. Don't fail if we can't reserve or it's
already reserved. */
id_heap_reserve(p->virtual32, where, size);
} else if (fixed) {
thread_log(current, " failed to reserve area [%P - %P] in id heap",
where, end);
thread_log(current, " map [%P - %P] outside of valid 32-bit range [%P - %P]",
where, end, PROCESS_VIRTUAL_32_HEAP_START, PROCESS_VIRTUAL_32_HEAP_END);
return -ENOMEM;
} else {
target = 0; /* allocate */
}
} else {
if (where < PROCESS_VIRTUAL_HEAP_START || end > PROCESS_VIRTUAL_HEAP_END) {
/* Try to allow outside our process virtual space, as
long as we can block it out in virtual_huge. */
vh = heap_virtual_huge(kh);
}

/* XXX range lookup in rtrie is broke, do manually until
fixed... note that this check could pass even if start and
end lie in two different mmapped areas. No matter, as we
just need to verify that this overlapping map lies in a
huge page that we're already using...the overlapping mmap
lawlessness is to be tolerated for the moment.

This is like a really crude start to vm tracking...
*/
if (vmap_start == INVALID_ADDRESS || vmap_end == INVALID_ADDRESS) {
u64 mapstart = where & ~(HUGE_PAGESIZE - 1);
u64 mapend = pad(end, HUGE_PAGESIZE);
u64 maplen = mapend - mapstart + 1;

if (id_heap_reserve(vh, mapstart, maplen)) {
vmap vm = allocate_vmap(h, irange(mapstart, mapstart + maplen));
if (vm == INVALID_ADDRESS) {
msg_err("failed to allocate vmap\n");
return -ENOMEM;
}
vm->flags = VMAP_FLAG_MMAP;
if ((flags & MAP_ANONYMOUS))
vm->flags |= VMAP_FLAG_ANONYMOUS;
assert(rangemap_insert(p->vmap, &vm->node));
} else if (fixed) {
thread_log(current, " failed to reserve area [%P - %P] in id heap",
where, end);
return -ENOMEM;
} else {
target = 0; /* allocate */
}
}
}
}
}
@@ -233,19 +311,20 @@ static sysreturn mmap(void *target, u64 size, int prot, int flags, int fd, u64 o
deallocate_u64(vh, where, maplen);
return -ENOMEM;
}
vm->flags = VMAP_FLAG_MMAP;
if ((flags & MAP_ANONYMOUS))
vm->flags |= VMAP_FLAG_ANONYMOUS;
assert(rangemap_insert(p->vmap, &vm->node));
}

// make a generic zero page function
if (flags & MAP_ANONYMOUS) {
u64 m = allocate_u64(physical, len);
if (m == INVALID_PHYSICAL) {
msg_err("failed to allocate physical memory, size %d\n", len);
return -ENOMEM;
if (mapped) {
/* just zero */
zero(pointer_from_u64(where), len);
} else {
thread_log(current, " anon nomap target: %P, len: %P (given size: %P)", where, len, size);
}
map(where, m, len, pages);
thread_log(current, " anon target: %P, len: %P (given size: %P)", where, len, size);
zero(pointer_from_u64(where), len);
return where;
}

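Taken together, the mmap.c changes stop handing out physical memory at mmap() time for anonymous mappings; unix_fault_page allocates, maps and zeroes each page on first touch instead. The following ordinary user-space C program (plain POSIX, not part of the patch) exercises exactly that path: the mmap call should return quickly, and every page-sized stride in the loop should take one demand-paging fault.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 1024 * 1024;              /* 16 MB anonymous mapping */
    unsigned char *p = mmap(0, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    /* Nothing is committed yet; each write below first-touches a new page. */
    for (size_t off = 0; off < len; off += 4096)
        p[off] = 1;
    printf("touched %zu pages\n", len / 4096);
    munmap(p, len);
    return 0;
}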
27 changes: 24 additions & 3 deletions src/unix/syscall.c
@@ -1222,7 +1222,10 @@ static sysreturn brk(void *x)
} else {
// I guess assuming we're aligned
u64 alloc = pad(u64_from_pointer(x), PAGESIZE) - pad(u64_from_pointer(p->brk), PAGESIZE);
map(u64_from_pointer(p->brk), allocate_u64(heap_physical(kh), alloc), alloc, heap_pages(kh));
u64 phys = allocate_u64(heap_physical(kh), alloc);
if (phys == INVALID_PHYSICAL)
return -ENOMEM;
map(u64_from_pointer(p->brk), phys, alloc, heap_pages(kh));
// people shouldn't depend on this
zero(p->brk, alloc);
p->brk += alloc;
@@ -1381,6 +1384,13 @@ sysreturn setgid(gid_t gid)
return 0; /* stub */
}

sysreturn prctl(int option, u64 arg2, u64 arg3, u64 arg4, u64 arg5)
{
thread_log(current, "prctl: option %d, arg2 0x%P, arg3 0x%P, arg4 0x%P, arg5 0x%P",
option, arg2, arg3, arg4, arg5);
return 0;
}

void register_file_syscalls(void **map)
{
register_syscall(map, SYS_read, read);
@@ -1430,6 +1440,7 @@ void register_file_syscalls(void **map)
register_syscall(map, SYS_setgroups, setgroups);
register_syscall(map, SYS_setuid, setuid);
register_syscall(map, SYS_setgid, setgid);
register_syscall(map, SYS_prctl, prctl);
}

void *linux_syscalls[SYS_MAX];
@@ -1447,20 +1458,28 @@ buffer install_syscall(heap h)
return b;
}

static context syscall_frame;

extern char *syscall_name(int);
static void syscall_debug()
{
u64 *f = current->frame;
int call = f[FRAME_VECTOR];
void *debugsyscalls = table_find(current->p->process_root, sym(debugsyscalls));
if(debugsyscalls)
if (debugsyscalls)
thread_log(current, syscall_name(call));
sysreturn (*h)(u64, u64, u64, u64, u64, u64) = current->p->syscall_handlers[call];
sysreturn res = -ENOSYS;
if (h) {
/* exchange frames so that a fault won't clobber the syscall
context, but retain the fault handler that has current enclosed */
context saveframe = running_frame;
running_frame = syscall_frame;
running_frame[FRAME_FAULT_HANDLER] = f[FRAME_FAULT_HANDLER];
res = h(f[FRAME_RDI], f[FRAME_RSI], f[FRAME_RDX], f[FRAME_R10], f[FRAME_R8], f[FRAME_R9]);
if (debugsyscalls)
thread_log(current, "direct return: %d", res);
thread_log(current, "direct return: %d, rsp 0x%P", res, f[FRAME_RSP]);
running_frame = saveframe;
} else if (debugsyscalls) {
thread_log(current, "nosyscall %s", syscall_name(call));
}
@@ -1475,5 +1494,7 @@ void init_syscalls()
{
//syscall = b->contents;
// debug the synthesized version later, at least we have the table dispatch
heap h = heap_general(get_kernel_heaps());
syscall = syscall_debug;
syscall_frame = allocate_frame(h);
}
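The syscall.c changes run each handler on a dedicated syscall_frame, copying only the thread's fault handler across, so a demand-paging fault taken in the middle of a syscall is serviced without clobbering the frame the thread will later be resumed from. A stripped-down sketch of that save/swap/restore pattern, using the context and running_frame names from this diff (the wrapper function itself is hypothetical):

/* Sketch only: 'context' here is the register-frame type used by the kernel
   and running_frame the globally visible current frame, as in this patch. */
static sysreturn call_on_syscall_frame(sysreturn (*handler)(void), context thread_frame)
{
    context saved = running_frame;          /* remember the caller's frame */
    running_frame = syscall_frame;          /* faults now land on the syscall frame */
    running_frame[FRAME_FAULT_HANDLER] = thread_frame[FRAME_FAULT_HANDLER];
    sysreturn res = handler();              /* may page-fault safely */
    running_frame = saved;                  /* restore before returning to the thread */
    return res;
}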
4 changes: 2 additions & 2 deletions src/unix/thread.c
@@ -274,8 +274,8 @@ void run_thread(thread t)
{
current = t;
thread_log(t, "run frame %p, RIP=%p", t->frame, t->frame[FRAME_RIP]);
frame = t->frame;
IRETURN(frame);
running_frame = t->frame;
IRETURN(running_frame);
}

// it might be easier, if a little skeezy, to use the return value