demand paging for anonymous mmaps (#607)
* demand paging for anonymous mmaps - wip

* set up TSS and alternate stack for page fault handler

* allocate_zero() was not detecting allocation failure; demand paging cleanup

* cleanup, add fallback fault handler registration for faults in int handler

* intframe was overzealously removed; clean up frame handling and consolidate into interrupt.c

* syscall frame cleanup

* -1ull redundancies

* superfluous newline

* clean up mixed use of msg_err and console in unix_fault_page
wjhun authored Mar 19, 2019
1 parent df0887b commit e942733
Showing 13 changed files with 346 additions and 118 deletions.
4 changes: 3 additions & 1 deletion src/runtime/buffer.h
@@ -64,7 +64,8 @@ static inline void buffer_extend(buffer b, bytes len)
assert(!b->wrapped); /* wrapped buffers can't be extended */
int oldlen = b->length;
b->length = 2*((b->end-b->start)+len);
void *new = allocate(b->h, b->length);
void *new = allocate(b->h, b->length);
assert(new != INVALID_ADDRESS);
runtime_memcpy(new, b->contents + b->start, (b->end-b->start));
deallocate(b->h, b->contents, oldlen);
b->end = b->end - b->start;
@@ -91,6 +92,7 @@ static inline buffer wrap_buffer(heap h,
bytes length)
{
buffer new = allocate(h, sizeof(struct buffer));
assert(new != INVALID_ADDRESS);
new->contents = body;
new->start = 0;
new->h = h;
2 changes: 1 addition & 1 deletion src/runtime/heap/heap.h
@@ -45,7 +45,7 @@ static inline int subdivide(int quantum, int per, int s, int o)
#define allocate_zero(__h, __b) ({\
u64 __len = __b;\
void *x = allocate(__h, __len); \
zero(x, __len); \
if (x != INVALID_ADDRESS) zero(x, __len); \
x; })

static inline void leak(heap h, u64 x, bytes length)
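Since allocate_zero() now skips the zero() when the underlying allocate() fails, callers are still expected to check the returned pointer themselves. A minimal caller-side sketch, assuming a hypothetical example_alloc() helper rather than any function in this commit:

    /* hypothetical caller; illustrates the check the revised macro relies on */
    static void *example_alloc(heap h, bytes len)
    {
        void *p = allocate_zero(h, len);
        if (p == INVALID_ADDRESS)
            return INVALID_ADDRESS;   /* allocation failed; nothing was zeroed */
        /* ... p is zero-filled and ready for use ... */
        return p;
    }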
9 changes: 3 additions & 6 deletions src/runtime/runtime.h
@@ -5,6 +5,9 @@ typedef u32 character;

#define true 1
#define false 0
#define infinity (-1ull)
#define INVALID_PHYSICAL ((u64)infinity)
#define INVALID_ADDRESS ((void *)infinity)

typedef u64 timestamp;

@@ -112,12 +115,6 @@ physical vtop(void *x);
physical physical_from_virtual(void *x);
#endif

#define infinity (-1ull)

#define INVALID_PHYSICAL ((u64)0xffffffffffffffff)

#define INVALID_ADDRESS ((void *)0xffffffffffffffffull)

heap zero_wrap(heap meta, heap parent);

boolean validate_virtual(void *base, u64 length);
197 changes: 138 additions & 59 deletions src/unix/mmap.c
@@ -1,11 +1,16 @@
#include <unix_internal.h>

#define VMAP_FLAG_MMAP 1
#define VMAP_FLAG_ANONYMOUS 2

typedef struct vmap {
struct rmnode node;
/* oh, what we could do here */
heap vheap; /* presently either p->virtual or p->virtual32 */
u32 flags;
u32 prot;
} *vmap;

static inline vmap allocate_vmap(heap h, range r)
static vmap allocate_vmap(heap h, range r)
{
vmap vm = allocate(h, sizeof(struct vmap));
if (vm == INVALID_ADDRESS)
@@ -14,6 +19,70 @@ static inline vmap allocate_vmap(heap h, range r)
return vm;
}

/* Page faults may be caused by:
- user program instructions
- syscall top halves accessing unmapped anonymous pages
- syscall bottom halves accessing unmapped anonymous pages (while
in the interrupt handler!)
- can consider manually faulting in user pages in top half to
avoid faults from interrupt handler; this is debatable
Therefore:
- allocating a physical page must be fast and safe at interrupt
level
- as elsewhere in the kernel, if/when we move from the bifurcated
runqueue / interrupt processing scheme, we need to put the
proper locks in place
- we can easily build a free list of physical pages
- also note that, given that id deallocations don't need to
match their source allocations, we can take any size
deallocation and bust it up into single pages to cache
- map() needs to be safe at interrupt and non-interrupt levels
- the page fault handler runs on its own stack (set as IST0 in
TSS), given that the user stack may live on an anonymous mapping
and need to have pages faulted in on its own behalf - otherwise
we eventually wind up with a triple fault as the CPU cannot push
onto the stack when invoking the exception handler
*/

boolean unix_fault_page(u64 vaddr)
{
process p = current->p;
kernel_heaps kh = get_kernel_heaps();
vmap vm;

if ((vm = (vmap)rangemap_lookup(p->vmap, vaddr)) != INVALID_ADDRESS) {
u32 flags = VMAP_FLAG_MMAP | VMAP_FLAG_ANONYMOUS;
if ((vm->flags & flags) != flags) {
msg_err("vaddr 0x%P matched vmap with invalid flags (0x%P)\n", vaddr, vm->flags);
return false;
}

/* XXX make free list */
u64 paddr = allocate_u64(heap_physical(kh), PAGESIZE);
if (paddr == INVALID_PHYSICAL) {
msg_err("cannot get physical page; OOM\n");
return false;
}
u64 vaddr_aligned = vaddr & ~MASK(PAGELOG);
map(vaddr_aligned, paddr, PAGESIZE, heap_pages(kh));
zero(pointer_from_u64(vaddr_aligned), PAGESIZE);
return true;
}
msg_err("no vmap found for vaddr 0x%P\n", vaddr);
return false;
}

sysreturn mremap(void *old_address, u64 old_size, u64 new_size, int flags, void * new_address)
{
kernel_heaps kh = get_kernel_heaps();
@@ -139,72 +208,81 @@ static sysreturn mmap(void *target, u64 size, int prot, int flags, int fd, u64 o
process p = current->p;
kernel_heaps kh = get_kernel_heaps();
heap h = heap_general(kh);
heap pages = heap_pages(kh);
heap physical = heap_physical(kh);
// it's really unclear whether this should be extended or truncated
u64 len = pad(size, PAGESIZE) & MASK(32);
u64 where = u64_from_pointer(target);
u64 end = where + size - 1;
boolean fixed = (flags & MAP_FIXED) != 0;
boolean mapped = false;
thread_log(current, "mmap: target %p, size %P, prot %P, flags %P, fd %d, offset %P",
target, size, prot, flags, fd, offset);

if (target) {
if (where) {
thread_log(current, " %s at %P", fixed ? "fixed" : "hint", where);

/* 32 bit mode is ignored if MAP_FIXED */
heap vh = p->virtual;
if (where < HUGE_PAGESIZE && end < HUGE_PAGESIZE) {
/* bound by kernel and zero page. */
if (where >= PROCESS_VIRTUAL_32_HEAP_START || end <= PROCESS_VIRTUAL_32_HEAP_END) {
/* Attempt to reserve low memory fixed mappings in
virtual32 to avoid collisions in any future low mem
allocation. Don't fail if we can't reserve or it's
already reserved. */
id_heap_reserve(p->virtual32, where, size);
} else if (fixed) {
thread_log(current, " map [%P - %P] outside of valid 32-bit range [%P - %P]",
where, end, PROCESS_VIRTUAL_32_HEAP_START, PROCESS_VIRTUAL_32_HEAP_END);
return -ENOMEM;
} else {
target = 0; /* allocate */
}
vmap vmap_start = (vmap)rangemap_lookup(p->vmap, where);
vmap vmap_end = (vmap)rangemap_lookup(p->vmap, end);
if (vmap_start != INVALID_ADDRESS &&
vmap_end == vmap_start &&
(vmap_start->flags & VMAP_FLAG_ANONYMOUS) == 0) {
mapped = true;
} else {
if (where < PROCESS_VIRTUAL_HEAP_START || end > PROCESS_VIRTUAL_HEAP_END) {
/* Try to allow outside our process virtual space, as
long as we can block it out in virtual_huge. */
vh = heap_virtual_huge(kh);
}

/* XXX range lookup in rtrie is broke, do manually until
fixed... note that this check could pass even if start and
end lie in two different mmapped areas. No matter, as we
just need to verify that this overlapping map lies in a
huge page that we're already using...the overlapping mmap
lawlessness is to be tolerated for the moment.
This is like a really crude start to vm tracking...
*/
if (rangemap_lookup(p->vmap, where) == INVALID_ADDRESS ||
rangemap_lookup(p->vmap, end) == INVALID_ADDRESS) {
u64 mapstart = where & ~(HUGE_PAGESIZE - 1);
u64 mapend = pad(end, HUGE_PAGESIZE);
u64 maplen = mapend - mapstart + 1;

if (id_heap_reserve(vh, mapstart, maplen)) {
vmap vm = allocate_vmap(h, irange(mapstart, mapstart + maplen));
if (vm == INVALID_ADDRESS) {
msg_err("failed to allocate vmap\n");
return -ENOMEM;
}
assert(rangemap_insert(p->vmap, &vm->node));
/* 32 bit mode is ignored if MAP_FIXED */
heap vh = p->virtual;
if (where < HUGE_PAGESIZE && end < HUGE_PAGESIZE) {
/* bound by kernel and zero page. */
if (where >= PROCESS_VIRTUAL_32_HEAP_START || end <= PROCESS_VIRTUAL_32_HEAP_END) {
/* Attempt to reserve low memory fixed mappings in
virtual32 to avoid collisions in any future low mem
allocation. Don't fail if we can't reserve or it's
already reserved. */
id_heap_reserve(p->virtual32, where, size);
} else if (fixed) {
thread_log(current, " failed to reserve area [%P - %P] in id heap",
where, end);
thread_log(current, " map [%P - %P] outside of valid 32-bit range [%P - %P]",
where, end, PROCESS_VIRTUAL_32_HEAP_START, PROCESS_VIRTUAL_32_HEAP_END);
return -ENOMEM;
} else {
target = 0; /* allocate */
}
} else {
if (where < PROCESS_VIRTUAL_HEAP_START || end > PROCESS_VIRTUAL_HEAP_END) {
/* Try to allow outside our process virtual space, as
long as we can block it out in virtual_huge. */
vh = heap_virtual_huge(kh);
}

/* XXX range lookup in rtrie is broke, do manually until
fixed... note that this check could pass even if start and
end lie in two different mmapped areas. No matter, as we
just need to verify that this overlapping map lies in a
huge page that we're already using...the overlapping mmap
lawlessness is to be tolerated for the moment.
This is like a really crude start to vm tracking...
*/
if (vmap_start == INVALID_ADDRESS || vmap_end == INVALID_ADDRESS) {
u64 mapstart = where & ~(HUGE_PAGESIZE - 1);
u64 mapend = pad(end, HUGE_PAGESIZE);
u64 maplen = mapend - mapstart + 1;

if (id_heap_reserve(vh, mapstart, maplen)) {
vmap vm = allocate_vmap(h, irange(mapstart, mapstart + maplen));
if (vm == INVALID_ADDRESS) {
msg_err("failed to allocate vmap\n");
return -ENOMEM;
}
vm->flags = VMAP_FLAG_MMAP;
if ((flags & MAP_ANONYMOUS))
vm->flags |= VMAP_FLAG_ANONYMOUS;
assert(rangemap_insert(p->vmap, &vm->node));
} else if (fixed) {
thread_log(current, " failed to reserve area [%P - %P] in id heap",
where, end);
return -ENOMEM;
} else {
target = 0; /* allocate */
}
}
}
}
}
@@ -233,19 +311,20 @@ static sysreturn mmap(void *target, u64 size, int prot, int flags, int fd, u64 o
deallocate_u64(vh, where, maplen);
return -ENOMEM;
}
vm->flags = VMAP_FLAG_MMAP;
if ((flags & MAP_ANONYMOUS))
vm->flags |= VMAP_FLAG_ANONYMOUS;
assert(rangemap_insert(p->vmap, &vm->node));
}

// make a generic zero page function
if (flags & MAP_ANONYMOUS) {
u64 m = allocate_u64(physical, len);
if (m == INVALID_PHYSICAL) {
msg_err("failed to allocate physical memory, size %d\n", len);
return -ENOMEM;
if (mapped) {
/* just zero */
zero(pointer_from_u64(where), len);
} else {
thread_log(current, " anon nomap target: %P, len: %P (given size: %P)", where, len, size);
}
map(where, m, len, pages);
thread_log(current, " anon target: %P, len: %P (given size: %P)", where, len, size);
zero(pointer_from_u64(where), len);
return where;
}

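The physical page free list mentioned in the unix_fault_page comment is still marked XXX in this commit. A rough sketch of what a single-page cache in front of the physical id heap could look like; the names page_cache, page_cache_alloc and page_cache_free are hypothetical, and a real version would need the interrupt-level locking called out above:

    /* hypothetical single-page cache; not part of this commit */
    #define PAGE_CACHE_SLOTS 64
    static u64 page_cache[PAGE_CACHE_SLOTS];   /* cached physical page addresses */
    static int page_cache_count;               /* needs interrupt-safe protection */

    static u64 page_cache_alloc(heap physical)
    {
        if (page_cache_count > 0)
            return page_cache[--page_cache_count];
        return allocate_u64(physical, PAGESIZE);    /* fall back to the id heap */
    }

    static void page_cache_free(heap physical, u64 paddr)
    {
        /* id deallocations need not match their source allocations, so any
           freed run could be busted into single pages and cached here */
        if (page_cache_count < PAGE_CACHE_SLOTS)
            page_cache[page_cache_count++] = paddr;
        else
            deallocate_u64(physical, paddr, PAGESIZE);
    }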
27 changes: 24 additions & 3 deletions src/unix/syscall.c
@@ -1222,7 +1222,10 @@ static sysreturn brk(void *x)
} else {
// I guess assuming we're aligned
u64 alloc = pad(u64_from_pointer(x), PAGESIZE) - pad(u64_from_pointer(p->brk), PAGESIZE);
map(u64_from_pointer(p->brk), allocate_u64(heap_physical(kh), alloc), alloc, heap_pages(kh));
u64 phys = allocate_u64(heap_physical(kh), alloc);
if (phys == INVALID_PHYSICAL)
return -ENOMEM;
map(u64_from_pointer(p->brk), phys, alloc, heap_pages(kh));
// people shouldn't depend on this
zero(p->brk, alloc);
p->brk += alloc;
@@ -1381,6 +1384,13 @@ sysreturn setgid(gid_t gid)
return 0; /* stub */
}

sysreturn prctl(int option, u64 arg2, u64 arg3, u64 arg4, u64 arg5)
{
thread_log(current, "prctl: option %d, arg2 0x%P, arg3 0x%P, arg4 0x%P, arg5 0x%P",
option, arg2, arg3, arg4, arg5);
return 0;
}

void register_file_syscalls(void **map)
{
register_syscall(map, SYS_read, read);
@@ -1430,6 +1440,7 @@ void register_file_syscalls(void **map)
register_syscall(map, SYS_setgroups, setgroups);
register_syscall(map, SYS_setuid, setuid);
register_syscall(map, SYS_setgid, setgid);
register_syscall(map, SYS_prctl, prctl);
}

void *linux_syscalls[SYS_MAX];
@@ -1447,20 +1458,28 @@ buffer install_syscall(heap h)
return b;
}

static context syscall_frame;

extern char *syscall_name(int);
static void syscall_debug()
{
u64 *f = current->frame;
int call = f[FRAME_VECTOR];
void *debugsyscalls = table_find(current->p->process_root, sym(debugsyscalls));
if(debugsyscalls)
if (debugsyscalls)
thread_log(current, syscall_name(call));
sysreturn (*h)(u64, u64, u64, u64, u64, u64) = current->p->syscall_handlers[call];
sysreturn res = -ENOSYS;
if (h) {
/* exchange frames so that a fault won't clobber the syscall
context, but retain the fault handler that has current enclosed */
context saveframe = running_frame;
running_frame = syscall_frame;
running_frame[FRAME_FAULT_HANDLER] = f[FRAME_FAULT_HANDLER];
res = h(f[FRAME_RDI], f[FRAME_RSI], f[FRAME_RDX], f[FRAME_R10], f[FRAME_R8], f[FRAME_R9]);
if (debugsyscalls)
thread_log(current, "direct return: %d", res);
thread_log(current, "direct return: %d, rsp 0x%P", res, f[FRAME_RSP]);
running_frame = saveframe;
} else if (debugsyscalls) {
thread_log(current, "nosyscall %s", syscall_name(call));
}
@@ -1475,5 +1494,7 @@ void init_syscalls()
{
//syscall = b->contents;
// debug the synthesized version later, at least we have the table dispatch
heap h = heap_general(get_kernel_heaps());
syscall = syscall_debug;
syscall_frame = allocate_frame(h);
}
4 changes: 2 additions & 2 deletions src/unix/thread.c
@@ -274,8 +274,8 @@ void run_thread(thread t)
{
current = t;
thread_log(t, "run frame %p, RIP=%p", t->frame, t->frame[FRAME_RIP]);
frame = t->frame;
IRETURN(frame);
running_frame = t->frame;
IRETURN(running_frame);
}

// it might be easier, if a little skeezy, to use the return value
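From the guest program's side the change is invisible: an anonymous mapping now only reserves address space and records a vmap at mmap time, and physical pages are supplied by unix_fault_page on first touch. A plain POSIX example of the path that now goes through the fault handler (nothing here is specific to this commit):

    /* ordinary guest code; the first store to each page now triggers a demand-paging fault */
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 16 * 4096;
        char *p = mmap(0, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return 1;
        memset(p, 0xa5, len);   /* each page is faulted in and zeroed by the kernel first */
        munmap(p, len);
        return 0;
    }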