diff --git a/src/runtime/buffer.h b/src/runtime/buffer.h index 503639144..e543e4b69 100644 --- a/src/runtime/buffer.h +++ b/src/runtime/buffer.h @@ -64,7 +64,8 @@ static inline void buffer_extend(buffer b, bytes len) assert(!b->wrapped); /* wrapped buffers can't be extended */ int oldlen = b->length; b->length = 2*((b->end-b->start)+len); - void *new = allocate(b->h, b->length); + void *new = allocate(b->h, b->length); + assert(new != INVALID_ADDRESS); runtime_memcpy(new, b->contents + b->start, (b->end-b->start)); deallocate(b->h, b->contents, oldlen); b->end = b->end - b->start; @@ -91,6 +92,7 @@ static inline buffer wrap_buffer(heap h, bytes length) { buffer new = allocate(h, sizeof(struct buffer)); + assert(new != INVALID_ADDRESS); new->contents = body; new->start = 0; new->h = h; diff --git a/src/runtime/heap/heap.h b/src/runtime/heap/heap.h index 2ef052b7d..4a8568c83 100644 --- a/src/runtime/heap/heap.h +++ b/src/runtime/heap/heap.h @@ -45,7 +45,7 @@ static inline int subdivide(int quantum, int per, int s, int o) #define allocate_zero(__h, __b) ({\ u64 __len = __b;\ void *x = allocate(__h, __len); \ - zero(x, __len); \ + if (x != INVALID_ADDRESS) zero(x, __len); \ x; }) static inline void leak(heap h, u64 x, bytes length) diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index bafedf2cc..faba75c05 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -5,6 +5,9 @@ typedef u32 character; #define true 1 #define false 0 +#define infinity (-1ull) +#define INVALID_PHYSICAL ((u64)infinity) +#define INVALID_ADDRESS ((void *)infinity) typedef u64 timestamp; @@ -112,12 +115,6 @@ physical vtop(void *x); physical physical_from_virtual(void *x); #endif -#define infinity (-1ull) - -#define INVALID_PHYSICAL ((u64)0xffffffffffffffff) - -#define INVALID_ADDRESS ((void *)0xffffffffffffffffull) - heap zero_wrap(heap meta, heap parent); boolean validate_virtual(void *base, u64 length); diff --git a/src/unix/mmap.c b/src/unix/mmap.c index 46e87c4b6..2913d7fd0 100644 --- a/src/unix/mmap.c +++ b/src/unix/mmap.c @@ -1,11 +1,16 @@ #include +#define VMAP_FLAG_MMAP 1 +#define VMAP_FLAG_ANONYMOUS 2 + typedef struct vmap { struct rmnode node; - /* oh, what we could do here */ + heap vheap; /* presently either p->virtual or p->virtual32 */ + u32 flags; + u32 prot; } *vmap; -static inline vmap allocate_vmap(heap h, range r) +static vmap allocate_vmap(heap h, range r) { vmap vm = allocate(h, sizeof(struct vmap)); if (vm == INVALID_ADDRESS) @@ -14,6 +19,70 @@ static inline vmap allocate_vmap(heap h, range r) return vm; } +/* Page faults may be caused by: + + - user program instructions + + - syscall top halves accessing unmapped anonymous pages + + - syscall bottom halves accessing unmapped anonymous pages (while + in the interrupt handler!) 
+ + - can consider manually faulting in user pages in top half to + avoid faults from interrupt handler; this is debatable + + Therefore: + + - allocating a physical page must be fast and safe at interrupt + level + + - as elsewhere in the kernel, if/when we move from the bifurcated + runqueue / interrupt processing scheme, we need to put the + proper locks in place + + - we can easily build a free list of physical pages + + - also note that, given that id deallocations don't need to + match their source allocations, we can take any size + deallocation and bust it up into single pages to cache + + - map() needs to be safe at interrupt and non-interrupt levels + + - the page fault handler runs on its own stack (set as IST0 in + TSS), given that the user stack may live on an anonymous mapping + and need to have pages faulted in on its own behalf - otherwise + we eventually wind up with a triple fault as the CPU cannot push + onto the stack when invoking the exception handler +*/ + +boolean unix_fault_page(u64 vaddr) +{ + process p = current->p; + kernel_heaps kh = get_kernel_heaps(); + vmap vm; + + if ((vm = (vmap)rangemap_lookup(p->vmap, vaddr)) != INVALID_ADDRESS) { + u32 flags = VMAP_FLAG_MMAP | VMAP_FLAG_ANONYMOUS; + if ((vm->flags & flags) != flags) { + msg_err("vaddr 0x%P matched vmap with invalid flags (0x%P)\n", vaddr, vm->flags); + return false; + } + + /* XXX make free list */ + u64 paddr = allocate_u64(heap_physical(kh), PAGESIZE); + if (paddr == INVALID_PHYSICAL) { + msg_err("cannot get physical page; OOM\n"); + return false; + } + u64 vaddr_aligned = vaddr & ~MASK(PAGELOG); + map(vaddr_aligned, paddr, PAGESIZE, heap_pages(kh)); + zero(pointer_from_u64(vaddr_aligned), PAGESIZE); + return true; + } + msg_err("no vmap found for vaddr 0x%P\n", vaddr); + return false; +} + sysreturn mremap(void *old_address, u64 old_size, u64 new_size, int flags, void * new_address) { kernel_heaps kh = get_kernel_heaps(); @@ -139,72 +208,81 @@ static sysreturn mmap(void *target, u64 size, int prot, int flags, int fd, u64 o process p = current->p; kernel_heaps kh = get_kernel_heaps(); heap h = heap_general(kh); - heap pages = heap_pages(kh); - heap physical = heap_physical(kh); // its really unclear whether this should be extended or truncated u64 len = pad(size, PAGESIZE) & MASK(32); u64 where = u64_from_pointer(target); u64 end = where + size - 1; boolean fixed = (flags & MAP_FIXED) != 0; + boolean mapped = false; thread_log(current, "mmap: target %p, size %P, prot %P, flags %P, fd %d, offset %P", target, size, prot, flags, fd, offset); - if (target) { + if (where) { thread_log(current, " %s at %P", fixed ? "fixed" : "hint", where); - /* 32 bit mode is ignored if MAP_FIXED */ - heap vh = p->virtual; - if (where < HUGE_PAGESIZE && end < HUGE_PAGESIZE) { - /* bound by kernel and zero page. */ - if (where >= PROCESS_VIRTUAL_32_HEAP_START || end <= PROCESS_VIRTUAL_32_HEAP_END) { - /* Attempt to reserve low memory fixed mappings in - virtual32 to avoid collisions in any future low mem - allocation. Don't fail if we can't reserve or it's - already reserved. 
*/ - id_heap_reserve(p->virtual32, where, size); - } else if (fixed) { - thread_log(current, " map [%P - %P] outside of valid 32-bit range [%P - %P]", - where, end, PROCESS_VIRTUAL_32_HEAP_START, PROCESS_VIRTUAL_32_HEAP_END); - return -ENOMEM; - } else { - target = 0; /* allocate */ - } + vmap vmap_start = (vmap)rangemap_lookup(p->vmap, where); + vmap vmap_end = (vmap)rangemap_lookup(p->vmap, end); + if (vmap_start != INVALID_ADDRESS && + vmap_end == vmap_start && + (vmap_start->flags & VMAP_FLAG_ANONYMOUS) == 0) { + mapped = true; } else { - if (where < PROCESS_VIRTUAL_HEAP_START || end > PROCESS_VIRTUAL_HEAP_END) { - /* Try to allow outside our process virtual space, as - long as we can block it out in virtual_huge. */ - vh = heap_virtual_huge(kh); - } - - /* XXX range lookup in rtrie is broke, do manually until - fixed... note that this check could pass even if start and - end lie in two different mmapped areas. No matter, as we - just need to verify that this overlapping map lies in a - huge page that we're already using...the overlapping mmap - lawlessness is to be tolerated for the moment. - - This is like a really crude start to vm tracking... - */ - if (rangemap_lookup(p->vmap, where) == INVALID_ADDRESS || - rangemap_lookup(p->vmap, end) == INVALID_ADDRESS) { - u64 mapstart = where & ~(HUGE_PAGESIZE - 1); - u64 mapend = pad(end, HUGE_PAGESIZE); - u64 maplen = mapend - mapstart + 1; - - if (id_heap_reserve(vh, mapstart, maplen)) { - vmap vm = allocate_vmap(h, irange(mapstart, mapstart + maplen)); - if (vm == INVALID_ADDRESS) { - msg_err("failed to allocate vmap\n"); - return -ENOMEM; - } - assert(rangemap_insert(p->vmap, &vm->node)); + /* 32 bit mode is ignored if MAP_FIXED */ + heap vh = p->virtual; + if (where < HUGE_PAGESIZE && end < HUGE_PAGESIZE) { + /* bound by kernel and zero page. */ + if (where >= PROCESS_VIRTUAL_32_HEAP_START || end <= PROCESS_VIRTUAL_32_HEAP_END) { + /* Attempt to reserve low memory fixed mappings in + virtual32 to avoid collisions in any future low mem + allocation. Don't fail if we can't reserve or it's + already reserved. */ + id_heap_reserve(p->virtual32, where, size); } else if (fixed) { - thread_log(current, " failed to reserve area [%P - %P] in id heap", - where, end); + thread_log(current, " map [%P - %P] outside of valid 32-bit range [%P - %P]", + where, end, PROCESS_VIRTUAL_32_HEAP_START, PROCESS_VIRTUAL_32_HEAP_END); return -ENOMEM; } else { target = 0; /* allocate */ } + } else { + if (where < PROCESS_VIRTUAL_HEAP_START || end > PROCESS_VIRTUAL_HEAP_END) { + /* Try to allow outside our process virtual space, as + long as we can block it out in virtual_huge. */ + vh = heap_virtual_huge(kh); + } + + /* XXX range lookup in rtrie is broke, do manually until + fixed... note that this check could pass even if start and + end lie in two different mmapped areas. No matter, as we + just need to verify that this overlapping map lies in a + huge page that we're already using...the overlapping mmap + lawlessness is to be tolerated for the moment. + + This is like a really crude start to vm tracking... 
+ */ + if (vmap_start == INVALID_ADDRESS || vmap_end == INVALID_ADDRESS) { + u64 mapstart = where & ~(HUGE_PAGESIZE - 1); + u64 mapend = pad(end, HUGE_PAGESIZE); + u64 maplen = mapend - mapstart + 1; + + if (id_heap_reserve(vh, mapstart, maplen)) { + vmap vm = allocate_vmap(h, irange(mapstart, mapstart + maplen)); + if (vm == INVALID_ADDRESS) { + msg_err("failed to allocate vmap\n"); + return -ENOMEM; + } + vm->flags = VMAP_FLAG_MMAP; + if ((flags & MAP_ANONYMOUS)) + vm->flags |= VMAP_FLAG_ANONYMOUS; + assert(rangemap_insert(p->vmap, &vm->node)); + } else if (fixed) { + thread_log(current, " failed to reserve area [%P - %P] in id heap", + where, end); + return -ENOMEM; + } else { + target = 0; /* allocate */ + } + } } } } @@ -233,19 +311,20 @@ static sysreturn mmap(void *target, u64 size, int prot, int flags, int fd, u64 o deallocate_u64(vh, where, maplen); return -ENOMEM; } + vm->flags = VMAP_FLAG_MMAP; + if ((flags & MAP_ANONYMOUS)) + vm->flags |= VMAP_FLAG_ANONYMOUS; assert(rangemap_insert(p->vmap, &vm->node)); } // make a generic zero page function if (flags & MAP_ANONYMOUS) { - u64 m = allocate_u64(physical, len); - if (m == INVALID_PHYSICAL) { - msg_err("failed to allocate physical memory, size %d\n", len); - return -ENOMEM; + if (mapped) { + /* just zero */ + zero(pointer_from_u64(where), len); + } else { + thread_log(current, " anon nomap target: %P, len: %P (given size: %P)", where, len, size); } - map(where, m, len, pages); - thread_log(current, " anon target: %P, len: %P (given size: %P)", where, len, size); - zero(pointer_from_u64(where), len); return where; } diff --git a/src/unix/syscall.c b/src/unix/syscall.c index c0221d8aa..228848e3a 100644 --- a/src/unix/syscall.c +++ b/src/unix/syscall.c @@ -1222,7 +1222,10 @@ static sysreturn brk(void *x) } else { // I guess assuming we're aligned u64 alloc = pad(u64_from_pointer(x), PAGESIZE) - pad(u64_from_pointer(p->brk), PAGESIZE); - map(u64_from_pointer(p->brk), allocate_u64(heap_physical(kh), alloc), alloc, heap_pages(kh)); + u64 phys = allocate_u64(heap_physical(kh), alloc); + if (phys == INVALID_PHYSICAL) + return -ENOMEM; + map(u64_from_pointer(p->brk), phys, alloc, heap_pages(kh)); // people shouldn't depend on this zero(p->brk, alloc); p->brk += alloc; @@ -1381,6 +1384,13 @@ sysreturn setgid(gid_t gid) return 0; /* stub */ } +sysreturn prctl(int option, u64 arg2, u64 arg3, u64 arg4, u64 arg5) +{ + thread_log(current, "prctl: option %d, arg2 0x%P, arg3 0x%P, arg4 0x%P, arg5 0x%P", + option, arg2, arg3, arg4, arg5); + return 0; +} + void register_file_syscalls(void **map) { register_syscall(map, SYS_read, read); @@ -1430,6 +1440,7 @@ void register_file_syscalls(void **map) register_syscall(map, SYS_setgroups, setgroups); register_syscall(map, SYS_setuid, setuid); register_syscall(map, SYS_setgid, setgid); + register_syscall(map, SYS_prctl, prctl); } void *linux_syscalls[SYS_MAX]; @@ -1447,20 +1458,28 @@ buffer install_syscall(heap h) return b; } +static context syscall_frame; + extern char *syscall_name(int); static void syscall_debug() { u64 *f = current->frame; int call = f[FRAME_VECTOR]; void *debugsyscalls = table_find(current->p->process_root, sym(debugsyscalls)); - if(debugsyscalls) + if (debugsyscalls) thread_log(current, syscall_name(call)); sysreturn (*h)(u64, u64, u64, u64, u64, u64) = current->p->syscall_handlers[call]; sysreturn res = -ENOSYS; if (h) { + /* exchange frames so that a fault won't clobber the syscall + context, but retain the fault handler that has current enclosed */ + context saveframe = 
running_frame; + running_frame = syscall_frame; + running_frame[FRAME_FAULT_HANDLER] = f[FRAME_FAULT_HANDLER]; res = h(f[FRAME_RDI], f[FRAME_RSI], f[FRAME_RDX], f[FRAME_R10], f[FRAME_R8], f[FRAME_R9]); if (debugsyscalls) - thread_log(current, "direct return: %d", res); + thread_log(current, "direct return: %d, rsp 0x%P", res, f[FRAME_RSP]); + running_frame = saveframe; } else if (debugsyscalls) { thread_log(current, "nosyscall %s", syscall_name(call)); } @@ -1475,5 +1494,7 @@ void init_syscalls() { //syscall = b->contents; // debug the synthesized version later, at least we have the table dispatch + heap h = heap_general(get_kernel_heaps()); syscall = syscall_debug; + syscall_frame = allocate_frame(h); } diff --git a/src/unix/thread.c b/src/unix/thread.c index 6d3efa798..adc29fc0b 100644 --- a/src/unix/thread.c +++ b/src/unix/thread.c @@ -274,8 +274,8 @@ void run_thread(thread t) { current = t; thread_log(t, "run frame %p, RIP=%p", t->frame, t->frame[FRAME_RIP]); - frame = t->frame; - IRETURN(frame); + running_frame = t->frame; + IRETURN(running_frame); } // it might be easier, if a little skeezy, to use the return value diff --git a/src/unix/unix.c b/src/unix/unix.c index 17cc02a29..ab1797a11 100644 --- a/src/unix/unix.c +++ b/src/unix/unix.c @@ -30,17 +30,35 @@ void deallocate_fd(process p, int fd) deallocate_u64(p->fdallocator, fd, 1); } +CLOSURE_1_1(default_fault_handler, context, thread, context); context default_fault_handler(thread t, context frame) { - print_frame(t->frame); - print_stack(t->frame); + /* frame can be: + - t->frame if user or syscall + - miscframe in interrupt level + */ + if (frame[FRAME_VECTOR] == 14) { + /* XXX move this to x86_64 */ + u64 fault_address; + mov_from_cr("cr2", fault_address); + if (unix_fault_page(fault_address)) + return frame; + } + + console("Unhandled: "); + print_u64(frame[FRAME_VECTOR]); + console("\n"); + print_frame(frame); + print_stack(frame); - if (table_find (t->p->process_root, sym(fault))) { + if (table_find (current->p->process_root, sym(fault))) { console("starting gdb\n"); - init_tcp_gdb(heap_general(get_kernel_heaps()), t->p, 9090); + init_tcp_gdb(heap_general(get_kernel_heaps()), current->p, 9090); thread_sleep(current); + } else { + halt("halt\n"); } - halt(""); + return frame; } static CLOSURE_0_3(dummy_read, sysreturn, void *, u64, u64); @@ -154,7 +172,20 @@ process init_unix(kernel_heaps kh, tuple root, filesystem fs) set_syscall_handler(syscall_enter); process kernel_process = create_process(uh, root, fs); current = create_thread(kernel_process); - frame = current->frame; + running_frame = current->frame; + + /* Install a fault handler for use when anonymous pages are + faulted in within the interrupt handler (e.g. syscall bottom + halves, I/O directly to user buffers). This is permissible now + because we only support one process address space. Should this + ever change, this will need to be reworked; either we make + faults from the interrupt handler illegal or store a reference + to the relevant thread frame upon entering the bottom half + routine. 
+ */ + fault_handler fallback_handler = closure(h, default_fault_handler, current); + install_fallback_fault_handler(fallback_handler); + init_vdso(heap_physical(kh), heap_pages(kh)); register_special_files(kernel_process); init_syscalls(); diff --git a/src/unix/unix_internal.h b/src/unix/unix_internal.h index 08467708f..812b58a59 100644 --- a/src/unix/unix_internal.h +++ b/src/unix/unix_internal.h @@ -220,6 +220,8 @@ boolean pipe_init(unix_heaps uh); extern sysreturn syscall_ignore(); context default_fault_handler(thread t, context frame); +boolean unix_fault_page(u64 vaddr); + void thread_log_internal(thread t, char *desc, ...); #define thread_log(__t, __desc, ...) thread_log_internal(__t, __desc, ##__VA_ARGS__) // this should always be current diff --git a/src/x86_64/crt0.s b/src/x86_64/crt0.s index b3ffd1f23..202c8fd8b 100644 --- a/src/x86_64/crt0.s +++ b/src/x86_64/crt0.s @@ -10,7 +10,7 @@ global_func _start extern init_service -extern frame +extern running_frame %include "frame_nasm.h" %define FS_MSR 0xc0000100 @@ -51,7 +51,7 @@ extern syscall global_func syscall_enter syscall_enter: push rax - mov rax, [frame] + mov rax, [running_frame] mov [rax+FRAME_RBX*8], rbx pop rbx mov [rax+FRAME_VECTOR*8], rbx @@ -72,16 +72,14 @@ syscall_enter: mov rax, syscall mov rax, [rax] call rax - mov rbx, [frame] -# no more implicit syscall return here -# mov [rbx + FRAME_RAX], rax + mov rbx, [running_frame] jmp frame_enter .end: extern common_handler interrupt_common: push rax - mov rax, [frame] + mov rax, [running_frame] mov [rax+FRAME_RBX*8], rbx mov [rax+FRAME_RCX*8], rcx mov [rax+FRAME_RDX*8], rdx @@ -122,7 +120,7 @@ getrip: ;; could always use iret? global_func frame_return frame_return: - mov rbx, [frame] + mov rbx, [running_frame] mov rax, [rbx+FRAME_FS*8] mov rcx, FS_MSR @@ -238,9 +236,18 @@ _start: hlt .end: -global_func move_gdt -move_gdt: +global_func install_gdt64_and_tss +install_gdt64_and_tss: lgdt [GDT64.Pointer] + mov rax, TSS + mov [GDT64 + GDT64.TSS + 2], ax + shr rax, 0x10 + mov [GDT64 + GDT64.TSS + 4], al + mov [GDT64 + GDT64.TSS + 7], ah + shr rax, 0x10 + mov [GDT64 + GDT64.TSS + 8], eax + mov rax, GDT64.TSS + ltr ax ret .end: @@ -276,7 +283,47 @@ GDT64: ; Global Descriptor Table (64-bit). db 10010010b ; Access (read/write). db 00000000b ; Granularity. db 0 ; Base (high). - + .TSS: equ $ - GDT64 ; TSS descriptor (system segment descriptor - 64bit mode) + dw (TSS.end - TSS) ; Limit (low) + dw 0 ; Base [15:0] [fill in base at runtime, for I lack nasm sauce] + db 0 ; Base [23:16] + db 10001001b ; Present, long mode type available TSS + db 00000000b ; byte granularity + db 0 ; Base [31:24] + dd 0 ; Base [63:32] + dd 0 ; Reserved .Pointer: ; The GDT-pointer. dw $ - GDT64 - 1 ; Limit. dq GDT64 ; 64 bit Base. + + align 16 ; XXX ?? 
+global_data TSS +TSS: ; 64 bit TSS + dd 0 ; reserved 0x00 + dd 0 ; RSP0 (low) 0x04 + dd 0 ; RSP0 (high) 0x08 + dd 0 ; RSP1 (low) 0x0c + dd 0 ; RSP1 (high) 0x10 + dd 0 ; RSP2 (low) 0x14 + dd 0 ; RSP2 (high) 0x18 + dd 0 ; reserved 0x1c + dd 0 ; reserved 0x20 + dd 0 ; IST1 (low) 0x24 + dd 0 ; IST1 (high) 0x28 + dd 0 ; IST2 (low) 0x2c + dd 0 ; IST2 (high) 0x30 + dd 0 ; IST3 (low) 0x34 + dd 0 ; IST3 (high) 0x38 + dd 0 ; IST4 (low) 0x3c + dd 0 ; IST4 (high) 0x40 + dd 0 ; IST5 (low) 0x44 + dd 0 ; IST5 (high) 0x48 + dd 0 ; IST6 (low) 0x4c + dd 0 ; IST6 (high) 0x50 + dd 0 ; IST7 (low) 0x54 + dd 0 ; IST7 (high) 0x58 + dd 0 ; reserved 0x5c + dd 0 ; reserved 0x60 + dw 0 ; IOPB offset 0x64 + dw 0 ; reserved 0x66 +.end: diff --git a/src/x86_64/interrupt.c b/src/x86_64/interrupt.c index 850403d04..f34c105d7 100644 --- a/src/x86_64/interrupt.c +++ b/src/x86_64/interrupt.c @@ -84,12 +84,11 @@ char *interrupt_name(u64 s) } -void write_idt(u64 *idt, int interrupt, void *hv) +void write_idt(u64 *idt, int interrupt, void *hv, u64 ist) { // huh, idt entries are virtual u64 h = u64_from_pointer(hv); u64 selector = 0x08; - u64 ist = 0; // this is a stask switch through the tss u64 type_attr = 0x8e; u64 *target = (void *)(u64)(idt + 2*interrupt); @@ -128,7 +127,7 @@ char *register_name(u64 s) } static thunk *handlers; -context frame; +context running_frame; void *apic_base = (void *)0xfee00000; @@ -252,24 +251,46 @@ void lapic_eoi() write_barrier(); } +context miscframe; /* for context save on interrupt */ +context intframe; /* for context save on exception within interrupt */ + +void handle_interrupts() +{ + running_frame = miscframe; + enable_interrupts(); + __asm__("hlt"); + disable_interrupts(); +} + +void install_fallback_fault_handler(fault_handler h) +{ + assert(miscframe); + miscframe[FRAME_FAULT_HANDLER] = u64_from_pointer(h); + intframe[FRAME_FAULT_HANDLER] = u64_from_pointer(h); +} + void common_handler() { - int i = frame[FRAME_VECTOR]; + int i = running_frame[FRAME_VECTOR]; if ((i < interrupt_size) && handlers[i]) { - // should we switch to the 'kernel process'? 
+ context saveframe = running_frame; + running_frame = intframe; apply(handlers[i]); lapic_eoi(); + running_frame = saveframe; } else { - fault_handler f = pointer_from_u64(frame[FRAME_FAULT_HANDLER]); + fault_handler f = pointer_from_u64(running_frame[FRAME_FAULT_HANDLER]); if (f == 0) { rprintf ("no fault handler\n"); - print_frame(frame); - print_stack(frame); + print_frame(running_frame); + print_stack(running_frame); vm_exit(VM_EXIT_FAULT); } - if (i < 25) frame = apply(f, frame); + if (i < 25) { + running_frame = apply(f, running_frame); + } } } @@ -345,7 +366,29 @@ void configure_lapic_timer(heap h) } extern u32 interrupt_size; - + +#define FAULT_STACK_PAGES 8 + +extern volatile void * TSS; +static inline void write_tss_u64(int offset, u64 val) +{ + u64 * vec = (u64 *)(u64_from_pointer(&TSS) + offset); + *vec = val; +} + +static void set_ist(int i, u64 sp) +{ + assert(i > 0 && i <= 7); + write_tss_u64(0x24 + (i - 1) * 8, sp); +} + +context allocate_frame(heap h) +{ + context f = allocate_zero(h, FRAME_MAX * sizeof(u64)); + assert(f != INVALID_ADDRESS); + return f; +} + void start_interrupts(kernel_heaps kh) { // these are simple enough it would be better to just @@ -354,16 +397,28 @@ void start_interrupts(kernel_heaps kh) void *start = &interrupt0; heap general = heap_general(kh); heap pages = heap_pages(kh); + + /* exception handlers */ handlers = allocate_zero(general, interrupt_size * sizeof(thunk)); + assert(handlers != INVALID_ADDRESS); + + /* alternate frame storage */ + miscframe = allocate_frame(general); + intframe = allocate_frame(general); + + /* TSS is installed at the end of stage3 runtime initialization */ + void *faultstack = allocate_zero(pages, pages->pagesize * FAULT_STACK_PAGES); + u64 fs_top = (u64)faultstack + pages->pagesize * FAULT_STACK_PAGES - sizeof(u64); + set_ist(1, fs_top); + // architectural - end of exceptions u32 vector_start = 0x20; interrupt_vectors = create_id_heap(general, vector_start, interrupt_size - vector_start, 1); // assuming contig gives us a page aligned, page padded identity map idt = allocate(pages, pages->pagesize); - frame = allocate(pages, pages->pagesize); for (int i = 0; i < interrupt_size; i++) - write_idt(idt, i, start + i * delta); + write_idt(idt, i, start + i * delta, i == 0xe ? 1 : 0); u16 *dest = (u16 *)(idt + 2*interrupt_size); dest[0] = 16*interrupt_size -1; diff --git a/src/x86_64/page.c b/src/x86_64/page.c index 4929f18f1..f82b224b3 100644 --- a/src/x86_64/page.c +++ b/src/x86_64/page.c @@ -63,6 +63,7 @@ physical physical_from_virtual(void *x) if (!l1) return INVALID_PHYSICAL; if (l2[pindex(xt, PT3)] & PAGE_2M_SIZE) return ((u64)l1 | (xt & MASK(PT3))); u64 *l0 = pt_lookup(l1, xt, PT4); + if (!l0) return INVALID_PHYSICAL; return (u64)l0 | (xt & MASK(PT4)); } #endif @@ -254,7 +255,7 @@ static void map_range(u64 virtual, physical p, int length, u64 flags, heap h) if (flags == 0) console("unmap: area missing page mappings\n"); else - halt("map: ran out of page table memory"); + halt("map: ran out of page table memory\n"); } int off = 1ull << (fat ? 
PT3 : PT4); vo += off; diff --git a/src/x86_64/service.c b/src/x86_64/service.c index 3f0f8dca7..0dc6964be 100644 --- a/src/x86_64/service.c +++ b/src/x86_64/service.c @@ -42,8 +42,6 @@ static u64 bootstrap_alloc(heap h, bytes length) queue runqueue; -static context miscframe; - void runloop() { /* minimum runloop period - XXX move to a config header */ @@ -56,10 +54,7 @@ void runloop() while((t = dequeue(runqueue))) { apply(t); } - frame = miscframe; - enable_interrupts(); - __asm__("hlt"); - disable_interrupts(); + handle_interrupts(); } } @@ -133,17 +128,13 @@ static void read_kernel_syms() } } -extern void move_gdt(); +extern void install_gdt64_and_tss(); static void __attribute__((noinline)) init_service_new_stack() { kernel_heaps kh = &heaps; heap misc = heap_general(kh); heap pages = heap_pages(kh); - //heap virtual_huge = heap_virtual_huge(kh); - //heap virtual_page = heap_virtual_page(kh); - //heap physical = heap_physical(kh); - //heap backed = heap_backed(kh); /* Unmap the first page so we catch faults on null pointer references. */ unmap(0, PAGESIZE, pages); @@ -168,13 +159,10 @@ static void __attribute__((noinline)) init_service_new_stack() halt("filesystem region not found; halt\n"); init_virtio_storage(kh, closure(misc, attach_storage, root, fs_offset)); init_virtio_network(kh); - miscframe = allocate(misc, FRAME_MAX * sizeof(u64)); pci_discover(); - /* Switch gdt to kernel space and free up initial mapping, but - only after we're done with regions and anything else in that - space. */ - move_gdt(); + /* Switch to stage3 GDT64, enable TSS and free up initial map */ + install_gdt64_and_tss(); unmap(PAGESIZE, INITIAL_MAP_SIZE - PAGESIZE, pages); runloop(); diff --git a/src/x86_64/x86_64.h b/src/x86_64/x86_64.h index a22a8b9a5..1e0dff8cd 100644 --- a/src/x86_64/x86_64.h +++ b/src/x86_64/x86_64.h @@ -67,7 +67,7 @@ static inline u64 lsb(u64 x) typedef u64 *context; -extern u64 *frame; +extern context running_frame; #define BREAKPOINT_INSTRUCTION 00 #define BREAKPOINT_WRITE 01 @@ -204,7 +204,12 @@ void *queue_peek(queue q); int queue_length(queue q); queue allocate_queue(heap h, u64 size); void deallocate_queue(queue q); + +context allocate_frame(heap h); void runloop() __attribute__((noreturn)); +void handle_interrupts(); +void install_fallback_fault_handler(fault_handler h); + void map(u64 virtual, physical p, int length, heap h); void unmap(u64 virtual, int length, heap h);
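
Editor's note (not part of the patch): the design comment added to src/unix/mmap.c argues that physical page allocation must be fast and safe at interrupt level and proposes a free list of single physical pages, but the code still carries an "XXX make free list" marker and calls allocate_u64(heap_physical(kh), PAGESIZE) directly in unix_fault_page(). The sketch below is only an illustration of that idea, assuming the runtime definitions this patch already uses (heap, u64, bytes, PAGESIZE, allocate_u64(), deallocate_u64() from src/runtime); the type and function names (page_free_list, page_cache_alloc, page_cache_free) are invented here for illustration and do not exist in the tree.

    /* Hypothetical sketch: a single-page free list layered over the
       physical heap, for fast allocation from the page fault path.
       Not part of this patch. */
    typedef struct page_free_list {
        heap parent;    /* backing allocator, e.g. heap_physical(kh) */
        u64 *pages;     /* stack of cached page-aligned physical addresses */
        int count;
        int capacity;
    } *page_free_list;

    /* Fast path: pop a cached page; fall back to the physical heap if the
       cache is empty. Returns INVALID_PHYSICAL on allocation failure, as
       allocate_u64() does. */
    static u64 page_cache_alloc(page_free_list fl)
    {
        if (fl->count > 0)
            return fl->pages[--fl->count];
        return allocate_u64(fl->parent, PAGESIZE);
    }

    /* As the mmap.c comment notes, id heap deallocations need not match
       their source allocations, so any returned range can be busted up
       into single pages and cached; overflow goes back to the parent. */
    static void page_cache_free(page_free_list fl, u64 base, bytes length)
    {
        for (u64 p = base; p < base + length; p += PAGESIZE) {
            if (fl->count < fl->capacity)
                fl->pages[fl->count++] = p;
            else
                deallocate_u64(fl->parent, p, PAGESIZE);
        }
    }

A real version would also need the interrupt-level locking the same comment anticipates if the kernel ever moves away from the bifurcated runqueue/interrupt processing scheme; as written, the sketch assumes the single-consumer conditions described there.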