mmap
实现mmap系统调用,将文件内容映射到进程虚拟地址空间,便于共享内存。
预备知识
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
mmap系统调用:将fd指向的文件数据(从offset开始)映射到虚拟内存中从addr开始、最长为length的区域上。本实验中addr = 0,即由kernel来选择用于映射文件的虚拟内存位置,mmap返回这个地址。prot指出这段内存是否可读、可写、可执行;flags指出对这段内存的修改是否需要被写回文件(MAP_SHARED需要写回,MAP_PRIVATE不需要写回);offset在本实验中为0。
lazy load page: 在缺页trap中分配物理页,向页表中加入虚拟地址到物理页的映射
file inode记录的是该文件在磁盘上的位置(块),通过readi将inode的块读到内存,readi调用either_copyout复制块的内容到内存 either_copyout将内核读取的块内容复制到用户空间,在用户进程页表中增加一个虚拟地址到这个物理地址的映射
实现流程
在makefile中加入mmaptest 增加系统调用mmap munmap
在proc.h中定义VMA结构体,让每个进程保存被映射的地址记录
// One mmap-ed region (VMA) of a process: which virtual address range is
// mapped, with what protection and flags, and which open file backs it.
struct vm_area_struct {
int valid; // 1 while this slot describes a live mapping, 0 when the slot is free
uint64 addr; // starting mapped virtual address
uint64 length; // mapped length in bytes, as requested by mmap
int prot; // permission (PROT_* bits passed to mmap)
int flags; // whether to write back (MAP_SHARED) or not (MAP_PRIVATE)
struct file *filep; // the file structure backing the mapping; sys_mmap takes a reference with filedup
int fd; // file descriptor that was passed to mmap
};
// Number of VMA slots in each process (size of the per-process VMA array).
#define VMASIZE 16
// Per-process state (excerpt: only the fields added for mmap are shown,
// the remaining fields are elided below).
struct proc {
struct spinlock lock;
// ......
// mapped files
struct vm_area_struct VMA[VMASIZE]; // table of this process's mmap-ed regions
uint64 cur_max; // the current max address for mapping; sys_mmap places each new region just below it and then lowers it
};
难点:如何选择在地址空间中的哪里存储被映射的内存
用户进程地址空间
MAXVA -> -------------
| trampoline |
-------------
| trapframe |
--------------
| |
max addr -> --------------
of mmap | |
| |
| heap |
| |
--------------
| user stack |
--------------
| user text & data
0 -> --------------
查看了网上的实现方法,非常简单和巧妙的利用了虚拟地址空间,只是多定义了一个cur_max,记录已经被分配出去的顶端地址,之后不断下移这个cur_max,继续分配就行了。在memlayout.h中定义最高的可映射地址位置,所以分配的虚拟地址从这里开始,之后向下生长
// Highest virtual address available to mmap: 10 pages below TRAPFRAME.
// Mapped regions are allocated starting here and grow downward.
#define MMAPMAXADDR (TRAPFRAME - 10 * PGSIZE)
在sysfile.c中实现mmap
uint64
sys_mmap(void)
{
uint64 addr;
int length, prot, flags, fd, offset;
struct file *filep;
if (argaddr(0, &addr) < 0 || argint(1, &length) < 0|| argint(2, &prot) < 0 || argint(3, &flags) < 0 || argfd(4, &fd, &filep)<0 ||
argint(5, &offset) < 0) {
return ~0;
}
if (addr != 0) {
printf("Only support the kernel assigned address.\n");
return ~0;
}
// find empty VMA and the length bigger than required
struct proc *pp = myproc();
if (flags & MAP_SHARED) {
if (!(filep->writable) && (prot & PROT_WRITE)) {
printf("The file is read only, but the mmap prot want to write the memory.\n");
return ~0;
}
}
uint64 curmax = pp->cur_max;
uint64 start_addr = PGROUNDDOWN(curmax - length);
struct vm_area_struct *pvma = 0;
for (int i = 0; i < VMASIZE; ++i) {
if (pp->VMA[i].valid == 0) {
pvma = &pp->VMA[i];
break;
}
}
if (pvma) {
pvma->valid = 1;
pvma->addr = start_addr;
pvma->length = length;
pvma->prot = prot;
pvma->flags = flags;
pvma->fd = fd;
pvma->filep = filep;
filedup(pvma->filep); // increase reference count
pp->cur_max = start_addr;
} else {
return ~0;
}
return start_addr;
}
在trap.c中实现lazy page allocation
// Read up to `size` bytes of file `f`, starting at file offset `off`, into
// the user virtual address `dst_va` (readi is called with user_dst = 1).
// The inode is locked for the duration of the read.
// Returns `off` plus the value returned by readi.
int
mmap_read(struct file *f, uint64 dst_va, int off, int size)
{
  struct inode *ip = f->ip;
  int nread;

  ilock(ip);
  nread = readi(ip, 1, dst_va, off, size);
  iunlock(ip);

  return off + nread;
}
void
usertrap(void)
{
int which_dev = 0;
if((r_sstatus() & SSTATUS_SPP) != 0)
panic("usertrap: not from user mode");
// send interrupts and exceptions to kerneltrap(),
// since we're now in the kernel.
w_stvec((uint64)kernelvec);
struct proc *p = myproc();
// save user program counter.
p->trapframe->epc = r_sepc();
if(r_scause() == 8){
// system call
if(p->killed)
exit(-1);
// sepc points to the ecall instruction,
// but we want to return to the next instruction.
p->trapframe->epc += 4;
// an interrupt will change sstatus &c registers,
// so don't enable until done with those registers.
intr_on();
syscall();
} else if((which_dev = devintr()) != 0){
// ok
} else if (r_scause() == 15 || r_scause() == 13) { // Store/AMO page fault, load page fault
uint64 va = r_stval();
struct proc* pp = myproc();
struct vm_area_struct *pvma = 0;
for (int i = 0; i < VMASIZE; ++i) {
if (pp->VMA[i].valid == 1) {
if (va >= pp->VMA[i].addr && va < pp->VMA[i].addr + pp->VMA[i].length) {
pvma = &pp->VMA[i];
break;
}
}
}
if (!pvma) {
printf("The fault vm address not in the VMA.\n");
goto err;
}
// if it is, allocate physical page(uvmalloc)
char * pa = kalloc();
if (pa == 0) {
panic("kalloc in trap.c\n");
}
memset(pa, 0, PGSIZE);
uint64 fault_page_start = PGROUNDDOWN(va);
if (mappages(pp->pagetable, fault_page_start, PGSIZE, (uint64)pa, (pvma->prot << 1) | PTE_U ) != 0) {
kfree(pa);
goto err;
}
// write the file content to the mapped page
int offset = fault_page_start - pvma->addr; // the offset in the file
mmap_read(pvma->filep, fault_page_start, offset, PGSIZE);
} else {
err:
printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
printf(" sepc=%p stval=%p\n", r_sepc(), r_stval());
p->killed = 1;
}
if(p->killed)
exit(-1);
// give up the CPU if this is a timer interrupt.
if(which_dev == 2)
yield();
usertrapret();
}
实现munmap。在处理munmap的region的时候,题目给了简化的假设: An munmap call might cover only a portion of an mmap-ed region, but you can assume that it will either unmap at the start, or at the end, or the whole region (but not punch a hole in the middle of a region).
uint64
sys_munmap(void)
{
uint64 addr;
int length;
if (argaddr(0, &addr) < 0 || argint(1, &length) < 0) {
return -1;
}
if (addr % PGSIZE || length < 0) {
return -1;
}
// find the vma include the addr
uint64 start_addr = PGROUNDDOWN(addr);
uint64 end_addr = PGROUNDDOWN(addr + length);
// printf("The start addr(%p), end add (%p)\n", start_addr, end_addr);
struct vm_area_struct *pvma = 0;
struct proc *pp = myproc();
for (int i = 0; i < VMASIZE; ++i) {
if (pp->VMA[i].valid == 1 &&
pp->VMA[i].addr <= start_addr &&
end_addr <= pp->VMA[i].addr + pp->VMA[i].length) {
pvma = &pp->VMA[i];
break;
}
}
if (!pvma) {
printf("Cannot find VMA, start from (%p) to (%p).\n", start_addr, end_addr);
return -1;
}
// write back if flags is MAP_SHARED
if ((pvma->flags & MAP_SHARED) && pvma->filep->writable) {
struct file *f =pvma->filep;
begin_op();
ilock(f->ip);
// todo: only write the dirty page
writei(f->ip, 1, pvma->addr, 0, pvma->length);
iunlock(f->ip);
end_op();
}
// unmap the page from rounddown(addr) ~ roundup(addr + length)
pte_t *pte;
for (uint64 va = start_addr; va < end_addr; va += PGSIZE) { // 注意类型!!
// printf("Unmap the address :%p\n", va);
if ((pte = walk(pp->pagetable, va, 0)) != 0) {
if (*pte & PTE_V) {
uvmunmap(pp->pagetable, va, 1, 1);
}
}
}
// if the whole vma is unmapped, decrease refcount
if (start_addr == pvma->addr && end_addr < pvma->addr + pvma->length) {
pvma->addr = end_addr;
pvma->length -= length;
} else if (start_addr > pvma->addr && end_addr == pvma->addr + pvma->length) {
pvma->length -= length;
} else if (start_addr == pvma->addr && end_addr == pvma->addr + pvma->length) {
// unmap the whole VMA
pvma->valid = 0;
pvma->filep->ref--;
pvma->length = 0;
} else {
printf("You punch a whole in the vma. not supported.\n");
return -1;
}
return 0;
}
在exit中unmap所有VMA
void
exit(int status)
{
struct proc *p = myproc();
if(p == initproc)
panic("init exiting");
// Close all open files.
for(int fd = 0; fd < NOFILE; fd++){
if(p->ofile[fd]){
struct file *f = p->ofile[fd];
fileclose(f);
p->ofile[fd] = 0;
}
}
// unmap all mapped region
struct vm_area_struct *vm = 0;
for (int i = 0; i < VMASIZE; ++i) {
if (p->VMA[i].valid) {
vm = &p->VMA[i];
// write back if flags is MAP_SHARED
if ((vm->flags & MAP_SHARED) && vm->filep->writable) {
struct file *f =vm->filep;
begin_op();
ilock(f->ip);
// todo: only write the dirty page
writei(f->ip, 1, vm->addr, 0, vm->length);
iunlock(f->ip);
end_op();
}
vm->valid = 0;
pte_t *pte;
for (uint64 j = vm->addr; j < vm->addr + vm->length; j += PGSIZE) {
if((pte = walk(p->pagetable, j, 0)) != 0) {
if(*pte & PTE_V) {
uvmunmap(p->pagetable, j, 1, 1);
}
}
}
}
}
....
}
实现fork中复制VMA给子进程
// Create a new process copying the parent (excerpt: the allocation and
// memory/file copying steps are elided at the "// ....." marker).
//
// Shown here: the child inherits the parent's mmap bookkeeping. Every
// VMA slot is copied, each live mapping takes its own file reference,
// and cur_max is copied so that later mmap calls in the child are placed
// below the inherited regions instead of on top of them.
// Returns the child's pid to the parent.
int
fork(void)
{
  int i, pid;
  struct proc *np;
  struct proc *p = myproc();
// .....
  // copy all VMA from parent to children
  for (int slot = 0; slot < VMASIZE; ++slot) {
    // Copy every slot, including free ones, so the child never keeps a
    // stale entry that the parent does not have.
    np->VMA[slot] = p->VMA[slot];
    if (np->VMA[slot].valid) {
      filedup(np->VMA[slot].filep);
    }
  }
  // The inherited regions occupy addresses down to the parent's cur_max.
  np->cur_max = p->cur_max;

  safestrcpy(np->name, p->name, sizeof(p->name));

  pid = np->pid;

  release(&np->lock);

  acquire(&wait_lock);
  np->parent = p;
  release(&wait_lock);

  acquire(&np->lock);
  np->state = RUNNABLE;
  release(&np->lock);

  return pid;
}
要注意的点:address的类型是uint64,在写循环变量的时候不要写错了;对于MAP_SHARED类型的VMA,在写回时注意检查文件是否可写。