mmap

实现mmap系统调用,将文件内容映射到进程虚拟地址空间,便于共享内存。

预备知识

void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);

mmap系统调用:将fd指向的文件数据(从offset开始)映射到虚拟内存中以addr为起始、最长为length的区域上。本实验中addr = 0,即需要由kernel来选择用于映射文件的虚拟内存位置,mmap返回这个地址。prot指出这段内存是否可读、可写、可执行;flags指出这段内存的修改是否需要写回文件(MAP_SHARED需要写回,MAP_PRIVATE不需要写回);offset在本实验中为0。

lazy load page: 在缺页trap中分配物理页,向页表中加入虚拟地址到物理页的映射

file inode记录的是该文件在磁盘上的位置(块)。通过readi将inode对应的块读到内存:readi调用either_copyout复制块的内容,either_copyout将内核读取的块内容复制到用户空间的目标虚拟地址。注意either_copyout本身并不建立映射,目标虚拟地址必须已经在用户进程页表中映射到某个物理页(本实验中由缺页处理里的mappages完成),所以要先建立映射、再读文件。

实现流程

在makefile中加入mmaptest,并增加系统调用mmap和munmap。

在proc.h中定义VMA结构体,让每个进程保存被映射的地址记录

// One mmap-ed region (virtual memory area) of a process.
// A slot is in use when valid is non-zero.
struct vm_area_struct {
    int valid;    // 1 while this slot describes a live mapping, 0 when free
    uint64 addr;  // starting mapped virtual address
    uint64 length;  // mapped length in bytes
    int prot;    // PROT_* permission bits requested by mmap
    int flags;   // MAP_SHARED (write back to the file) or MAP_PRIVATE
    struct file *filep;      // the mapped file; the mapping holds its own reference
    int fd;       // descriptor number passed to mmap (recorded only)
};

// Number of VMA slots (simultaneous mmap-ed regions) kept per process.
#define VMASIZE 16

// Per-process state
struct proc {
  struct spinlock lock;
  // ...... (other fields elided in this excerpt)

  // mapped files: fixed table of mmap-ed regions owned by this process
  struct vm_area_struct VMA[VMASIZE];
  // Lowest address handed out to a mapping so far; sys_mmap places the next
  // region directly below it and then lowers it to the new region's start.
  uint64 cur_max;
};

难点:如何选择在地址空间中的哪里存储被映射的内存

用户进程地址空间

MAXVA ->     -------------
             | trampoline |
             -------------
             | trapframe  |
             --------------
             |            |
max addr ->  --------------
of mmap      |            |
             |            |
             |     heap   |
             |            |
             --------------
             | user stack |
             --------------
             | user text & data
     0 ->    --------------

查看了网上的实现方法,非常简单和巧妙的利用了虚拟地址空间,只是多定义了一个cur_max,记录已经被分配出去的顶端地址,之后不断下移这个cur_max,继续分配就行了。在memlayout.h中定义最高的可映射地址位置,所以分配的虚拟地址从这里开始,之后向下生长

// Upper bound of the mmap area: 10 pages below TRAPFRAME.
// NOTE(review): intended as the initial value of proc->cur_max so that
// mappings grow downward from here; the initialization is not shown in this
// excerpt -- confirm it is done when the process is created.
#define MMAPMAXADDR (TRAPFRAME - 10 * PGSIZE)

在sysfile.c中实现mmap

uint64
sys_mmap(void)
{
    uint64 addr;
    int length, prot, flags, fd, offset;
    struct file *filep;
    if (argaddr(0, &addr) < 0 || argint(1, &length) < 0|| argint(2, &prot) < 0 || argint(3, &flags) < 0 || argfd(4, &fd, &filep)<0 ||
            argint(5, &offset) < 0) {
        return ~0;
    }
    if (addr != 0) {
        printf("Only support the kernel assigned address.\n");
        return ~0;
    }
    // find empty VMA and the length bigger than required
    struct proc *pp = myproc();

    if (flags & MAP_SHARED) {
        if (!(filep->writable) && (prot & PROT_WRITE)) {
            printf("The file is read only, but the mmap prot want to write the memory.\n");
            return ~0;
        }
    }

    uint64 curmax = pp->cur_max;
    uint64 start_addr = PGROUNDDOWN(curmax - length);

    struct vm_area_struct *pvma = 0;

    for (int i = 0; i < VMASIZE; ++i) {
        if (pp->VMA[i].valid == 0) {
            pvma = &pp->VMA[i];
            break;
        }
    }

    if (pvma) {
        pvma->valid = 1;
        pvma->addr = start_addr;
        pvma->length = length;
        pvma->prot = prot;
        pvma->flags = flags;
        pvma->fd = fd;
        pvma->filep = filep;
        filedup(pvma->filep); // increase reference count

        pp->cur_max = start_addr;
    } else {
        return ~0;
    }

    return start_addr;
}

在trap.c中实现lazy page allocation

// Read up to size bytes of file f, starting at file offset off, into the
// user virtual address dst_va (readi is called with user_dst = 1).
// The inode is locked for the duration of the read.
// Returns off plus whatever readi returned.
int mmap_read(struct file *f, uint64 dst_va, int off, int size) {
    int nread;

    ilock(f->ip);
    nread = readi(f->ip, 1, dst_va, off, size);
    iunlock(f->ip);

    return off + nread;
}

//
// handle an interrupt, exception, or system call from user space.
//
// Besides system calls (scause 8) and device interrupts, load/store page
// faults (scause 13/15) inside an mmap-ed region are served lazily: a zeroed
// page is allocated, mapped at the faulting page and filled from the file.
// Any other trap, or a fault that cannot be served, kills the process.
//
void
usertrap(void)
{
  int which_dev = 0;

  if((r_sstatus() & SSTATUS_SPP) != 0)
    panic("usertrap: not from user mode");

  // send interrupts and exceptions to kerneltrap(),
  // since we're now in the kernel.
  w_stvec((uint64)kernelvec);

  struct proc *p = myproc();

  // save user program counter.
  p->trapframe->epc = r_sepc();

  if(r_scause() == 8){
    // system call

    if(p->killed)
      exit(-1);

    // sepc points to the ecall instruction,
    // but we want to return to the next instruction.
    p->trapframe->epc += 4;

    // an interrupt will change sstatus &c registers,
    // so don't enable until done with those registers.
    intr_on();

    syscall();
  } else if((which_dev = devintr()) != 0){
      // ok
  } else if (r_scause() == 15 || r_scause() == 13) {  // Store/AMO page fault, load page fault
      uint64 va = r_stval();

      // find the mmap-ed region that contains the faulting address
      struct vm_area_struct *pvma = 0;
      for (int i = 0; i < VMASIZE; ++i) {
          if (p->VMA[i].valid == 1) {
              if (va >= p->VMA[i].addr && va < p->VMA[i].addr + p->VMA[i].length) {
                  pvma = &p->VMA[i];
                  break;
              }
          }
      }

      if (!pvma) {
          printf("The fault vm address not in the VMA.\n");
          goto err;
      }

      // NOTE(review): the fault type is not checked against pvma->prot. A
      // store to a page that was already mapped without write permission
      // reaches mappages() again for the same va; confirm how mappages()
      // reacts to an existing mapping and consider rejecting such faults here.

      // Allocate the backing page. Running out of memory must only kill the
      // faulting process, not bring down the whole kernel.
      char * pa = kalloc();
      if (pa == 0) {
          printf("usertrap(): out of memory for mmap page\n");
          goto err;
      }
      // zero first so bytes past the end of the file read as 0
      memset(pa, 0, PGSIZE);

      uint64 fault_page_start = PGROUNDDOWN(va);
      // prot is shifted left by one to form the PTE permission bits
      // (assumes PROT_READ/WRITE/EXEC sit one bit below PTE_R/W/X -- the
      // original code relies on the same layout).
      if (mappages(p->pagetable, fault_page_start, PGSIZE, (uint64)pa, (pvma->prot << 1) | PTE_U ) != 0) {
          kfree(pa);
          goto err;
      }
      // Fill the page from the file. The file offset is the distance from
      // the start of the region, i.e. the region is assumed to begin at
      // file offset 0.
      int offset = fault_page_start - pvma->addr; // the offset in the file
      mmap_read(pvma->filep, fault_page_start, offset, PGSIZE);
  } else {
err:
    printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
    printf("            sepc=%p stval=%p\n", r_sepc(), r_stval());
    p->killed = 1;
  }

  if(p->killed)
    exit(-1);

  // give up the CPU if this is a timer interrupt.
  if(which_dev == 2)
    yield();

  usertrapret();
}

实现unmap。 在处理unmap的region的时候,题目给了简化的假设: An munmap call might cover only a portion of an mmap-ed region, but you can assume that it will either unmap at the start, or at the end, or the whole region (but not punch a hole in the middle of a region).

// munmap(addr, length): remove the pages covering [addr, addr + length) from
// the mmap-ed region of the calling process that contains them.
//
// addr must be page aligned; the end of the range is rounded up to a page
// boundary. The range may cover the start, the end, or the whole of a region;
// punching a hole in the middle is rejected before anything is modified.
// For MAP_SHARED mappings of a writable file, every page that is actually
// mapped is written back before it is freed. When the whole region goes
// away, the file reference held by the mapping is released.
//
// Returns 0 on success, -1 on error.
uint64
sys_munmap(void)
{
    uint64 addr;
    int length;
    if (argaddr(0, &addr) < 0 || argint(1, &length) < 0) {
        return -1;
    }
    if (addr % PGSIZE || length < 0) {
        return -1;
    }

    // addr is page aligned (checked above). The end is rounded UP so that a
    // partially covered last page is unmapped too.
    uint64 start_addr = PGROUNDDOWN(addr);
    uint64 end_addr = PGROUNDDOWN(addr + length + PGSIZE - 1);
    if (end_addr < start_addr) {
        return -1;  // addr + length wrapped around
    }

    // find the vma that includes the whole range
    struct proc *pp = myproc();
    struct vm_area_struct *pvma = 0;
    uint64 vma_end = 0;     // first byte past the region
    uint64 vma_end_pg = 0;  // vma_end rounded up to a page boundary
    for (int i = 0; i < VMASIZE; ++i) {
        struct vm_area_struct *v = &pp->VMA[i];
        if (v->valid != 1) {
            continue;
        }
        uint64 e = v->addr + v->length;
        uint64 epg = PGROUNDDOWN(e + PGSIZE - 1);
        if (v->addr <= start_addr && end_addr <= epg) {
            pvma = v;
            vma_end = e;
            vma_end_pg = epg;
            break;
        }
    }

    if (!pvma) {
        printf("Cannot find VMA, start from (%p) to (%p).\n", start_addr, end_addr);
        return -1;
    }

    if (start_addr == end_addr) {
        return 0;  // empty range: nothing to unmap
    }

    // Classify the request before touching any page, so that an unsupported
    // request fails without side effects.
    int at_head = (start_addr == pvma->addr);
    int at_tail = (end_addr == vma_end_pg);
    if (!at_head && !at_tail) {
        printf("You punch a whole in the vma. not supported.\n");
        return -1;
    }

    // Write back (MAP_SHARED only) and free the pages of the range.
    // Pages that were never faulted in have no valid PTE and are skipped.
    // todo: only write the dirty page
    int write_back = (pvma->flags & MAP_SHARED) && pvma->filep->writable;
    for (uint64 va = start_addr; va < end_addr; va += PGSIZE) {  // va must be uint64
        pte_t *pte = walk(pp->pagetable, va, 0);
        if (pte == 0 || (*pte & PTE_V) == 0) {
            continue;
        }
        if (write_back) {
            struct file *f = pvma->filep;
            // do not write past the end of the region
            uint64 n = vma_end - va;
            if (n > PGSIZE) {
                n = PGSIZE;
            }
            // One page per transaction keeps each log transaction small.
            // The file offset is the distance from the region start, the
            // same convention the page-fault path uses.
            // NOTE(review): after the head of a region has been unmapped,
            // pvma->addr no longer corresponds to file offset 0; tracking the
            // file offset in the VMA would be needed to handle that case.
            begin_op();
            ilock(f->ip);
            writei(f->ip, 1, va, va - pvma->addr, n);
            iunlock(f->ip);
            end_op();
        }
        uvmunmap(pp->pagetable, va, 1, 1);
    }

    if (at_head && at_tail) {
        // unmap the whole VMA: free the slot and drop the reference taken
        // with filedup() when the mapping was created. fileclose() does the
        // locking and releases the file when the last reference goes away.
        struct file *f = pvma->filep;
        pvma->valid = 0;
        pvma->length = 0;
        pvma->filep = 0;
        fileclose(f);
    } else if (at_head) {
        // the region now starts where the unmapped range ended
        pvma->length = vma_end - end_addr;
        pvma->addr = end_addr;
    } else {
        // the region now ends where the unmapped range started
        pvma->length = start_addr - pvma->addr;
    }

    return 0;
}

在exit中unmap所有VMA

// Exit the current process (excerpt: the remainder of the function is
// elided below). Before the rest of the teardown, all open files are closed
// and every mmap-ed region is torn down: mapped pages of writable MAP_SHARED
// regions are written back, the pages are freed, and the file reference held
// by each mapping is released.
void
exit(int status)
{
  struct proc *p = myproc();

  if(p == initproc)
    panic("init exiting");

  // Close all open files.
  for(int fd = 0; fd < NOFILE; fd++){
    if(p->ofile[fd]){
      struct file *f = p->ofile[fd];
      fileclose(f);
      p->ofile[fd] = 0;
    }
  }

  // unmap all mapped region
    for (int i = 0; i < VMASIZE; ++i) {
        struct vm_area_struct *vm = &p->VMA[i];
        if (!vm->valid) {
            continue;
        }

        // write back if flags is MAP_SHARED and the file is writable
        // todo: only write the dirty page
        int write_back = (vm->flags & MAP_SHARED) && vm->filep->writable;
        uint64 vm_end = vm->addr + vm->length;

        for (uint64 va = vm->addr; va < vm_end; va += PGSIZE) {
            // pages that were never faulted in have no valid PTE: skip them
            pte_t *pte = walk(p->pagetable, va, 0);
            if (pte == 0 || (*pte & PTE_V) == 0) {
                continue;
            }
            if (write_back) {
                struct file *f = vm->filep;
                // do not write past the end of the region
                uint64 n = vm_end - va;
                if (n > PGSIZE) {
                    n = PGSIZE;
                }
                // One page per transaction; the file offset is the distance
                // from the region start (region assumed to begin at offset 0).
                begin_op();
                ilock(f->ip);
                writei(f->ip, 1, va, va - vm->addr, n);
                iunlock(f->ip);
                end_op();
            }
            uvmunmap(p->pagetable, va, 1, 1);
        }

        // free the slot and drop the reference the mapping held on the file
        vm->valid = 0;
        fileclose(vm->filep);
        vm->filep = 0;
    }
....
}

实现fork中复制VMA给子进程

// Create a new process, copying the parent (excerpt: the first part of the
// function, which sets up np, is elided below).
// The child inherits every mmap-ed region of the parent: each VMA slot is
// copied, the mapped file gains one reference per inherited region, and the
// mmap allocation cursor is copied so that later mappings in the child do
// not overlap the inherited ones.
// Returns the child's pid to the parent.
int
fork(void)
{
  int i, pid;
  struct proc *np;
  struct proc *p = myproc();

  // .....
    // copy all VMA from parent to children
    for (int slot = 0; slot < VMASIZE; ++slot) {
        if (p->VMA[slot].valid) {
            np->VMA[slot] = p->VMA[slot];
            filedup(p->VMA[slot].filep);
        } else {
            // make sure the child does not see a stale entry in this slot
            np->VMA[slot].valid = 0;
        }
    }
    // The child's regions sit at the parent's addresses, so its next mmap
    // must continue below the same point as the parent's.
    np->cur_max = p->cur_max;

  safestrcpy(np->name, p->name, sizeof(p->name));

  pid = np->pid;

  release(&np->lock);

  acquire(&wait_lock);
  np->parent = p;
  release(&wait_lock);

  acquire(&np->lock);
  np->state = RUNNABLE;
  release(&np->lock);

  return pid;
}

要注意的点:address的类型是uint64,在写循环变量的时候不要写错了;对于MAP_SHARED类型的VMA,在写回时注意检查文件是否可写。