Copy On Write

Copy On Write Fork

在fork的普通实现中,子进程会将父进程的页表复制一份(uvmcopy->memmove)。COW fork的实现加快了fork的调用,即在创建子进程的时候,将子进程的页表项都指向父进程物理页的位置,实现共享物理页,同时相应的子进程和父进程页表项的disable PTE_W、set PTE_C(PTE_C是一个指示位,表示该页表项指向的是一个共享的物理页,在riscv.h中增加定义),直到发生对相应虚拟页的写的操作的时候,发生页错误,在trap handler中分配物理页。

由于增加了对同一个物理页的多个引用,所以在释放物理页的时候要考虑还有没有其他虚拟页引用到这个物理页。需要用一个引用计数数组来记录所有物理页的被引用次数,当最后一个虚拟页被释放的时候,才释放相关物理页。

void
kfree(void *pa)
{
  struct run *r;

    acquire(&kmem.lock);
  if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP || kmem.refcount[REFIDX(pa)] <= 0)
    panic("kfree");

  if(--kmem.refcount[REFIDX(pa)]) {
      release(&kmem.lock);
      return;
  }

  // Fill with junk to catch dangling refs.
  memset(pa, 1, PGSIZE);

  r = (struct run*)pa;

//  acquire(&kmem.lock);
  r->next = kmem.freelist;
  kmem.freelist = r;
  release(&kmem.lock);
}

再kalloc.c中定义物理页的引用记录数组,以及相关操作

#define REFIDX(pa) ((pa - KERNBASE) / PGSIZE)

struct {
  struct spinlock lock;
  struct run *freelist;
  char refcount[MAXPAGES];
} kmem;


char read_ref(uint64 pa) {
    return kmem.refcount[pa];
}
void acquire_reflock(void) {
    acquire(&kmem.lock);
}
void release_reflock(void) {
    release(&kmem.lock);
}
char modify_ref(uint64 pa, int cnt) {
    return kmem.refcount[pa] += cnt;
}

void *kalloc_nolock(void) {
    struct run *r;

    r = kmem.freelist;
    if (r)
        kmem.freelist = r->next;
    if (r) {
        memset((char *) r, 2, PGSIZE);
        if (kmem.refcount[REFIDX(r)])
            panic("kalloc: new page already has reference count.");
        kmem.refcount[REFIDX(r)] = 1;
    }

    return (void *)r;
}

void *
kalloc(void)
{
  struct run *r;

  acquire(&kmem.lock);
  r = kmem.freelist;
  if(r)
    kmem.freelist = r->next;

  if(r) {
      memset((char *) r, 5, PGSIZE); // fill with junk
      if(kmem.refcount[REFIDX(r)])
          panic("kalloc: new page already has refcount!");
      kmem.refcount[REFIDX(r)] = 1;
  }
    release(&kmem.lock);
  return (void*)r;
}

同时在def.h中声明这些函数,方便在操作虚拟内存(vm.c)、陷阱(trap.c)时使用。

// reference count management
#define REFIDX(pa) (((uint64)pa - KERNBASE) / PGSIZE)

char read_ref(uint64);
void acquire_reflock(void);
void release_reflock(void);
char modify_ref(uint64, int);
void *kalloc_nolock(void);  // 无锁得分配一个物理页,用于COW

fork使用的是uvmcopy来复制页表,修改uvmcopy,仅增加相应物理页的引用计数。

int
uvmcopy(pagetable_t old, pagetable_t new, uint64 sz) {
    pte_t *pte;
    uint64 pa, i;
    uint flags;
//    char *mem;

    for (i = 0; i < sz; i += PGSIZE) {
        if ((pte = walk(old, i, 0)) == 0)
            panic("uvmcopy: pte should exist");
        if ((*pte & PTE_V) == 0)
            panic("uvmcopy: page not present");
        pa = PTE2PA(*pte);
        flags = PTE_FLAGS(*pte);

//    if((mem = kalloc()) == 0)
//      goto err;
//    memmove(mem, (char*)pa, PGSIZE);
        *pte = (*pte & (~PTE_W)) | PTE_C;  // clear the PTE_W flag of parent page and set the COW flag
        flags &= (~PTE_W);  // clear the PTE_W flag of child page
        if (mappages(new, i, PGSIZE, (uint64) pa, flags | PTE_C) != 0) {  // map the old PTE pa to the new PTE
//            kfree(mem);
            goto err;
        }
        acquire_reflock();
        modify_ref(REFIDX(pa), 1);
        release_reflock();
    }
    return 0;

    err:
    uvmunmap(new, 0, i / PGSIZE, 1);
    return -1;
}

在中断处理程序usertrap中处理Store Page Fault,修改页表项,根据引用计数来决定是否分配物理页。

    else if((which_dev = devintr()) != 0){
    // ok
  } else if (r_scause() == 15) {
      // Store/AMO page fault
      const char *reason = 0;
      int faultaddr = r_stval();
      pte_t *pte = walk(p->pagetable, faultaddr, 0);
      if (pte == 0) {
          reason = "Page does not exist";
      } else {
          if ((*pte & PTE_V) == 0) {
              reason = "Not a valid pte";
          } else if ((*pte & PTE_C) == 0) {
              reason = "Not a COW pte";
          } else if ((*pte & PTE_U) == 0) {
              reason = "Not a user page";
          } else {
              uint64 pa = PTE2PA(*pte);
              acquire_reflock();
              if (read_ref(REFIDX(pa)) == 1) {
                  *pte = ((*pte | PTE_W) & ~PTE_C);  // set the writable and clear the COW flag
              } else {
                  char *mem = kalloc_nolock();
                  if (mem == 0) {
                      reason = "No enough memory";
                  } else {
                      memmove(mem, (void *)pa, PGSIZE);
                      int flg = ((PTE_FLAGS(*pte) | PTE_W) & ~PTE_C); // clear
                      *pte = PA2PTE((uint64)mem) | flg;
                      modify_ref(REFIDX(pa), -1);
//                      if (mappages(p->pagetable, va, PGSIZE, (uint64)mem, flg) != 0) {
//                          reason = "Cannot map the new alloc page";
//                          kfree(mem);
//                      } else {
//                          modify_ref(REFIDX(pa), -1);
//                      }
                  }
              }
              release_reflock();
          }
      }
      if (reason != 0) {
          printf("usertrap(): unhandled write page fault (%s). scause %p pid=%d\n", reason, r_scause(), p->pid);
          printf("            sepc=%p stval=%p\n", r_sepc(), r_stval());
          p->killed = 1;
      }

修改copyout,从内核空间写入到用户空间,修改页表项的set PTE_W和unset PTE_C位,根据引用计数决定是否分配物理页。

int
copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len) {
    uint64 n, va0, pa0;

    while (len > 0) {
        va0 = PGROUNDDOWN(dstva);
        if (va0 > MAXVA)
            return -1;
        pte_t *pte = walk(pagetable, va0, 0);
        if (pte == 0 || (*pte & PTE_U) == 0 || (*pte & PTE_V) == 0 || (uint64)PTE2PA(*pte) == 0) {
            return -1;
        }
        pa0 = (uint64)PTE2PA(*pte);
        if ((*pte & PTE_W) == 0) {
            if (*pte & PTE_C) {
                uint64 pa = (uint64)PTE2PA(*pte);
                acquire_reflock();
                if (read_ref(REFIDX(pa)) == 1) {  // 当前只有一个虚拟地址指向这个物理页
                    *pte = (*pte | PTE_W) & (~PTE_C);  // set writable and clear the COW flag
                    pa0 = pa;
                } else {
                    // 此物理页的引用数大于1
                    char *mem = kalloc_nolock();
                    if (mem == 0) {
                        release_reflock();
                        return -1;
                    } else {
                        memmove(mem, (void *)pa0, PGSIZE);
                        int flag = (PTE_FLAGS(*pte) & ~PTE_C) | PTE_W;  // clear the COW flag
                        *pte = PA2PTE((uint64)mem) | flag;
                        modify_ref(REFIDX(pa), -1);
                        pa0 = (uint64)mem;
                    }
                }
                if (pa0 == 0) {
                    panic("COW fails");
                }
                release_reflock();
            } else {
                return -1;
            }
        }
//        pa0 = walkaddr(pagetable, va0);
//        if (pa0 == 0)
//            return -1;
        n = PGSIZE - (dstva - va0);
        if (n > len)
            n = len;
        memmove((void *)(pa0 + (dstva - va0)), src, n);

        len -= n;
        src += n;
        dstva = va0 + PGSIZE;
    }
    return 0;
}

END