do_fault为文件page fault处理函数:当vma被映射到某个具体文件时,发生的page fault将会被认为是文件page fault:
/*
 * Excerpt of handle_pte_fault(): only the branch taken when vmf->pte is NULL
 * is shown. The "... ..." lines stand for code omitted from this excerpt.
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
... ...
/* No PTE for this fault yet: choose the handler by mapping type. */
if (!vmf->pte) {
/* vma_is_anonymous() is true when the VMA has no vm_ops table. */
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
else
return do_fault(vmf);
}
... ...
}
当pte表为空,即之前没有做过映射,且是文件映射,会调用do_fault处理。
文件映射判断标准
vma_is_anonymous函数为判断匿名映射还是文件映射标准,当为true时为匿名映射,否则为文件映射:
/*
 * vma_is_anonymous - tell whether @vma is treated as an anonymous mapping.
 *
 * A VMA without a vm_ops table is reported as anonymous; any VMA that has
 * one is not.
 */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
	if (vma->vm_ops)
		return false;

	return true;
}
vma->vm_ops 为NULL 即为匿名映射,否则为文件映射。
do_fault
do_fault函数处理过程中关键函数:
- do_fault主要按照三种情况处理,分别为读page fault、私有写page fault以及共享写page fault三种情况分别对应do_read_fault、do_cow_fault、do_shared_fault进行处理。
- 最后核心处理函数__do_fault和finish_fault处理。
do_fault源码如下:
/*
 * do_fault - dispatch a fault on a VMA that has a vm_ops table.
 *
 * With a ->fault handler the work is delegated by fault type:
 *   - not a write fault            -> do_read_fault()
 *   - write fault, VM_SHARED set   -> do_shared_fault()
 *   - write fault, VM_SHARED clear -> do_cow_fault()
 *
 * Without a ->fault handler nothing can be loaded, so the existing page
 * table state decides the result (VM_FAULT_SIGBUS or VM_FAULT_NOPAGE).
 *
 * Any preallocated PTE page still attached to @vmf afterwards is freed.
 *
 * Return: the vm_fault_t code produced by the selected path.
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	/* Read before dispatching; the final pte_free() uses this copy. */
	struct mm_struct *vm_mm = vma->vm_mm;
	vm_fault_t ret;

	if (vma->vm_ops->fault) {
		if (!(vmf->flags & FAULT_FLAG_WRITE))
			ret = do_read_fault(vmf);
		else if (vma->vm_flags & VM_SHARED)
			ret = do_shared_fault(vmf);
		else
			ret = do_cow_fault(vmf);
	} else if (unlikely(!pmd_present(*vmf->pmd))) {
		/*
		 * The VMA was not fully populated on mmap() or missing
		 * VM_DONTEXPAND. A migration pmd entry or a none pmd entry
		 * should never happen here: report SIGBUS.
		 */
		ret = VM_FAULT_SIGBUS;
	} else {
		/*
		 * Re-read the pte under ptl to rule out a temporary clearing:
		 * a R/M/W update of a pte takes ptl and clears the pte (so the
		 * hardware cannot modify it concurrently) before writing the
		 * new value.
		 */
		vmf->pte = pte_offset_map_lock(vm_mm, vmf->pmd, vmf->address,
					       &vmf->ptl);
		ret = unlikely(pte_none(*vmf->pte)) ? VM_FAULT_SIGBUS
						    : VM_FAULT_NOPAGE;
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	}

	/* A preallocated page table that is still attached was not used. */
	if (vmf->prealloc_pte) {
		pte_free(vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}

	return ret;
}
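- 检查vma->vm_ops->fault是否设置,vma->vm_ops->fault为对应文件的page fault处理函数。如果没有定义,则不会加载页面:pmd不存在时直接返回VM_FAULT_SIGBUS;否则调用pte_offset_map_lock在持有ptl锁的情况下再次获取对应pte并重新检查,主要是防止此时有其他cpu或者硬件正在修改pte(临时清空pte),pte仍为空则返回VM_FAULT_SIGBUS,否则返回VM_FAULT_NOPAGE。
- !(vmf->flags & FAULT_FLAG_WRITE):如果为读page fault,则调用do_read_fault。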
- 私有写page fault,调用do_cow_fault处理。
- 共享写page fault,调用do_shared_fault。
- 如果预申请的pte vmf->prealloc_pte没有使用,将其释放。
do_read_fault
do_read_fault为发生读文件映射page fault:
/*
 * do_read_fault - handle a non-write fault on a mapping with a ->fault handler.
 *
 * Fault-around is tried first when the VMA provides ->map_pages() and
 * fault_around_bytes covers more than one page; a non-zero result from it
 * is returned as is. Otherwise the page is obtained through __do_fault()
 * and installed with finish_fault().
 *
 * Return: accumulated vm_fault_t bits of the steps that ran.
 */
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
	const vm_fault_t bail_mask =
		VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY;
	vm_fault_t ret;

	/*
	 * ->map_pages() goes first; ->fault() is the fallback when the page
	 * at this offset is not ready to be mapped (cold cache or something).
	 */
	if (vmf->vma->vm_ops->map_pages &&
	    (fault_around_bytes >> PAGE_SHIFT) > 1) {
		ret = do_fault_around(vmf);
		if (ret)
			return ret;
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & bail_mask))
		return ret;

	ret |= finish_fault(vmf);
	/* __do_fault() handed the page back locked; release the lock now. */
	unlock_page(vmf->page);
	/* Mapping did not complete: drop the page reference as well. */
	if (unlikely(ret & bail_mask))
		put_page(vmf->page);

	return ret;
}
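- vma->vm_ops->map_pages 如果不为空(并且fault_around_bytes >> PAGE_SHIFT大于1),则表明可以进行预加载,可以最多提前加载文件中16个page 到内存中,以减少page fault发生次数,do_fault_around为预加载处理函数;如果do_fault_around返回值非0,do_read_fault直接将其返回。
- 如果vma->vm_ops->map_pages为空则说明不支持预加载(或者do_fault_around返回0,未能解决本次page fault),此时只申请一个页面,并加载对应地址在文件中的一个页面内容,由__do_fault完成。
- finish_fault:将申请到的物理页刷新到pte中。
- unlock_page:释放页面锁。__do_fault成功返回时vmf->page处于加锁状态,因此在finish_fault之后需要解锁,防止该页面一直被锁住。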
do_cow_fault
do_cow_fault为私有文件映射写page fault,如果该地址是继承过来的则会已经存在旧的映射关系,需要重新申请物理页,刷新虚拟地址对应的物理页:
/*
 * do_cow_fault - handle a write fault on a mapping without VM_SHARED.
 *
 * A new page (vmf->cow_page) is allocated and charged before the backing
 * page is fetched with __do_fault(). The backing page's contents are then
 * copied into the new page, which is installed by finish_fault(). The
 * backing page is unlocked and its reference dropped afterwards.
 *
 * Return: VM_FAULT_OOM when preparation or allocation fails, otherwise the
 * accumulated vm_fault_t bits from __do_fault() and finish_fault().
 */
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
	const vm_fault_t bail_mask =
		VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY;
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	/* Destination page of the copy; allocated before touching the file. */
	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
		put_page(vmf->cow_page);
		return VM_FAULT_OOM;
	}
	cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);

	ret = __do_fault(vmf);
	if (unlikely(ret & bail_mask))
		goto drop_cow_page;
	/* VM_FAULT_DONE_COW: returned as is, no copy or finish_fault() here. */
	if (ret & VM_FAULT_DONE_COW)
		return ret;

	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
	__SetPageUptodate(vmf->cow_page);

	ret |= finish_fault(vmf);
	/* The backing page is no longer needed: unlock and release it. */
	unlock_page(vmf->page);
	put_page(vmf->page);
	if (unlikely(ret & bail_mask))
		goto drop_cow_page;

	return ret;

drop_cow_page:
	/* The new page was not installed: give it back. */
	put_page(vmf->cow_page);
	return ret;
}
- anon_vma_prepare:确保vma的反向映射(anon_vma)已经初始化,失败则返回VM_FAULT_OOM。
- vmf->cow_page 为新的页面,提前调用alloc_page_vma申请物理内存,主要用于COW场景(copy on write),即该映射关系是从父进程继承过来,为了加速fork处理,只是单单将映射关系copy过来,并标记为只读,当写该地址时会发生写page fault,重新申请物理内存,并将旧内存copy到新申请物理内存中,并将修改的内容保存到新申请物理内存中。
- mem_cgroup_charge:将申请的页面加入到cgroup中。
- __do_fault:将文件内容加载到页面vmf->page中。
- copy_user_highpage:将vmf->page的内容拷贝到vmf->cow_page中。
- __SetPageUptodate:更新物理页状态。
- finish_fault:刷新页表pte。
- unlock_page:对vmf->page解锁。
- put_page(vmf->page):vmf->page引用计数减一,这是因为已经为address新申请了物理页(vmf->cow_page),不再使用vmf->page,故计数要减一。
do_shared_fault
do_shared_fault用于处理共享写page fault场景:
/*
 * do_shared_fault - handle a write fault on a VM_SHARED mapping.
 *
 * Fetches the page with __do_fault(), lets ->page_mkwrite() (when the VMA
 * provides one) react to the page becoming writable, installs the page with
 * finish_fault() and finally calls fault_dirty_shared_page().
 *
 * Return: accumulated vm_fault_t bits, or the do_page_mkwrite() result when
 * that step ends the fault.
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
/* On success __do_fault() returns with vmf->page locked. */
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
/* The page lock is dropped before calling do_page_mkwrite(). */
unlock_page(vmf->page);
tmp = do_page_mkwrite(vmf);
/*
* A zero result or an ERROR/NOPAGE result ends the fault here: the page
* reference is dropped and tmp is returned without calling finish_fault().
* NOTE(review): presumably do_page_mkwrite() leaves the page locked on the
* path that continues below - confirm against its definition.
*/
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
}
ret |= finish_fault(vmf);
/* Installing the PTE did not complete: unlock and release the page. */
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
unlock_page(vmf->page);
put_page(vmf->page);
return ret;
}
/*
* NOTE(review): no unlock_page() on this path; presumably
* fault_dirty_shared_page() unlocks vmf->page - confirm.
*/
ret |= fault_dirty_shared_page(vmf);
return ret;
}
- __do_fault:加载文件内容到页面。
- 是否定义vma->vm_ops->page_mkwrite,如果定义则调用page_mkwrite,用来通知页面可写。
- finish_fault:刷新页表
- 由于该页为共享页,可能其他进程或线程也在使用、读取该文件,因此当该页被写入时会被标记为脏页,表明与硬盘中的文件内容不一样,需要将内容写入到硬盘中,调用fault_dirty_shared_page进行处理,最后调用balance_dirty_pages_ratelimited 根据系统情况判断决定是否要将脏页写入到硬盘中。
__do_fault
__do_fault为调用自定义vm_ops->fault加载一个页面文件内容到对应物理内存中:
/*
 * __do_fault - obtain the page for a fault through vma->vm_ops->fault().
 *
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 *
 * On the success path vmf->page is locked when this function returns:
 * either the handler reported VM_FAULT_LOCKED or lock_page() is called here.
 *
 * Return: VM_FAULT_OOM if the PTE page cannot be preallocated, the handler's
 * result when it contains ERROR/NOPAGE/RETRY/DONE_COW bits,
 * VM_FAULT_HWPOISON for a hardware-poisoned page, otherwise the handler's
 * result.
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
/*
 * Preallocate pte before we take page_lock because this might lead to
 * deadlocks for memcg reclaim which waits for pages under writeback:
 *                              lock_page(A)
 *                              SetPageWriteback(A)
 *                              unlock_page(A)
 * lock_page(B)
 *                              lock_page(B)
 * pte_alloc_one
 *   shrink_page_list
 *     wait_on_page_writeback(A)
 *                              SetPageWriteback(B)
 *                              unlock_page(B)
 *                              # flush A, B to clear the writeback
 *
 * NOTE(review): the two-column layout above (left: faulting task, right:
 * writeback path) was reconstructed from a flattened copy - confirm
 * against the upstream comment.
 */
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
/* Hand over to the mapping's own fault handler. */
ret = vma->vm_ops->fault(vmf);
/* Any of these bits means there is nothing more to do at this level. */
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
return ret;
/* Refuse a hardware-poisoned page: undo lock and reference, clear vmf->page. */
if (unlikely(PageHWPoison(vmf->page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf->page);
put_page(vmf->page);
vmf->page = NULL;
return VM_FAULT_HWPOISON;
}
/* Make sure the page is locked on return, whoever took the lock. */
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf->page);
else
VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
return ret;
}
- 如果pmd为空(即对应的pte页表尚未分配)且还没有预申请过,则提前申请好pte页表并存放到vmf->prealloc_pte,这样可以避免在持有页面锁之后再申请pte页表内存而造成死锁问题。
- vma->vm_ops->fault:加载大小为一个page的对应文件内容到内存中。
- PageHWPoison:如果此时物理页面发生硬件错误,则返回失败并将该页面释放。
finish_fault
finish_fault刷新pte页表即该虚拟地址对应物理地址:
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
* @vmf: structure describing the fault
*
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
* addition.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
struct page *page;
vm_fault_t ret = 0;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
!(vmf->vma->vm_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
/*
* check even for read faults because we might have lost our CoWed
* page
*/
if (!(vmf->vma->vm_flags & VM_SHARED))
ret = check_stable_address_space(vmf->vma->vm_mm);
if (!ret)
ret = alloc_set_pte(vmf, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
- 如果是私有写page fault则物理内存为vmf->cow_page,否则为vmf->page。
- alloc_set_pte:刷新页表pte。
- pte_unmap_unlock:pte解锁。
do_fault_around
发生因读取内存造成page fault时,如果对应文件映射定义了vm_ops->map_pages,则可以提前将发生page fault地址附近的一些文件内容也加载到内存中,可以减少后面发生page fault次数:
fault_around_bytes
fault_around_bytes决定当调用 do_fault_around时,最多可以预加载的内存大小,默认为65536=64K:
/*
 * Size in bytes of the fault-around window. do_fault_around() turns it into
 * a page count with ">> PAGE_SHIFT". The initial value is 65536 passed
 * through rounddown_pow_of_two().
 */
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
do_fault_around源码分析
/*
 * do_fault_around - ask ->map_pages() to map a window of pages around the
 * faulting address.
 *
 * vmf->address is temporarily moved to the start of the window and restored
 * before returning; vmf->pte is always reset to NULL on return.
 *
 * Return: VM_FAULT_NOPAGE when the faulting address ended up mapped (or a
 * huge pmd is present), 0 otherwise.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
unsigned long address = vmf->address, nr_pages, mask;
pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off;
vm_fault_t ret = 0;
/* Window size in pages, and the matching alignment mask. */
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
/* Align the start down to the window size, but not below the VMA start. */
vmf->address = max(address & mask, vmf->vma->vm_start);
/* Pages between the window start and the faulting address. */
off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
/* Shift the starting file offset back by the same number of pages. */
start_pgoff -= off;
/*
* end_pgoff is either the end of the page table, the end of
* the vma or nr_pages from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
/* No page table under this pmd yet: preallocate one for ->map_pages(). */
if (pmd_none(*vmf->pmd)) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
/* Huge page is mapped? Page fault is solved */
if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() haven't done anything useful. Cold page cache? */
if (!vmf->pte)
goto out;
/*
* check if the page fault is solved: step vmf->pte back by the page
* distance between the current vmf->address and the faulting address, then
* test the PTE found there.
* NOTE(review): this relies on ->map_pages() leaving vmf->address and
* vmf->pte pointing at the same slot, with ptl held - confirm.
*/
vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Restore the faulting address and drop the PTE pointer. */
vmf->address = address;
vmf->pte = NULL;
return ret;
}
- 首先计算需要预加载内存大小,nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT,将fault_around_bytes转换成需要多少个page,默认为16个page(page大小为4K时)。
- mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK: 将fault_around_bytes按照页对齐转换成mask,方便后续地址转换。
- vmf->address = max(address & mask, vmf->vma->vm_start): 计算预加载的起始虚拟地址vmf->address,将地址按照mask向下对齐,并且不能低于vma的起始地址vmf->vma->vm_start。
- off:计算对齐之后地址与发生page fault地址偏移
- start_pgoff -= off:减去对齐多余偏移。
- end_pgoff:按照对齐页对齐计算结束偏移。
- end_pgoff:根据文件实际大小和配置fault_around_bytes可以预加载页面nr_pages,决定实际结束偏移位置end_pgoff
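- 如果pmd为空(即pte页表尚未分配),则需要提前为pte表申请物理内存,存放到vmf->prealloc_pte
- 准备工作完成之后调用vm_ops->map_pages进行页面加载,起始位置为start_pgoff,结束位置为end_pgoff
- 如果此时pmd已经被映射为huge page,说明page fault已经解决,直接返回VM_FAULT_NOPAGE。
- vmf->pte: 将vmf->pte调整回发生page fault的地址所对应的pte,如果该pte已不为空,说明page fault已经解决,返回VM_FAULT_NOPAGE。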
- pte_unmap_unlock:对页面pte锁释放。