0
点赞
收藏
分享

微信扫一扫

linux那些事之page fault(do_fault)(5)

闲云困兽 2022-03-11 阅读 156

do_fault为文件page fault处理函数。当vma被映射到某个具体文件时,发生的page fault将会被认为是文件page fault:

static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    ... ...
    
  	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);
		else
			return do_fault(vmf);
	}

    ... ...
}

当pte表为空,即之前没有做过映射,且是文件映射,会调用do_fault处理。

文件映射判断标准

vma_is_anonymous函数为判断匿名映射还是文件映射标准,当为true时为匿名映射,否则为文件映射:

/*
 * A VMA without a vm_ops table is treated as anonymous; a VMA that has one
 * is treated as file-backed.
 */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
	if (vma->vm_ops)
		return false;

	return true;
}

vma->vm_ops 为NULL 即为匿名映射,否则为文件映射。

do_fault

do_fault函数处理过程中关键函数:

  • do_fault主要按照三种情况处理:读page fault、私有写page fault以及共享写page fault,分别对应do_read_fault、do_cow_fault、do_shared_fault。
  • 最终的核心处理由__do_fault和finish_fault两个函数完成。

do_fault源码如下:

/*
 * do_fault - dispatch a fault on a VMA that has vm_ops to the matching
 * handler.
 *
 * Without a ->fault callback the fault cannot be serviced here: the result
 * is VM_FAULT_SIGBUS, unless a re-check under the pte lock finds the pte no
 * longer empty, in which case it is VM_FAULT_NOPAGE. Otherwise read faults
 * go to do_read_fault(), write faults on a VMA without VM_SHARED go to
 * do_cow_fault(), and write faults on a VM_SHARED VMA go to
 * do_shared_fault().
 *
 * A page table still held in vmf->prealloc_pte on the way out is freed.
 *
 * Return: the vm_fault_t produced by the path that was taken.
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *vm_mm = vma->vm_mm;
	vm_fault_t ret;

	/*
	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
	 */
	if (!vma->vm_ops->fault) {
		/*
		 * If we find a migration pmd entry or a none pmd entry, which
		 * should never happen, return SIGBUS
		 */
		if (unlikely(!pmd_present(*vmf->pmd)))
			ret = VM_FAULT_SIGBUS;
		else {
			vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
						       vmf->pmd,
						       vmf->address,
						       &vmf->ptl);
			/*
			 * Make sure this is not a temporary clearing of pte
			 * by holding ptl and checking again. A R/M/W update
			 * of pte involves: take ptl, clearing the pte so that
			 * we don't have concurrent modification by hardware
			 * followed by an update.
			 */
			if (unlikely(pte_none(*vmf->pte)))
				ret = VM_FAULT_SIGBUS;
			else
				ret = VM_FAULT_NOPAGE;

			pte_unmap_unlock(vmf->pte, vmf->ptl);
		}
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);	/* read access */
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);	/* write to a private mapping */
	else
		ret = do_shared_fault(vmf);	/* write to a shared mapping */

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}
  • 检查vma->vm_ops->fault是否设置,vma->vm_ops->fault为对应文件的page fault处理回调。如果没有定义,则通过pte_offset_map_lock持锁后再次检查对应pte:pte仍然为空则返回VM_FAULT_SIGBUS;否则说明只是其他cpu或者硬件正在临时修改pte,返回VM_FAULT_NOPAGE。
  • !(vmf->flags & FAULT_FLAG_WRITE):如果为读page fault,则调用do_read_fault。
  • 私有写page fault,调用do_cow_fault处理。
  • 共享写page fault,调用do_shared_fault。
  • 如果预申请的pte vmf->prealloc_pte没有使用,将其释放。

do_read_fault

do_read_fault为发生读文件映射page fault:

/*
 * do_read_fault - handle a read fault on a mapping that has vm_ops.
 *
 * When the VMA provides ->map_pages and fault_around_bytes spans more than
 * one page, do_fault_around() is tried first and any non-zero result from
 * it is returned as is. Otherwise the single page is obtained with
 * __do_fault() and installed with finish_fault().
 *
 * Return: the non-zero result of do_fault_around(), or the bits accumulated
 * from __do_fault() and finish_fault().
 */
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		ret = do_fault_around(vmf);
		if (ret)
			return ret;
	}

	/* On error, NOPAGE or RETRY there is no page for this function to release. */
	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);
	/* vmf->page is unlocked whether or not the mapping was installed. */
	unlock_page(vmf->page);
	/* The page reference is dropped here only when finish_fault() did not succeed. */
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);
	return ret;
}
  • vma->vm_ops->map_pages 如果不为空(且fault_around_bytes大于一个页),则表明可以进行预加载,默认最多可以提前加载文件中16个page 到内存中,以减少page fault发生次数,do_fault_around为预加载处理函数。
  • 如果vma->vm_ops->map_pages为空则说明不支持预加载;此时(或者预加载没有解决本次page fault时)调用__do_fault,只申请一个页面,并加载对应地址在文件中的一个页面内容。
  • finish_fault:将申请到的物理页刷新到pte中。
  • unlock_page:释放页面锁。__do_fault返回时vmf->page处于加锁状态,映射处理完成后必须解锁,否则该页面会一直被锁住。

do_cow_fault

do_cow_fault为私有文件映射写page fault,如果该地址是继承过来的则会已经存在旧的映射关系,需要重新申请物理页,刷新虚拟地址对应的物理页:


/*
 * do_cow_fault - handle a write fault on a mapping without VM_SHARED.
 *
 * A new page (vmf->cow_page) is allocated and charged up front. The file
 * page is then obtained through __do_fault(), its contents are copied into
 * the new page, and finish_fault() is called to install the mapping. The
 * file page (vmf->page) is unlocked and released afterwards.
 *
 * Return: VM_FAULT_OOM if anon_vma_prepare(), the page allocation or the
 * memcg charge fails; otherwise the bits from __do_fault() and
 * finish_fault().
 */
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	/* Allocate the private copy before the file page is looked up. */
	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
		return VM_FAULT_OOM;

	/* A failed charge releases the freshly allocated page again. */
	if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
		put_page(vmf->cow_page);
		return VM_FAULT_OOM;
	}
	cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	/*
	 * NOTE(review): presumably VM_FAULT_DONE_COW means the ->fault handler
	 * already filled and mapped vmf->cow_page itself - confirm.
	 */
	if (ret & VM_FAULT_DONE_COW)
		return ret;

	/* Copy the file page into the private page, then mark the copy up to date. */
	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
	__SetPageUptodate(vmf->cow_page);

	ret |= finish_fault(vmf);
	/* vmf->page is released on success and failure alike. */
	unlock_page(vmf->page);
	put_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	return ret;
uncharge_out:
	/* Failure path: drop the reference on the unused private page. */
	put_page(vmf->cow_page);
	return ret;
}
  • anon_vma_prepare:确保该vma的反向映射(anon_vma)已经初始化,失败则返回VM_FAULT_OOM。
  • vmf->cow_page 为新的页面提前调用alloc_page_vma申请物理内存,主要防止COW场景(copy on write),即该映射关系是从父进程继承过来,为了加速fork处理,只是单单将映射关系copy过来,并标记为只读,当写该地址时会发生写page fault,重新申请物理内存,并将旧内存copy到新申请物理内存中,并将修改的内容保存到新申请物理内存中。
  • mem_cgroup_charge:将申请的页面加入到cgroup中。
  • __do_fault:调用vm_ops->fault,将文件内容加载到vmf->page中。
  • copy_user_highpage:将vmf->page的内容拷贝到vmf->cow_page中。
  • __SetPageUptodate:更新物理页状态。
  • finish_fault:刷新页表pte。
  • unlock_page:对vmf->page解锁。
  • put_page(vmf->page):vmf->page引用计数减一,这是因为已经为address新申请了物理页,不再使用vmf->page,故计数要减一。

do_shared_fault

do_shared_fault用于处理共享写page fault场景:


/*
 * do_shared_fault - handle a write fault on a VM_SHARED mapping.
 *
 * The page is obtained with __do_fault(). If the VMA provides
 * ->page_mkwrite, do_page_mkwrite() is called before the mapping is
 * installed with finish_fault(). On success the page is handed to
 * fault_dirty_shared_page().
 *
 * Return: the failing result of __do_fault() or do_page_mkwrite(), or the
 * bits accumulated from __do_fault(), finish_fault() and
 * fault_dirty_shared_page().
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret, tmp;

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		/* The page lock is dropped before do_page_mkwrite() is called. */
		unlock_page(vmf->page);
		tmp = do_page_mkwrite(vmf);
		/*
		 * A zero result is handled as a failure as well.
		 * NOTE(review): presumably success is reported with the page
		 * locked again (VM_FAULT_LOCKED) - confirm against
		 * do_page_mkwrite().
		 */
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
	}

	ret |= finish_fault(vmf);
	/* Mapping was not installed: unlock the page and drop its reference. */
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
		unlock_page(vmf->page);
		put_page(vmf->page);
		return ret;
	}

	/*
	 * NOTE(review): the page is not unlocked here; presumably
	 * fault_dirty_shared_page() releases the lock - confirm.
	 */
	ret |= fault_dirty_shared_page(vmf);
	return ret;
}
  • __do_fault:加载文件内容到页面。
  • 是否定义vma->vm_ops->page_mkwrite,如果定义则调用page_mkwrite,用来通知页面可写。
  • finish_fault:刷新页表
  • 由于该页为共享页,可能其他进程或线程也在使用、读取该文件,因此当该页被写入时会被标记为脏页,表明与硬盘中的文件内容不一样,需要将内容写入到硬盘中,调用fault_dirty_shared_page进行处理,最后调用balance_dirty_pages_ratelimited 根据系统情况判断决定是否要将脏页写入到硬盘中。

__do_fault

__do_fault为调用自定义vm_ops->fault加载一个页面文件内容到对应物理内存中:


/*
 * __do_fault - call the VMA's ->fault handler and return with vmf->page
 * locked.
 *
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 *
 * Return: VM_FAULT_OOM if the page table preallocation fails; the handler's
 * result unchanged if it contains an error, NOPAGE, RETRY or DONE_COW;
 * VM_FAULT_HWPOISON if the returned page is hardware-poisoned; otherwise
 * the handler's result, with vmf->page locked.
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	/*
	 * Preallocate pte before we take page_lock because this might lead to
	 * deadlocks for memcg reclaim which waits for pages under writeback:
	 *				lock_page(A)
	 *				SetPageWriteback(A)
	 *				unlock_page(A)
	 * lock_page(B)
	 *				lock_page(B)
	 * pte_alloc_one
	 *   shrink_page_list
	 *     wait_on_page_writeback(A)
	 *				SetPageWriteback(B)
	 *				unlock_page(B)
	 *				# flush A, B to clear the writeback
	 */
	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	/* These results are handed back to the caller without touching vmf->page. */
	ret = vma->vm_ops->fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
			    VM_FAULT_DONE_COW)))
		return ret;

	/* A poisoned page is released and cleared from vmf instead of being returned. */
	if (unlikely(PageHWPoison(vmf->page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf->page);
		put_page(vmf->page);
		vmf->page = NULL;
		return VM_FAULT_HWPOISON;
	}

	/* Take the page lock here if the handler did not return the page locked. */
	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf->page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

	return ret;
}
  • 如果pmd为空(即还没有pte页表)且尚未预申请,则提前为其申请好vmf->prealloc_pte ,这样可以防止持有页面锁之后再申请pte页表内存造成死锁问题。
  • vma->vm_ops->fault:加载一个page大小的对应文件内容到内存中。
  • PageHWPoison:如果此时物理页面发生硬件错误,则返回失败并将该页面释放。

finish_fault

finish_fault刷新pte页表即该虚拟地址对应物理地址:



/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
	struct page *page;
	vm_fault_t ret = 0;

	/*
	 * Did we COW the page? A write fault on a VMA without VM_SHARED maps
	 * vmf->cow_page; every other fault maps vmf->page.
	 */
	if ((vmf->flags & FAULT_FLAG_WRITE) &&
	    !(vmf->vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;

	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
	if (!(vmf->vma->vm_flags & VM_SHARED))
		ret = check_stable_address_space(vmf->vma->vm_mm);
	/* The pte is only installed when the check above did not fail. */
	if (!ret)
		ret = alloc_set_pte(vmf, page);
	/*
	 * NOTE(review): vmf->pte is presumably set, with vmf->ptl held, by
	 * alloc_set_pte() - confirm. It is unmapped and unlocked only if set.
	 */
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}
  • 如果是私有写page fault则物理内存为vmf->cow_page,否则为vmf->page。
  • alloc_set_pte:刷新页表pte。
  • pte_unmap_unlock:pte解锁。

do_fault_around

发生因读取内存造成的page fault时,如果对应文件映射定义了vm_ops->map_pages,则可以提前将发生page fault地址附近的一些文件内容也加载到内存中,可以减少后面发生page fault的次数:



fault_around_bytes

fault_around_bytes决定当调用 do_fault_around时,最多可以预加载的内存大小,默认为65536=64K:

/*
 * Size in bytes of the window that do_fault_around() may map around a
 * faulting address. Initialised to 65536 rounded down to a power of two.
 */
static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);

do_fault_around源码分析

/*
 * do_fault_around - ask ->map_pages to map a window of pages around the
 * faulting address.
 *
 * The window starts at the faulting address aligned down to the
 * fault_around_bytes size (but not below vma->vm_start) and ends at the
 * nearest of: the end of the page table, the end of the VMA, or nr_pages
 * from the start. vmf->address is changed while the window is processed and
 * restored before returning; vmf->pte is reset to NULL.
 *
 * Return: VM_FAULT_NOPAGE if the faulting address ended up mapped (or a
 * huge pmd was installed), 0 otherwise.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
	unsigned long address = vmf->address, nr_pages, mask;
	pgoff_t start_pgoff = vmf->pgoff;
	pgoff_t end_pgoff;
	int off;
	vm_fault_t ret = 0;

	/* Window size in pages, and the mask that aligns an address to it. */
	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	/* Align the start down, clamp it to the VMA, and shift the file offset to match. */
	vmf->address = max(address & mask, vmf->vma->vm_start);
	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/*
	 *  end_pgoff is either the end of the page table, the end of
	 *  the vma or nr_pages from start_pgoff, depending what is nearest.
	 */
	end_pgoff = start_pgoff -
		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
			start_pgoff + nr_pages - 1);

	/* No page table under this pmd yet: preallocate one; on failure return 0. */
	if (pmd_none(*vmf->pmd)) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			goto out;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);

	/* Huge page is mapped? Page fault is solved */
	if (pmd_trans_huge(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/* ->map_pages() haven't done anything useful. Cold page cache? */
	if (!vmf->pte)
		goto out;

	/*
	 * check if the page fault is solved
	 *
	 * vmf->pte is moved by the page distance between the current
	 * vmf->address and the original faulting address.
	 * NOTE(review): presumably ->map_pages leaves vmf->address and
	 * vmf->pte at the last page it handled - confirm.
	 */
	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
	if (!pte_none(*vmf->pte))
		ret = VM_FAULT_NOPAGE;
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	/* Restore the original faulting address and clear the pte pointer. */
	vmf->address = address;
	vmf->pte = NULL;
	return ret;
}
  • 首先计算需要预加载内存大小,nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT,将fault_around_bytes转换成需要多少个page,默认为16个page。
  • mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK: 将fault_around_bytes按照页对齐转换成mask,方便后续地址转换。
  • vmf->address = max(address & mask, vmf->vma->vm_start): 将发生page fault的地址按照mask向下对齐,得到预加载的起始虚拟地址vmf->address,且不能低于vma的起始地址vmf->vma->vm_start。
  • off:计算对齐之后地址与发生page fault地址偏移
  • start_pgoff -= off:减去对齐多余偏移。
  • end_pgoff:按照对齐页对齐计算结束偏移。
  • end_pgoff:根据文件实际大小和配置fault_around_bytes可以预加载页面nr_pages,决定实际结束偏移位置end_pgoff
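  • 如果pmd为空(即还没有pte页表),则需要提前为pte表申请物理内存,存放到vmf->prealloc_pte
  • 以上准备工作完成之后调用vm_ops->map_pages进行页面加载,起始位置为start_pgoff,结束位置为end_pgoff
  • 如果map_pages之后该pmd已经是huge page映射(pmd_trans_huge),说明page fault已经解决,直接返回VM_FAULT_NOPAGE。
  • vmf->pte: 将vmf->pte调整为指向发生page fault地址对应的pte,如果该pte已不为空,说明本次page fault已经被预加载解决,返回VM_FAULT_NOPAGE。
  • pte_unmap_unlock:释放pte页表锁。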
举报

相关推荐

0 条评论