Linux Kernel Internals: buddy (2) — memory allocation
When __alloc_pages_nodemask() allocates memory, it first calls get_page_from_freelist() as a first attempt (the so-called fast path, "fast" only relative to the slow path). If this first attempt fails, the allocation falls through to the __alloc_pages_slowpath() slow path.
get_page_from_freelist()
The interface of get_page_from_freelist() is:
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
Parameters:
- gfp_t gfp_mask: the gfp mask used for this allocation.
- unsigned int order: the order of the request, i.e. 2^order physical pages.
- int alloc_flags: the internal allocation flags, derived from gfp_mask before the call (see the sketch below).
- const struct alloc_context *ac: the allocation context, i.e. the collected parameters of this request.
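For context, a simplified sketch of how __alloc_pages_nodemask() drives this fast-path/slow-path split is shown below. It is abridged and not the verbatim kernel source; prepare_alloc_pages(), alloc_flags_nofragment() and __alloc_pages_slowpath() are the real helpers in mm/page_alloc.c, but error handling and several details are omitted:
/* Abridged sketch of __alloc_pages_nodemask(), mm/page_alloc.c */
struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
				    int preferred_nid, nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;	/* fast path checks the LOW watermark */
	gfp_t alloc_mask = gfp_mask;
	struct alloc_context ac = { };

	/* fill ac (zonelist, preferred_zoneref, migratetype, nodemask, ...) */
	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask,
				 &ac, &alloc_mask, &alloc_flags))
		return NULL;

	/* may add ALLOC_NOFRAGMENT/ALLOC_KSWAPD depending on gfp_mask and zone */
	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

	/* first (fast) attempt */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		return page;

	/* fast path failed: reclaim/compaction/OOM are handled in the slow path */
	return __alloc_pages_slowpath(alloc_mask, order, &ac);
}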
alloc_flags
alloc_flags are the flags used internally by the buddy allocator when allocating; they control parts of the allocation behavior:
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
/*
* Only MMU archs have async oom victim reclaim - aka oom_reaper so we
* cannot assume a reduced access to memory reserves is sufficient for
* !MMU
*/
#ifdef CONFIG_MMU
#define ALLOC_OOM 0x08
#else
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
- ALLOC_WMARK_XXX: selects which zone watermark (min/low/high) the allocation is checked against.
- ALLOC_NO_WATERMARKS: do not check watermarks at all.
- ALLOC_OOM: allow an OOM victim deeper access to memory reserves (on !MMU it falls back to ALLOC_NO_WATERMARKS).
- ALLOC_HARDER: try harder, e.g. the allocation may dip into the MIGRATE_HIGHATOMIC reserve.
- ALLOC_HIGH: same meaning as __GFP_HIGH.
- ALLOC_CPUSET: apply cpuset constraints to the allocation.
- ALLOC_CMA: allow allocating from CMA areas.
- ALLOC_NOFRAGMENT: prefer the no-fallback policy, i.e. avoid stealing pages from pageblocks of other migrate types (external fragmentation); the flag is dropped if the request has to leave the local node.
- ALLOC_KSWAPD: allow waking kswapd when memory runs low (__GFP_KSWAPD_RECLAIM was set).
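How these flags are derived from the gfp mask on the fast path can be illustrated with the following simplified sketch (hypothetical helper name, modeled on alloc_flags_nofragment() in mm/page_alloc.c; the real function performs a few more checks):
/* Sketch: derive ALLOC_KSWAPD/ALLOC_NOFRAGMENT from the gfp mask and zone */
static unsigned int sketch_alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
{
	unsigned int alloc_flags = 0;

	if (gfp_mask & __GFP_KSWAPD_RECLAIM)		/* caller allows waking kswapd */
		alloc_flags |= ALLOC_KSWAPD;

#ifdef CONFIG_ZONE_DMA32
	/*
	 * Only ZONE_NORMAL allocations on systems with a DMA32 zone get the
	 * anti-fragmentation treatment; get_page_from_freelist() drops the
	 * flag again if the local node cannot satisfy the request.
	 */
	if (zone && zone_idx(zone) == ZONE_NORMAL)
		alloc_flags |= ALLOC_NOFRAGMENT;
#endif
	return alloc_flags;
}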
get_page_from_freelist() flow
get_page_from_freelist() is the buddy allocator's first allocation attempt. The core idea: as long as a zone has enough free memory, take physical pages from that zone's free list of the requested order.
get_page_from_freelist() source
Walking through the source code:
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
bool no_fallback;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;
if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}
if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;
/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac->highest_zoneidx, alloc_flags))
goto try_this_zone;
continue;
}
}
try_this_zone:
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}
/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
return NULL;
}
- no_fallback = alloc_flags & ALLOC_NOFRAGMENT: check whether ALLOC_NOFRAGMENT is set; if no_fallback is true, this pass only allocates from the local node and does not spill over to other NUMA nodes.
- z = ac->preferred_zoneref: start from the preferred zone recorded in the alloc_context.
- for_next_zone_zonelist_nodemask(): when the preferred zone cannot satisfy the request, move on to the next zone in the zonelist.
- If cpusets are enabled (cpusets_enabled()), check whether ALLOC_CPUSET is set in alloc_flags.
- With ALLOC_CPUSET set, further check whether the current zone is allowed by the task's cpuset; if not, skip it and take the next zone from the zonelist.
- If the zone passes the cpuset check, or cpusets are not in use, continue.
- If ac->spread_dirty_pages is true and the zone's node has reached its dirty limit, skip to the next zone; otherwise continue.
- If no_fallback is true and the candidate zone is not on the same node as the preferred zone, clear ALLOC_NOFRAGMENT and restart the scan from the preferred zone (locality matters more than fragmentation avoidance).
- wmark_pages: pick the watermark of the current zone selected by the low bits of alloc_flags (see the sketch after this list).
- If the zone is short of memory, i.e. zone_watermark_fast() returns false, memory reclaim is considered.
- If node_reclaim_mode is 0, or reclaiming into this zone is not allowed (zone_allows_reclaim()), the zone is simply skipped and the next one is tried.
- Otherwise node_reclaim() is called to reclaim memory on this node.
- If node_reclaim() frees enough memory to pass the watermark check, the allocation proceeds in this zone; otherwise the next zone is tried. Note that this reclaim path only reclaims from the inactive LRU lists.
- rmqueue(): the routine that actually allocates pages from the chosen zone.
- If every zone in the zonelist fails and no_fallback is true, ALLOC_NOFRAGMENT is cleared from alloc_flags and the whole scan is retried in fallback mode, i.e. allocating from other NUMA nodes (and fragmenting fallbacks) is now allowed.
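For reference, the watermark helpers used above are defined roughly as follows in include/linux/mmzone.h (recent kernels; the exact form may differ slightly between versions):
/* include/linux/mmzone.h (approximate) */
#define min_wmark_pages(z)	(z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z)	(z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z)	(z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i)	(z->_watermark[i] + z->watermark_boost)
Because ALLOC_WMARK_MIN/LOW/HIGH share their values with the WMARK_* indexes, mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) directly picks the watermark requested by the caller.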
rmqueue()
rmqueue() allocates memory from the given zone ("remove from queue": free pages are removed from the free list):
static inline struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
It carries out the actual physical allocation; the logic is as follows (a simplified sketch follows the list):
- If order is 0, a single page is requested; it goes through the per-cpu-pages (pcp) path, which speeds up the whole allocation.
- zone->lock: take the zone lock so that allocations within one zone are serialized.
- If ALLOC_HARDER is set, __rmqueue_smallest() is called first to allocate from the MIGRATE_HIGHATOMIC reserve. MIGRATE_HIGHATOMIC is a reserved region that helps such allocations succeed quickly.
- If ALLOC_HARDER is not set, or the MIGRATE_HIGHATOMIC attempt fails, __rmqueue() is called to allocate from the requested migratetype.
- On failure, the zone lock is released and NULL is returned.
- On success, the allocated pages are checked for hardware problems (bad pages); if a bad page is found the allocation is retried.
- If the pages are fine, the zone lock is released, the zone statistics are updated, and the pages are returned.
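The overall shape of rmqueue() looks roughly like this (abridged sketch based on a ~5.10 kernel; the ZONE_BOOSTED_WATERMARK handling and some details are omitted):
/* Abridged sketch of rmqueue(), mm/page_alloc.c */
static inline struct page *rmqueue(struct zone *preferred_zone,
		struct zone *zone, unsigned int order, gfp_t gfp_flags,
		unsigned int alloc_flags, int migratetype)
{
	unsigned long flags;
	struct page *page;

	if (likely(order == 0))
		/* order-0 requests go through the per-cpu pageset */
		return rmqueue_pcplist(preferred_zone, zone, gfp_flags,
				       migratetype, alloc_flags);

	spin_lock_irqsave(&zone->lock, flags);
	do {
		page = NULL;
		/* ALLOC_HARDER may dip into the MIGRATE_HIGHATOMIC reserve first */
		if (alloc_flags & ALLOC_HARDER)
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);
	} while (page && check_new_pages(page, order));	/* retry if a bad page slipped in */
	spin_unlock(&zone->lock);

	if (!page) {
		local_irq_restore(flags);
		return NULL;
	}

	/* update free-page accounting and allocation statistics */
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));
	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone);
	local_irq_restore(flags);
	return page;
}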
The per_cpu_pageset mechanism
The per-cpu pageset mechanism was introduced in the 2.6 kernel series. Its core idea is that, on a multi-core system, each CPU keeps its own cache of free pages, reducing contention on zone->lock and speeding up allocation. To limit memory waste, the per-cpu cache only holds order-0 pages. The management structures are:
- struct zone gains a struct per_cpu_pageset __percpu *pageset member; the __percpu annotation means each CPU has its own private copy of the data.
- struct per_cpu_pageset is the per-CPU pageset container; its key member is struct per_cpu_pages.
- struct per_cpu_pages holds the order-0 free pages cached by each CPU (see the sketch below):
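Since the original diagram is not reproduced here, the relevant definitions look roughly like this (include/linux/mmzone.h, fields trimmed; the exact layout varies with kernel version and config):
/* include/linux/mmzone.h (abridged) */
struct per_cpu_pages {
	int count;		/* number of pages currently on the lists */
	int high;		/* drain back to buddy once count exceeds this */
	int batch;		/* chunk size for refills/drains against buddy */

	/* one free list per pcp migrate type (unmovable/movable/reclaimable) */
	struct list_head lists[MIGRATE_PCPTYPES];
};

struct per_cpu_pageset {
	struct per_cpu_pages pcp;
	/* per-cpu vmstat and NUMA statistics deltas omitted */
};

struct zone {
	...
	struct per_cpu_pageset __percpu *pageset;	/* one pageset per CPU */
	...
};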
rmqueue_pcplist
rmqueue_pcplist() allocates an order-0 free page from the per-cpu pageset:
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, gfp_t gfp_flags,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long flags;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
return page;
}
- local_irq_save: disable interrupts on the local CPU.
- this_cpu_ptr(zone->pageset)->pcp: fetch this CPU's own per_cpu_pages.
- __rmqueue_pcplist: take a page from the per-cpu free lists.
- zone_statistics: on success, update the zone statistics.
- local_irq_restore: re-enable interrupts.
__rmqueue_pcplist
__rmqueue_pcplist() takes a free page from the per-cpu page lists; the flow is:
- It is fairly simple: when the per-cpu list is empty, rmqueue_bulk() is called to refill it with pcp->batch pages taken from the buddy free lists; those pages are then cached by, and private to, the current CPU.
- When the per-cpu list has free pages, the first entry is taken as the allocated page, removed from the list, and pcp->count is decremented.
- The page is checked for corruption; if it is a bad page, another one is taken.
The code:
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
do {
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
return page;
}
rmqueue_bulk
When the per-cpu lists run out of pages, rmqueue_bulk() is called to refill them with batch order-0 pages taken from the buddy allocator:
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
if (unlikely(page == NULL))
break;
if (unlikely(check_pcp_refill(page)))
continue;
/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return alloced;
}
- This function ultimately calls __rmqueue() to allocate the physical pages.
__rmqueue()
__rmqueue() provides the actual physical allocation together with the migratetype fallback mechanism.
Its flow is fairly simple; the fallback part is implemented by __rmqueue_fallback(), shown here:
/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
if (alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
/*
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
return true;
}
- If CMA is enabled, the migratetype is MIGRATE_MOVABLE, and a large share of the zone's free pages sits in CMA, pages may be taken directly from the CMA area.
- Otherwise __rmqueue_smallest() is called to perform the allocation.
- If __rmqueue_smallest() fails, the migratetype fallback mechanism kicks in and pages are stolen from another suitable migratetype (covered in a later article).
- If stealing also fails, the zone really has no free memory: NULL is returned and get_page_from_freelist() tries the next zone in the zonelist or enters the slow path. A simplified sketch of __rmqueue() follows.
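For completeness, the shape of __rmqueue() itself (abridged sketch from a ~5.10 kernel; the CMA balancing heuristic differs between versions):
/* Abridged sketch of __rmqueue(), mm/page_alloc.c */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
	  unsigned int alloc_flags)
{
	struct page *page;

#ifdef CONFIG_CMA
	/*
	 * Balance movable allocations between regular and CMA areas: if more
	 * than half of the zone's free memory is in CMA, serve the request
	 * from CMA first.
	 */
	if ((alloc_flags & ALLOC_CMA) &&
	    zone_page_state(zone, NR_FREE_CMA_PAGES) >
	    zone_page_state(zone, NR_FREE_PAGES) / 2) {
		page = __rmqueue_cma_fallback(zone, order);
		if (page)
			return page;
	}
#endif
retry:
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		/* steal from another migratetype, then retry the free lists */
		if (__rmqueue_fallback(zone, order, migratetype, alloc_flags))
			goto retry;
	}
	return page;
}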
__rmqueue_smallest
__rmqueue_smallest() is the last step of the allocation; after all the earlier checks it is very likely to succeed:
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
- Starting from the requested order, try to take a page block from that order's free list of the given migratetype.
- If the requested order has no free block, try the next higher order.
- On success, del_page_from_free_list() removes the block from the free list.
- expand(): if the block came from a higher order than requested, the buddy block is split and the surplus halves are returned to the free lists (see the sketch after this list).
- set_pcppage_migratetype(): record the block's migratetype in page->index (note that the field is reused for this purpose).
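The splitting done by expand() works roughly as follows (abridged sketch; helper names such as add_to_free_list()/set_buddy_order() match recent kernels but vary a little between versions):
/* Abridged sketch of expand(): split a 2^high block down to 2^low,
 * returning the upper halves to the free lists on the way down. */
static inline void expand(struct zone *zone, struct page *page,
			  int low, int high, int migratetype)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		/* the upper half [page + size, page + 2*size) goes back to order "high" */
		add_to_free_list(&page[size], zone, high, migratetype);
		set_buddy_order(&page[size], high);
	}
	/* the first 2^low pages starting at "page" are handed to the caller */
}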
Inspecting per_cpu_pageset information
/proc/zoneinfo
Each zone's per-CPU pageset can be inspected with cat /proc/zoneinfo, which shows the count, high and batch values of every CPU's per_cpu_pages.
An example is shown below:
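The pageset portion of one zone's /proc/zoneinfo entry looks roughly like this (values are illustrative, not from any particular machine):
Node 0, zone   Normal
  ...
  pagesets
    cpu: 0
              count: 167
              high:  378
              batch: 63
    cpu: 1
              count: 252
              high:  378
              batch: 63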
percpu_pagelist_fraction
The high and batch values of per_cpu_pages can be set and read through /proc/sys/vm/percpu_pagelist_fraction. The fraction says into how many equal shares a zone's managed pages are divided: high is set to zone_managed_pages(zone) / percpu_pagelist_fraction, and batch is high/4, capped at PAGE_SHIFT * 8. A sketch of the calculation follows.
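The corresponding kernel logic is roughly the following (abridged sketch of the sysctl path in mm/page_alloc.c for kernels that still have this sysctl; it was removed in later releases):
/* Abridged sketch: how percpu_pagelist_fraction maps to high/batch */
static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high)
{
	unsigned long batch = max(1UL, high / 4);

	if ((high / 4) > (PAGE_SHIFT * 8))
		batch = PAGE_SHIFT * 8;	/* cap batch */

	pageset_update(&p->pcp, high, batch);
}

static void pageset_set_high_and_batch(struct zone *zone,
				       struct per_cpu_pageset *pcp)
{
	if (percpu_pagelist_fraction)
		/* sysctl set: high = managed pages / fraction, batch = high/4 */
		pageset_set_high(pcp, zone_managed_pages(zone) /
					percpu_pagelist_fraction);
	else
		/* default: batch from zone_batchsize(), high = 6 * batch */
		pageset_set_batch(pcp, zone_batchsize(zone));
}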
Default batch value
zone_batchsize() computes the default batch size at boot:
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
/*
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone.
*/
batch = zone_managed_pages(zone) / 1024;
/* But no more than a meg. */
if (batch * PAGE_SIZE > 1024 * 1024)
batch = (1024 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
* of 2 value was found to be more likely to have
* suboptimal cache aliasing properties in some cases.
*
* For example if 2 tasks are alternately allocating
* batches of pages, one task can end up with a lot
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
batch = rounddown_pow_of_two(batch + batch/2) - 1;
return batch;
#else
/* The deferral and batching of frees should be suppressed under NOMMU
* conditions.
*
* The problem is that NOMMU needs to be able to allocate large chunks
* of contiguous memory as there's no hardware page translation to
* assemble apparent contiguous memory from discontiguous pages.
*
* Queueing large contiguous runs of pages for batching, however,
* causes the pages to actually be freed in smaller chunks. As there
* can be a significant delay between the individual batches being
* recycled, this leads to the once large chunks of space being
* fragmented and becoming unavailable for high-order allocations.
*/
return 0;
#endif
}
The default high is six times batch (high = 6 * batch):
/* a companion to pageset_set_high() */
static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
{
pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
}