Linux Kernel Internals: buddy (2) — memory allocation
When __alloc_pages_nodemask() allocates memory, it first calls get_page_from_freelist() as a first attempt (the so-called fast path, "fast" only relative to the slow path). If this first attempt fails, the allocation falls through to the __alloc_pages_slowpath() slow path.
get_page_from_freelist()
The interface of get_page_from_freelist() is:
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
Parameters:
- gfp_t gfp_mask: the gfp mask used for this allocation.
- unsigned int order: the order of the request, i.e. 2^order physical pages.
- int alloc_flags: the internal allocation flags, derived from gfp_mask before the call (see the sketch below).
- const struct alloc_context *ac: the allocation context, i.e. the collected parameters of this request.
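For context, a simplified sketch of how __alloc_pages_nodemask() drives this fast-path/slow-path split is shown below. It is abridged and not the verbatim kernel source; prepare_alloc_pages(), alloc_flags_nofragment() and __alloc_pages_slowpath() are the real helpers in mm/page_alloc.c, but error handling and several details are omitted:
/* Abridged sketch of __alloc_pages_nodemask(), mm/page_alloc.c */
struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
				    int preferred_nid, nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;	/* fast path checks the LOW watermark */
	gfp_t alloc_mask = gfp_mask;
	struct alloc_context ac = { };

	/* fill ac (zonelist, preferred_zoneref, migratetype, nodemask, ...) */
	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask,
				 &ac, &alloc_mask, &alloc_flags))
		return NULL;

	/* may add ALLOC_NOFRAGMENT/ALLOC_KSWAPD depending on gfp_mask and zone */
	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

	/* first (fast) attempt */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		return page;

	/* fast path failed: reclaim/compaction/OOM are handled in the slow path */
	return __alloc_pages_slowpath(alloc_mask, order, &ac);
}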
alloc_flags
alloc_flags are the flags used internally by the buddy allocator when allocating; they control parts of the allocation behavior:
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
/*
* Only MMU archs have async oom victim reclaim - aka oom_reaper so we
* cannot assume a reduced access to memory reserves is sufficient for
* !MMU
*/
#ifdef CONFIG_MMU
#define ALLOC_OOM 0x08
#else
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
- ALLOC_WMARK_XXX: selects which zone watermark (min/low/high) the allocation is checked against.
- ALLOC_NO_WATERMARKS: do not check watermarks at all.
- ALLOC_OOM: allow an OOM victim deeper access to memory reserves (on !MMU it falls back to ALLOC_NO_WATERMARKS).
- ALLOC_HARDER: try harder, e.g. the allocation may dip into the MIGRATE_HIGHATOMIC reserve.
- ALLOC_HIGH: same meaning as __GFP_HIGH.
- ALLOC_CPUSET: apply cpuset constraints to the allocation.
- ALLOC_CMA: allow allocating from CMA areas.
- ALLOC_NOFRAGMENT: prefer the no-fallback policy, i.e. avoid stealing pages from pageblocks of other migrate types (external fragmentation); the flag is dropped if the request has to leave the local node.
- ALLOC_KSWAPD: allow waking kswapd when memory runs low (__GFP_KSWAPD_RECLAIM was set).
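How these flags are derived from the gfp mask on the fast path can be illustrated with the following simplified sketch (hypothetical helper name, modeled on alloc_flags_nofragment() in mm/page_alloc.c; the real function performs a few more checks):
/* Sketch: derive ALLOC_KSWAPD/ALLOC_NOFRAGMENT from the gfp mask and zone */
static unsigned int sketch_alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
{
	unsigned int alloc_flags = 0;

	if (gfp_mask & __GFP_KSWAPD_RECLAIM)		/* caller allows waking kswapd */
		alloc_flags |= ALLOC_KSWAPD;

#ifdef CONFIG_ZONE_DMA32
	/*
	 * Only ZONE_NORMAL allocations on systems with a DMA32 zone get the
	 * anti-fragmentation treatment; get_page_from_freelist() drops the
	 * flag again if the local node cannot satisfy the request.
	 */
	if (zone && zone_idx(zone) == ZONE_NORMAL)
		alloc_flags |= ALLOC_NOFRAGMENT;
#endif
	return alloc_flags;
}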
get_page_from_freelist() flow
get_page_from_freelist() is the buddy allocator's first allocation attempt. The core idea: as long as a zone has enough free memory, take physical pages from that zone's free list of the requested order.
get_page_from_freelist() source
Walking through the source code:
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
bool no_fallback;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;
if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}
if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;
/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac->highest_zoneidx, alloc_flags))
goto try_this_zone;
continue;
}
}
try_this_zone:
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}
/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
return NULL;
}
- no_fallback = alloc_flags & ALLOC_NOFRAGMENT: check whether ALLOC_NOFRAGMENT is set; if no_fallback is true, this pass only allocates from the local node and does not spill over to other NUMA nodes.
- z = ac->preferred_zoneref: start from the preferred zone recorded in the alloc_context.
- for_next_zone_zonelist_nodemask(): when the preferred zone cannot satisfy the request, move on to the next zone in the zonelist.
- If cpusets are enabled (cpusets_enabled()), check whether ALLOC_CPUSET is set in alloc_flags.
- With ALLOC_CPUSET set, further check whether the current zone is allowed by the task's cpuset; if not, skip it and take the next zone from the zonelist.
- If the zone passes the cpuset check, or cpusets are not in use, continue.
- If ac->spread_dirty_pages is true and the zone's node has reached its dirty limit, skip to the next zone; otherwise continue.
- If no_fallback is true and the candidate zone is not on the same node as the preferred zone, clear ALLOC_NOFRAGMENT and restart the scan from the preferred zone (locality matters more than fragmentation avoidance).
- wmark_pages: pick the watermark of the current zone selected by the low bits of alloc_flags (see the sketch after this list).
- If the zone is short of memory, i.e. zone_watermark_fast() returns false, memory reclaim is considered.
- If node_reclaim_mode is 0, or reclaiming into this zone is not allowed (zone_allows_reclaim()), the zone is simply skipped and the next one is tried.
- Otherwise node_reclaim() is called to reclaim memory on this node.
- If node_reclaim() frees enough memory to pass the watermark check, the allocation proceeds in this zone; otherwise the next zone is tried. Note that this reclaim path only reclaims from the inactive LRU lists.
- rmqueue(): the routine that actually allocates pages from the chosen zone.
- If every zone in the zonelist fails and no_fallback is true, ALLOC_NOFRAGMENT is cleared from alloc_flags and the whole scan is retried in fallback mode, i.e. allocating from other NUMA nodes (and fragmenting fallbacks) is now allowed.
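For reference, the watermark helpers used above are defined roughly as follows in include/linux/mmzone.h (recent kernels; the exact form may differ slightly between versions):
/* include/linux/mmzone.h (approximate) */
#define min_wmark_pages(z)	(z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z)	(z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z)	(z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i)	(z->_watermark[i] + z->watermark_boost)
Because ALLOC_WMARK_MIN/LOW/HIGH share their values with the WMARK_* indexes, mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) directly picks the watermark requested by the caller.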
rmqueue()
rmqueue() allocates memory from the given zone ("remove from queue": free pages are removed from the free list):
static inline struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
It carries out the actual physical allocation; the logic is as follows (a simplified sketch follows the list):
- If order is 0, a single page is requested; it goes through the per-cpu-pages (pcp) path, which speeds up the whole allocation.
- zone->lock: take the zone lock so that allocations within one zone are serialized.
- If ALLOC_HARDER is set, __rmqueue_smallest() is called first to allocate from the MIGRATE_HIGHATOMIC reserve. MIGRATE_HIGHATOMIC is a reserved region that helps such allocations succeed quickly.
- If ALLOC_HARDER is not set, or the MIGRATE_HIGHATOMIC attempt fails, __rmqueue() is called to allocate from the requested migratetype.
- On failure, the zone lock is released and NULL is returned.
- On success, the allocated pages are checked for hardware problems (bad pages); if a bad page is found the allocation is retried.
- If the pages are fine, the zone lock is released, the zone statistics are updated, and the pages are returned.
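The overall shape of rmqueue() looks roughly like this (abridged sketch based on a ~5.10 kernel; the ZONE_BOOSTED_WATERMARK handling and some details are omitted):
/* Abridged sketch of rmqueue(), mm/page_alloc.c */
static inline struct page *rmqueue(struct zone *preferred_zone,
		struct zone *zone, unsigned int order, gfp_t gfp_flags,
		unsigned int alloc_flags, int migratetype)
{
	unsigned long flags;
	struct page *page;

	if (likely(order == 0))
		/* order-0 requests go through the per-cpu pageset */
		return rmqueue_pcplist(preferred_zone, zone, gfp_flags,
				       migratetype, alloc_flags);

	spin_lock_irqsave(&zone->lock, flags);
	do {
		page = NULL;
		/* ALLOC_HARDER may dip into the MIGRATE_HIGHATOMIC reserve first */
		if (alloc_flags & ALLOC_HARDER)
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);
	} while (page && check_new_pages(page, order));	/* retry if a bad page slipped in */
	spin_unlock(&zone->lock);

	if (!page) {
		local_irq_restore(flags);
		return NULL;
	}

	/* update free-page accounting and allocation statistics */
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));
	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone);
	local_irq_restore(flags);
	return page;
}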
The per_cpu_pageset mechanism
The per-cpu pageset mechanism was introduced in the 2.6 kernel series. Its core idea is that, on a multi-core system, each CPU keeps its own cache of free pages, reducing contention on zone->lock and speeding up allocation. To limit memory waste, the per-cpu cache only holds order-0 pages. The management structures are:
- struct zone gains a struct per_cpu_pageset __percpu *pageset member; the __percpu annotation means each CPU has its own private copy of the data.
- struct per_cpu_pageset is the per-CPU pageset container; its key member is struct per_cpu_pages.
- struct per_cpu_pages holds the order-0 free pages cached by each CPU (see the sketch below):
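Since the original diagram is not reproduced here, the relevant definitions look roughly like this (include/linux/mmzone.h, fields trimmed; the exact layout varies with kernel version and config):
/* include/linux/mmzone.h (abridged) */
struct per_cpu_pages {
	int count;		/* number of pages currently on the lists */
	int high;		/* drain back to buddy once count exceeds this */
	int batch;		/* chunk size for refills/drains against buddy */

	/* one free list per pcp migrate type (unmovable/movable/reclaimable) */
	struct list_head lists[MIGRATE_PCPTYPES];
};

struct per_cpu_pageset {
	struct per_cpu_pages pcp;
	/* per-cpu vmstat and NUMA statistics deltas omitted */
};

struct zone {
	...
	struct per_cpu_pageset __percpu *pageset;	/* one pageset per CPU */
	...
};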
rmqueue_pcplist
rmqueue_pcplist() allocates an order-0 free page from the per-cpu pageset:
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, gfp_t gfp_flags,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long flags;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
return page;
}
- local_irq_save: disable interrupts on the local CPU.
- this_cpu_ptr(zone->pageset)->pcp: fetch this CPU's own per_cpu_pages.
- __rmqueue_pcplist: take a page from the per-cpu free lists.
- zone_statistics: on success, update the zone statistics.
- local_irq_restore: re-enable interrupts.
__rmqueue_pcplist
__rmqueue_pcplist() takes a free page from the per-cpu page lists; the flow is:
- It is fairly simple: when the per-cpu list is empty, rmqueue_bulk() is called to refill it with pcp->batch pages taken from the buddy free lists; those pages are then cached by, and private to, the current CPU.
- When the per-cpu list has free pages, the first entry is taken as the allocated page, removed from the list, and pcp->count is decremented.
- The page is checked for corruption; if it is a bad page, another one is taken.
The code:
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
do {
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
return page;
}
rmqueue_bulk
When the per-cpu lists run out of pages, rmqueue_bulk() is called to refill them with batch order-0 pages taken from the buddy allocator:
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
if (unlikely(page == NULL))
break;
if (unlikely(check_pcp_refill(page)))
continue;
/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return alloced;
}
- This function ultimately calls __rmqueue() to allocate the physical pages.
__rmqueue()
__rmqueue() provides the actual physical allocation together with the migratetype fallback mechanism.
Its flow is fairly simple; the fallback part is implemented by __rmqueue_fallback(), shown here:
/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
if (alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
/*
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
return true;
}
- If CMA is enabled, the migratetype is MIGRATE_MOVABLE, and a large share of the zone's free pages sits in CMA, pages may be taken directly from the CMA area.
- Otherwise __rmqueue_smallest() is called to perform the allocation.
- If __rmqueue_smallest() fails, the migratetype fallback mechanism kicks in and pages are stolen from another suitable migratetype (covered in a later article).
- If stealing also fails, the zone really has no free memory: NULL is returned and get_page_from_freelist() tries the next zone in the zonelist or enters the slow path. A simplified sketch of __rmqueue() follows.
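For completeness, the shape of __rmqueue() itself (abridged sketch from a ~5.10 kernel; the CMA balancing heuristic differs between versions):
/* Abridged sketch of __rmqueue(), mm/page_alloc.c */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
	  unsigned int alloc_flags)
{
	struct page *page;

#ifdef CONFIG_CMA
	/*
	 * Balance movable allocations between regular and CMA areas: if more
	 * than half of the zone's free memory is in CMA, serve the request
	 * from CMA first.
	 */
	if ((alloc_flags & ALLOC_CMA) &&
	    zone_page_state(zone, NR_FREE_CMA_PAGES) >
	    zone_page_state(zone, NR_FREE_PAGES) / 2) {
		page = __rmqueue_cma_fallback(zone, order);
		if (page)
			return page;
	}
#endif
retry:
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		/* steal from another migratetype, then retry the free lists */
		if (__rmqueue_fallback(zone, order, migratetype, alloc_flags))
			goto retry;
	}
	return page;
}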
__rmqueue_smallest
__rmqueue_smallest() is the last step of the allocation; after all the earlier checks it is very likely to succeed:
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
- Starting from the requested order, try to take a page block from that order's free list of the given migratetype.
- If the requested order has no free block, try the next higher order.
- On success, del_page_from_free_list() removes the block from the free list.
- expand(): if the block came from a higher order than requested, the buddy block is split and the surplus halves are returned to the free lists (see the sketch after this list).
- set_pcppage_migratetype(): record the block's migratetype in page->index (note that the field is reused for this purpose).
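The splitting done by expand() works roughly as follows (abridged sketch; helper names such as add_to_free_list()/set_buddy_order() match recent kernels but vary a little between versions):
/* Abridged sketch of expand(): split a 2^high block down to 2^low,
 * returning the upper halves to the free lists on the way down. */
static inline void expand(struct zone *zone, struct page *page,
			  int low, int high, int migratetype)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		/* the upper half [page + size, page + 2*size) goes back to order "high" */
		add_to_free_list(&page[size], zone, high, migratetype);
		set_buddy_order(&page[size], high);
	}
	/* the first 2^low pages starting at "page" are handed to the caller */
}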
Inspecting per_cpu_pageset information
/proc/zoneinfo
Each zone's per-CPU pageset can be inspected with cat /proc/zoneinfo, which shows the count, high and batch values of every CPU's per_cpu_pages.
An example is shown below:
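The pageset portion of one zone's /proc/zoneinfo entry looks roughly like this (values are illustrative, not from any particular machine):
Node 0, zone   Normal
  ...
  pagesets
    cpu: 0
              count: 167
              high:  378
              batch: 63
    cpu: 1
              count: 252
              high:  378
              batch: 63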
percpu_pagelist_fraction
The high and batch values of per_cpu_pages can be set and read through /proc/sys/vm/percpu_pagelist_fraction. The fraction says into how many equal shares a zone's managed pages are divided: high is set to zone_managed_pages(zone) / percpu_pagelist_fraction, and batch is high/4, capped at PAGE_SHIFT * 8. A sketch of the calculation follows.
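The corresponding kernel logic is roughly the following (abridged sketch of the sysctl path in mm/page_alloc.c for kernels that still have this sysctl; it was removed in later releases):
/* Abridged sketch: how percpu_pagelist_fraction maps to high/batch */
static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high)
{
	unsigned long batch = max(1UL, high / 4);

	if ((high / 4) > (PAGE_SHIFT * 8))
		batch = PAGE_SHIFT * 8;	/* cap batch */

	pageset_update(&p->pcp, high, batch);
}

static void pageset_set_high_and_batch(struct zone *zone,
				       struct per_cpu_pageset *pcp)
{
	if (percpu_pagelist_fraction)
		/* sysctl set: high = managed pages / fraction, batch = high/4 */
		pageset_set_high(pcp, zone_managed_pages(zone) /
					percpu_pagelist_fraction);
	else
		/* default: batch from zone_batchsize(), high = 6 * batch */
		pageset_set_batch(pcp, zone_batchsize(zone));
}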
Default batch value
zone_batchsize() computes the default batch size at boot:
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
/*
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone.
*/
batch = zone_managed_pages(zone) / 1024;
/* But no more than a meg. */
if (batch * PAGE_SIZE > 1024 * 1024)
batch = (1024 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
* of 2 value was found to be more likely to have
* suboptimal cache aliasing properties in some cases.
*
* For example if 2 tasks are alternately allocating
* batches of pages, one task can end up with a lot
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
batch = rounddown_pow_of_two(batch + batch/2) - 1;
return batch;
#else
/* The deferral and batching of frees should be suppressed under NOMMU
* conditions.
*
* The problem is that NOMMU needs to be able to allocate large chunks
* of contiguous memory as there's no hardware page translation to
* assemble apparent contiguous memory from discontiguous pages.
*
* Queueing large contiguous runs of pages for batching, however,
* causes the pages to actually be freed in smaller chunks. As there
* can be a significant delay between the individual batches being
* recycled, this leads to the once large chunks of space being
* fragmented and becoming unavailable for high-order allocations.
*/
return 0;
#endif
}
The default high is six times batch (high = 6 * batch):
/* a companion to pageset_set_high() */
static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
{
pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
}