Kernel: arm64 kernel-5.0
To follow a code path, first be clear about its inputs and outputs: the inputs are usually the parameters, and the outputs are the return value plus any output parameters. Memory reclaim is triggered from the memory-allocation path, so let's first take a quick look at how allocation works.

__alloc_pages_nodemask

Function prototype

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)

Parameters

The parameters come from the driver layers; for example, the block layer calls blk_alloc_queue_node(GFP_KERNEL, set->numa_node) to allocate a request_queue. The call chain is blk_alloc_queue_node -> kmem_cache_alloc_node -> slab_alloc_node -> ____cache_alloc -> cache_alloc_refill -> cache_grow_begin -> kmem_getpages -> __alloc_pages_node -> __alloc_pages -> __alloc_pages_nodemask.

In this case gfp_mask is GFP_KERNEL and nodemask is NULL.

#define GFP_KERNEL   (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define ___GFP_DIRECT_RECLAIM   0x200000u
#define ___GFP_KSWAPD_RECLAIM   0x400000u
#define ___GFP_IO       0x40u
#define ___GFP_FS       0x80u

So GFP_KERNEL evaluates to 0x6000c0.
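As a quick sanity check of that value, here is a small stand-alone snippet (the macro values are copied from the gfp.h excerpts above rather than included from kernel headers):

#include <stdio.h>

/* Values copied from the gfp.h excerpts above (not a kernel build). */
#define ___GFP_DIRECT_RECLAIM   0x200000u
#define ___GFP_KSWAPD_RECLAIM   0x400000u
#define ___GFP_IO               0x40u
#define ___GFP_FS               0x80u

#define __GFP_RECLAIM (___GFP_DIRECT_RECLAIM | ___GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL    (__GFP_RECLAIM | ___GFP_IO | ___GFP_FS)

int main(void)
{
    /* Prints 0x6000c0, matching the value quoted in the text. */
    printf("GFP_KERNEL = 0x%x\n", GFP_KERNEL);
    return 0;
}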

Flow

{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW;
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };

    if (unlikely(order >= MAX_ORDER)) {
        WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
        return NULL;
    }

    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask,
                             &ac, &alloc_mask, &alloc_flags))
        return NULL;

    finalise_ac(gfp_mask, &ac);

    alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
    /* When ___GFP_KSWAPD_RECLAIM is set in gfp_mask, ALLOC_KSWAPD gets set in
     * alloc_flags here, i.e. kswapd is allowed to be woken; the NOFRAGMENT
     * part is meant to avoid memory fragmentation. */

    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (likely(page))
        goto out;

    alloc_mask = current_gfp_context(gfp_mask);
    ac.spread_dirty_pages = false;

    if (unlikely(ac.nodemask != nodemask))
        ac.nodemask = nodemask;

    page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
    if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
        unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
        __free_pages(page, order);
        page = NULL;
    }

    trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

    return page;
}

The overall flow is straightforward: alloc_flags is initialised to ALLOC_WMARK_LOW, and ac is filled in by prepare_alloc_pages and finalise_ac. The fast path is tried first; if it fails to allocate, the slow path is entered.

prepare_alloc_pages

prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)

{
    ac->high_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfpflags_to_migratetype(gfp_mask);

    if (cpusets_enabled()) {
        *alloc_mask |= __GFP_HARDWALL;
        if (!ac->nodemask)
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

    if (should_fail_alloc_page(gfp_mask, order))
        return false;

    if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
        *alloc_flags |= ALLOC_CMA;

    return true;
}
ac->high_zoneidx = gfp_zone(gfp_mask);
/* high_zoneidx is determined from the low 4 bits of gfp_mask; for GFP_KERNEL it
   comes out as 1 (ZONE_NORMAL here). For the exact lookup see
   https://blog.csdn.net/qqqqqq999999/article/details/90045500 */
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
// Get the node's zonelist. Under NUMA there are separate ZONELIST_NOFALLBACK and ZONELIST_FALLBACK lists; this article assumes CONFIG_NUMA is not defined throughout.
// The zonelist is built at boot by build_zonelists(), which orders the zones by descending zone type. With two zones, DMA32 and NORMAL, the zonelist looks like: zonerefs[0]->zone_idx = 1 (ZONE_NORMAL), zonerefs[1]->zone_idx = 0 (ZONE_DMA32).
ac->nodemask = nodemask; //NULL
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
// Takes bits 3-4 (counting from 0) of gfp_mask; for GFP_KERNEL the result is 0, i.e. the migrate type is MIGRATE_UNMOVABLE (see the sketch below).
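To make the bit-twiddling above concrete, the following user-space sketch extracts the zone-modifier bits and the migrate-type bits from GFP_KERNEL. Note that the real gfp_zone() goes through the GFP_ZONE_TABLE lookup; this only shows the raw bit fields, with the mask values copied from gfp.h:

#include <stdio.h>

#define GFP_KERNEL        0x6000c0u
#define GFP_ZONEMASK      0x0fu   /* low 4 bits select the zone modifier */
#define GFP_MOVABLE_MASK  0x18u   /* bits 3-4 (__GFP_RECLAIMABLE|__GFP_MOVABLE) */
#define GFP_MOVABLE_SHIFT 3

int main(void)
{
    /* GFP_KERNEL has no zone modifier bits set ... */
    printf("zone modifier bits = 0x%x\n", GFP_KERNEL & GFP_ZONEMASK);   /* 0 */
    /* ... and bits 3-4 are 0, i.e. MIGRATE_UNMOVABLE */
    printf("migratetype        = %u\n",
           (GFP_KERNEL & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT);       /* 0 */
    return 0;
}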

finalise_ac

finalise_ac(gfp_mask, &ac);

{
    ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* false */

    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->high_zoneidx, ac->nodemask);
}

first_zones_zonelist returns the first zoneref whose zone index is less than or equal to high_zoneidx, which in this case is zoneref[0] (ZONE_NORMAL).
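A toy user-space model of the zonelist described above may help; it is not kernel code, just the same ordering and the first_zones_zonelist selection rule:

#include <stdio.h>

/* Toy model of the non-NUMA zonelist: build_zonelists places zones in
 * reverse zone-type order, so with ZONE_DMA32 (idx 0) and ZONE_NORMAL
 * (idx 1) the list starts with NORMAL. */
struct zoneref { int zone_idx; const char *name; };

static struct zoneref zonelist[] = {
    { 1, "ZONE_NORMAL" },
    { 0, "ZONE_DMA32"  },
};

/* Simplified first_zones_zonelist: first entry with zone_idx <= high_zoneidx */
static struct zoneref *first_zones_zonelist_sketch(int high_zoneidx)
{
    for (int i = 0; i < 2; i++)
        if (zonelist[i].zone_idx <= high_zoneidx)
            return &zonelist[i];
    return NULL;
}

int main(void)
{
    /* GFP_KERNEL gives high_zoneidx == 1, so the preferred zone is NORMAL */
    printf("preferred zone: %s\n", first_zones_zonelist_sketch(1)->name);
    return 0;
}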

get_page_from_freelist

Function prototype
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
Here gfp_mask is GFP_KERNEL (0x6000c0) and alloc_flags is 0x201 (check against the low watermark, kswapd allowed to be woken).

{
    struct zoneref *z;
    struct zone *zone;
    struct pglist_data *last_pgdat_dirty_limit = NULL;
    bool no_fallback;

retry:
    no_fallback = alloc_flags & ALLOC_NOFRAGMENT; /* 0 */
    z = ac->preferred_zoneref; /* ZONE_NORMAL */
    for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                    ac->nodemask) { /* walk the zonelist */
        struct page *page;
        unsigned long mark;

        if (cpusets_enabled() &&
            (alloc_flags & ALLOC_CPUSET) &&
            !__cpuset_zone_allowed(zone, gfp_mask))
                continue;

        if (ac->spread_dirty_pages) { /* false here, skipped */
            if (last_pgdat_dirty_limit == zone->zone_pgdat)
                continue;

            if (!node_dirty_ok(zone->zone_pgdat)) {
                last_pgdat_dirty_limit = zone->zone_pgdat;
                continue;
            }
        }

        if (no_fallback && nr_online_nodes > 1 && /* no_fallback is 0, skipped */
            zone != ac->preferred_zoneref->zone) {
            int local_nid;

            local_nid = zone_to_nid(ac->preferred_zoneref->zone);
            if (zone_to_nid(zone) != local_nid) {
                alloc_flags &= ~ALLOC_NOFRAGMENT;
                goto retry;
            }
        }

        /* mark is the low watermark here */
        mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
        if (!zone_watermark_fast(zone, order, mark,
                       ac_classzone_idx(ac), alloc_flags)) {
            int ret;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
            /*
             * Watermark failed for this zone, but see if we can
             * grow this zone if it contains deferred pages.
             */
            if (static_branch_unlikely(&deferred_pages)) {
                if (_deferred_grow_zone(zone, order))
                    goto try_this_zone;
            }
#endif
            /* Checked here to keep the fast path fast */
            BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                goto try_this_zone;

            if (node_reclaim_mode == 0 || /* defaults to 0, and only exists with CONFIG_NUMA */
                !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                continue;

            /* reclaim memory from this node */
            ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
            switch (ret) {
            case NODE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case NODE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                        ac_classzone_idx(ac), alloc_flags))
                    goto try_this_zone;

                continue;
            }
        }

try_this_zone:
        page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                gfp_mask, alloc_flags, ac->migratetype);
        if (page) {
            prep_new_page(page, order, gfp_mask, alloc_flags);

            if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
                reserve_highatomic_pageblock(page, zone, order);

            return page;
        } else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
            /* Try again if zone has deferred pages */
            if (static_branch_unlikely(&deferred_pages)) {
                if (_deferred_grow_zone(zone, order))
                    goto try_this_zone;
            }
#endif
        }
    }

    if (no_fallback) {
        alloc_flags &= ~ALLOC_NOFRAGMENT;
        goto retry;
    }

    return NULL;
}

node_dirty_ok

Checks whether the number of dirty pages plus pages under writeback is below the limit.

bool node_dirty_ok(struct pglist_data *pgdat)
{
    unsigned long limit = node_dirty_limit(pgdat);
    unsigned long nr_pages = 0;

    nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
    nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
    nr_pages += node_page_state(pgdat, NR_WRITEBACK);

    return nr_pages <= limit;
}

node_dirtyable_memory returns (free pages of all zones in the node + the node's file pages - the node's reserved memory).

static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
    unsigned long nr_pages = 0;
    int z;

    for (z = 0; z < MAX_NR_ZONES; z++) {
        struct zone *zone = pgdat->node_zones + z;

        if (!populated_zone(zone))
            continue;

        nr_pages += zone_page_state(zone, NR_FREE_PAGES);
    }

    /*
     * Pages reserved for the kernel should not be considered
     * dirtyable, to prevent a situation where reclaim has to
     * clean pages in order to balance the zones.
     */
    /* TODO: how is totalreserve_pages computed? */
    nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

    nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
    nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

    return nr_pages;
}

The dirty limit is computed from vm_dirty_bytes (used first, if set) or vm_dirty_ratio; if the current task is a real-time task (or has PF_LESS_THROTTLE), the limit becomes dirty * 1.25.

static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
    unsigned long node_memory = node_dirtyable_memory(pgdat);
    struct task_struct *tsk = current;
    unsigned long dirty;

    if (vm_dirty_bytes)
        dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
            node_memory / global_dirtyable_memory();
    else
        dirty = vm_dirty_ratio * node_memory / 100;

    if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
        dirty += dirty / 4;

    return dirty;
}
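A worked example of this arithmetic, with made-up numbers for the node's dirtyable memory and vm_dirty_ratio (only the vm_dirty_ratio branch is modelled):

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical numbers just to illustrate node_dirty_limit(): a 20% dirty
 * ratio over ~700000 dirtyable pages, plus the 25% bonus given to
 * PF_LESS_THROTTLE / real-time tasks. */
static unsigned long node_dirty_limit_sketch(unsigned long node_memory,
                                             unsigned int vm_dirty_ratio,
                                             bool rt_task)
{
    unsigned long dirty = vm_dirty_ratio * node_memory / 100;

    if (rt_task)
        dirty += dirty / 4;     /* dirty * 1.25 */
    return dirty;
}

int main(void)
{
    printf("normal task: %lu pages\n", node_dirty_limit_sketch(700000, 20, false)); /* 140000 */
    printf("rt task:     %lu pages\n", node_dirty_limit_sketch(700000, 20, true));  /* 175000 */
    return 0;
}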

zone_watermark_fast

Function prototype
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, unsigned int alloc_flags)
z is the zone currently being iterated over, mark here is the number of pages corresponding to the low watermark, and classzone_idx is the index of the preferred zone, which is 1 here.

{
    long free_pages = zone_page_state(z, NR_FREE_PAGES);
    long cma_pages = 0;

#ifdef CONFIG_CMA
    /* If allocation can't use CMA areas don't use free CMA pages */
    if (!(alloc_flags & ALLOC_CMA)) /* ALLOC_CMA (0x80) is not set here */
        cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

    if (!order && (free_pages - cma_pages) >
                mark + z->lowmem_reserve[classzone_idx])
        return true;

    return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                    free_pages);
}

Order-0 requests get a fast check: if the zone's (free pages - free CMA pages) is greater than (the low watermark + this zone's lowmem_reserve for the preferred zone), return true.

Each zone has a lowmem_reserve array that keeps higher zones from using up too much of a lower zone's memory. For instance, if the preferred zone is NORMAL (idx 1) and the zone currently being checked is DMA32, the watermark check has to ensure the free pages exceed the low watermark plus DMA32's lowmem_reserve[1].
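The following stand-alone sketch replays the order-0 fast check with invented numbers, to show how lowmem_reserve raises the bar when a lower zone is checked on behalf of a higher preferred zone:

#include <stdio.h>
#include <stdbool.h>

/* Illustration of the order-0 fast check in zone_watermark_fast() with
 * made-up numbers: a DMA32 zone checked for an allocation whose preferred
 * zone is NORMAL (classzone_idx = 1). */
static bool watermark_fast_sketch(long free_pages, long free_cma,
                                  long low_wmark, long lowmem_reserve)
{
    /* free pages (minus CMA we may not use) must exceed the low watermark
     * plus the reserve kept back for higher zones */
    return (free_pages - free_cma) > (low_wmark + lowmem_reserve);
}

int main(void)
{
    /* 30000 free pages, 2000 of them CMA, low watermark 8000,
     * lowmem_reserve[1] = 16000 pages held back from NORMAL users */
    printf("%s\n", watermark_fast_sketch(30000, 2000, 8000, 16000)
            ? "watermark ok" : "watermark failed");   /* ok: 28000 > 24000 */
    return 0;
}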

bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
             int classzone_idx, unsigned int alloc_flags,
             long free_pages)
{
    long min = mark;
    int o;
    const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));

    /* free_pages may go negative - that's OK */
    free_pages -= (1 << order) - 1;

    if (alloc_flags & ALLOC_HIGH) /* high-priority request */
        min -= min / 2;

    if (likely(!alloc_harder)) {
        free_pages -= z->nr_reserved_highatomic;
    } else {
        if (alloc_flags & ALLOC_OOM) /* the current task is an OOM victim */
            min -= min / 2;
        else
            min -= min / 4;
    }

#ifdef CONFIG_CMA
    /* If allocation can't use CMA areas don't use free CMA pages */
    if (!(alloc_flags & ALLOC_CMA))
        free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

    if (free_pages <= min + z->lowmem_reserve[classzone_idx])
        return false;

    /* If this is an order-0 request then the watermark is fine */
    if (!order)
        return true;

    /* For a high-order request, check at least one suitable page is free */
    for (o = order; o < MAX_ORDER; o++) {
        struct free_area *area = &z->free_area[o];
        int mt;

        if (!area->nr_free)
            continue;

        for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
            if (!list_empty(&area->free_list[mt]))
                return true;
        }

#ifdef CONFIG_CMA
        if ((alloc_flags & ALLOC_CMA) &&
            !list_empty(&area->free_list[MIGRATE_CMA])) {
            return true;
        }
#endif
        if (alloc_harder &&
            !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
            return true;
    }
    return false;
}

__zone_watermark_ok shows how the memory below the min watermark is handed out: a high-priority request (ALLOC_HIGH) may go down to min/2, a request from an OOM victim (ALLOC_OOM) likewise, and other ALLOC_HARDER requests only down to 3/4 of min.

Even when the amount of free memory passes the check, the buddy system must also hold a contiguous block large enough for the request.
Each zone has a free_area array (index = order, i.e. blocks of 2^order contiguous pages), and each free_area contains one free_list per migrate type holding the free blocks of that type; a sketch of this check follows below.
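Here is a toy model of that suitability check: walk the free_area array from the requested order upwards and see whether any allowed free_list is non-empty (migrate-type handling for CMA/HIGHATOMIC omitted):

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER        11
#define MIGRATE_PCPTYPES 3   /* UNMOVABLE, MOVABLE, RECLAIMABLE */

/* Toy model of the high-order part of __zone_watermark_ok(): per order,
 * how many free blocks each migrate type's free_list holds. */
static unsigned int free_count[MAX_ORDER][MIGRATE_PCPTYPES];

/* Is there at least one free block of order >= 'order' we are allowed to use? */
static bool has_suitable_block(unsigned int order)
{
    for (unsigned int o = order; o < MAX_ORDER; o++)
        for (int mt = 0; mt < MIGRATE_PCPTYPES; mt++)
            if (free_count[o][mt])
                return true;
    return false;
}

int main(void)
{
    free_count[3][1] = 5;   /* five order-3 MOVABLE blocks available */

    printf("order-2 request: %s\n", has_suitable_block(2) ? "ok" : "no block"); /* ok */
    printf("order-4 request: %s\n", has_suitable_block(4) ? "ok" : "no block"); /* no block */
    return 0;
}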

node_reclaim

try_this_zone

Allocate memory from the current zone with rmqueue, then check the resulting page.

rmqueue // to be added later

__alloc_pages_slowpath

When the fast path fails, the slow path is entered; before that, the gfp_mask flags are adjusted according to the current task's state.


static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
               struct alloc_context *ac)
{bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;struct page *page = NULL;unsigned int alloc_flags;unsigned long did_some_progress;enum compact_priority compact_priority;enum compact_result compact_result;int compaction_retries;int no_progress_loops;unsigned int cpuset_mems_cookie;int reserve_flags;/** We also sanity check to catch abuse of atomic reserves being used by* callers that are not in atomic context.*/if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))gfp_mask &= ~__GFP_ATOMIC;retry_cpuset:compaction_retries = 0;no_progress_loops = 0;compact_priority = DEF_COMPACT_PRIORITY;cpuset_mems_cookie = read_mems_allowed_begin();/** The fast path uses conservative alloc_flags to succeed only until* kswapd needs to be woken up, and to avoid the cost of setting up* alloc_flags precisely. So we do that now.*/alloc_flags = gfp_to_alloc_flags(gfp_mask);//0x240,将内存水位调整为min,运行kswapd,/** We need to recalculate the starting point for the zonelist iterator* because we might have used different nodemask in the fast path, or* there was a cpuset modification and we are retrying - otherwise we* could end up iterating over non-eligible zones endlessly.*/ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,  //首选zone为normalac->high_zoneidx, ac->nodemask);if (!ac->preferred_zoneref->zone)goto nopage;if (alloc_flags & ALLOC_KSWAPD)wake_all_kswapds(order, gfp_mask, ac); //唤醒kswapd/** The adjusted alloc_flags might result in immediate success, so try* that first*/page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);if (page)goto got_pg;/** For costly allocations, try direct compaction first, as it's likely* that we have enough base pages and don't need to reclaim. For non-* movable high-order allocations, do that as well, as compaction will* try prevent permanent fragmentation by migrating from blocks of the* same migratetype.* Don't try this for allocations that are allowed to ignore* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.*/if (can_direct_reclaim &&(costly_order ||(order > 0 && ac->migratetype != MIGRATE_MOVABLE))&& !gfp_pfmemalloc_allowed(gfp_mask)) {page = __alloc_pages_direct_compact(gfp_mask, order,alloc_flags, ac,INIT_COMPACT_PRIORITY,&compact_result);if (page)goto got_pg;/** Checks for costly allocations with __GFP_NORETRY, which* includes THP page fault allocations*/if (costly_order && (gfp_mask & __GFP_NORETRY)) {/** If compaction is deferred for high-order allocations,* it is because sync compaction recently failed. If* this is the case and the caller requested a THP* allocation, we do not want to heavily disrupt the* system, so we fail the allocation instead of entering* direct reclaim.*/if (compact_result == COMPACT_DEFERRED)goto nopage;/** Looks like reclaim/compaction is worth trying, but* sync compaction could be very expensive, so keep* using async compaction.*/compact_priority = INIT_COMPACT_PRIORITY;}}retry:/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */if (alloc_flags & ALLOC_KSWAPD)wake_all_kswapds(order, gfp_mask, ac);reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);if (reserve_flags)alloc_flags = reserve_flags;/** Reset the nodemask and zonelist iterators if memory policies can be* ignored. 
These allocations are high priority and system rather than* user oriented.*/if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {ac->nodemask = NULL;ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,ac->high_zoneidx, ac->nodemask);}/* Attempt with potentially adjusted zonelist and alloc_flags */page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);if (page)goto got_pg;/* Caller is not willing to reclaim, we can't balance anything */if (!can_direct_reclaim)goto nopage;/* Avoid recursion of direct reclaim */if (current->flags & PF_MEMALLOC)goto nopage;/* Try direct reclaim and then allocating */page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,&did_some_progress);if (page)goto got_pg;/* Try direct compaction and then allocating */page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,compact_priority, &compact_result);if (page)goto got_pg;/* Do not loop if specifically requested */if (gfp_mask & __GFP_NORETRY)goto nopage;/** Do not retry costly high order allocations unless they are* __GFP_RETRY_MAYFAIL*/if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))goto nopage;if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,did_some_progress > 0, &no_progress_loops))goto retry;/** It doesn't make any sense to retry for the compaction if the order-0* reclaim is not able to make any progress because the current* implementation of the compaction depends on the sufficient amount* of free memory (see __compaction_suitable)*/if (did_some_progress > 0 &&should_compact_retry(ac, order, alloc_flags,compact_result, &compact_priority,&compaction_retries))goto retry;/* Deal with possible cpuset update races before we start OOM killing */if (check_retry_cpuset(cpuset_mems_cookie, ac))goto retry_cpuset;/* Reclaim has failed us, start killing things */page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);if (page)goto got_pg;/* Avoid allocations with no watermarks from looping endlessly */if (tsk_is_oom_victim(current) &&(alloc_flags == ALLOC_OOM ||(gfp_mask & __GFP_NOMEMALLOC)))goto nopage;/* Retry as long as the OOM killer is making progress */if (did_some_progress) {no_progress_loops = 0;goto retry;}nopage:/* Deal with possible cpuset update races before we fail */if (check_retry_cpuset(cpuset_mems_cookie, ac))goto retry_cpuset;/** Make sure that __GFP_NOFAIL request doesn't leak out and make sure* we always retry*/if (gfp_mask & __GFP_NOFAIL) {/** All existing users of the __GFP_NOFAIL are blockable, so warn* of any new users that actually require GFP_NOWAIT*/if (WARN_ON_ONCE(!can_direct_reclaim))goto fail;/** PF_MEMALLOC request from this context is rather bizarre* because we cannot reclaim anything and only can loop waiting* for somebody to do a work for us*/WARN_ON_ONCE(current->flags & PF_MEMALLOC);/** non failing costly orders are a hard requirement which we* are not prepared for much so let's warn about these users* so that we can identify them and convert them to something* else.*/WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);/** Help non-failing allocations by giving them access to memory* reserves but do not use ALLOC_NO_WATERMARKS because this* could deplete whole memory reserves which would just make* the situation worse*/page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);if (page)goto got_pg;cond_resched();goto retry;}
fail:warn_alloc(gfp_mask, ac->nodemask,"page allocation failure: order:%u", order);
got_pg:return page;
}

The simplified overall flow is as follows:

Memory is obtained through kswapd's asynchronous reclaim, direct reclaim, memory compaction and finally the OOM killer; if every attempt fails, the allocation fails and "page allocation failure" is printed. A stubbed sketch of this ordering follows.
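The sketch below is a stubbed, heavily simplified model of that ordering only; the real __alloc_pages_slowpath retries, adjusts alloc_flags and may loop back, which is not modelled here:

#include <stdio.h>
#include <stdbool.h>

/* A highly simplified, stubbed-out model of the order of attempts made by
 * __alloc_pages_slowpath(); every helper is a placeholder that just reports
 * being called and "fails" so the next stage is reached. */
static bool try_stage(const char *name) { printf("-> %s\n", name); return false; }

int main(void)
{
    printf("fast path failed, entering slow path\n");

    try_stage("wake_all_kswapds (async reclaim)");
    try_stage("get_page_from_freelist with min watermark");
    try_stage("__alloc_pages_direct_compact (compaction)");
    try_stage("__alloc_pages_direct_reclaim (direct reclaim)");
    try_stage("__alloc_pages_direct_compact (retry compaction)");
    try_stage("__alloc_pages_may_oom (OOM killer)");

    printf("page allocation failure: order:...\n");
    return 0;
}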

wake_all_kswapds

wakeup_kswapd

Here zone is ZONE_NORMAL, gfp_flags is GFP_KERNEL, and classzone_idx is 1.

void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
           enum zone_type classzone_idx)
{
    pg_data_t *pgdat;

    if (!managed_zone(zone))
        return;

    if (!cpuset_zone_allowed(zone, gfp_flags))
        return;

    pgdat = zone->zone_pgdat;
    pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
                               classzone_idx);
    pgdat->kswapd_order = max(pgdat->kswapd_order, order);
    if (!waitqueue_active(&pgdat->kswapd_wait))
        return;

    /* Hopeless node, leave it to direct reclaim if possible */
    /* kswapd has failed too many times (kswapd can no longer save this node),
     * or memory is already above the high watermark and no watermark boost is
     * set (no need to wake it). */
    if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
        (pgdat_balanced(pgdat, order, classzone_idx) &&
         !pgdat_watermark_boosted(pgdat, classzone_idx))) {
        /*
         * There may be plenty of free memory available, but it's too
         * fragmented for high-order allocations.  Wake up kcompactd
         * and rely on compaction_suitable() to determine if it's
         * needed.  If it fails, it will defer subsequent attempts to
         * ratelimit its work.
         */
        if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
            wakeup_kcompactd(pgdat, order, classzone_idx);
        return;
    }

    trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
                      gfp_flags);
    wake_up_interruptible(&pgdat->kswapd_wait);
}

kswapd

static int kswapd(void *p)
{
    unsigned int alloc_order, reclaim_order;
    unsigned int classzone_idx = MAX_NR_ZONES - 1;
    pg_data_t *pgdat = (pg_data_t*)p;
    struct task_struct *tsk = current;
    struct reclaim_state reclaim_state = {
        .reclaimed_slab = 0,
    };
    const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

    if (!cpumask_empty(cpumask))
        set_cpus_allowed_ptr(tsk, cpumask); /* bind kswapd to the node's CPUs; core binding like this is generally not recommended elsewhere */

    current->reclaim_state = &reclaim_state;

    /*
     * Tell the memory management that we're a "memory allocator",
     * and that if we need more memory we should get access to it
     * regardless (see "__alloc_pages()"). "kswapd" should
     * never get caught in the normal page freeing logic.
     *
     * (Kswapd normally doesn't need memory anyway, but sometimes
     * you need a small amount of memory in order to be able to
     * page out something else, and this flag essentially protects
     * us from recursively trying to free more memory as we're
     * trying to free the first piece of memory in the first place).
     */
    /*
     * a. PF_KSWAPD marks this thread as kswapd so that it can be told apart
     *    from ordinary tasks later on, e.g. when adjusting swappiness in
     *    get_scan_count().
     * b. PF_MEMALLOC marks it as a critical thread that may keep allocating
     *    below the min watermark, i.e. its allocations are not limited by
     *    the watermarks. kswapd only runs when the system is already low on
     *    memory, and it sometimes needs temporary memory to make progress -
     *    for example, compressing anonymous pages into zram first requires
     *    memory to hold the compressed data; if that allocation failed,
     *    reclaim itself could not complete.
     * c. PF_SWAPWRITE allows writing to swap, so kswapd can reclaim
     *    anonymous pages.
     * (Explanation adapted from
     *  https://blog.csdn.net/feelabclihu/article/details/124054410)
     */
    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; /* set kswapd's task flags */
    set_freezable();

    pgdat->kswapd_order = 0;
    pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
    for ( ; ; ) {
        bool ret;

        alloc_order = reclaim_order = pgdat->kswapd_order;
        classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);

kswapd_try_sleep:
        /* try to sleep on the pgdat->kswapd_wait queue until kswapd is woken */
        kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                    classzone_idx);

        /* Read the new order and classzone_idx */
        alloc_order = reclaim_order = pgdat->kswapd_order;
        classzone_idx = kswapd_classzone_idx(pgdat, 0);
        pgdat->kswapd_order = 0;
        pgdat->kswapd_classzone_idx = MAX_NR_ZONES;

        ret = try_to_freeze();
        if (kthread_should_stop())
            break;

        /*
         * We can speed up thawing tasks if we don't call balance_pgdat
         * after returning from the refrigerator
         */
        if (ret)
            continue;

        /*
         * Reclaim begins at the requested order but if a high-order
         * reclaim fails then kswapd falls back to reclaiming for
         * order-0. If that happens, kswapd will consider sleeping
         * for the order it finished reclaiming at (reclaim_order)
         * but kcompactd is woken to compact for the original
         * request (alloc_order).
         */
        trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
                        alloc_order);
        reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
        if (reclaim_order < alloc_order)
            goto kswapd_try_sleep;
    }

    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
    current->reclaim_state = NULL;

    return 0;
}

The code flow is roughly as follows; a stubbed sketch of the main loop is shown below.
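A stripped-down model of the loop above (sleep on kswapd_wait, reclaim via balance_pgdat, possibly retry sleeping); all kernel details are stubbed out:

#include <stdio.h>

/* Stubbed model of the kswapd() main loop: sleep until woken by an
 * allocator, reclaim with balance_pgdat(), and go back to sleep.
 * Real order/retry handling is omitted. */
static int balance_pgdat_stub(int order, int classzone_idx)
{
    printf("balance_pgdat(order=%d, classzone_idx=%d)\n", order, classzone_idx);
    return order;   /* the order reclaim stopped at */
}

int main(void)
{
    for (int wakeups = 0; wakeups < 3; wakeups++) {
        printf("kswapd_try_to_sleep: waiting on pgdat->kswapd_wait\n");
        /* ... wakeup_kswapd() has set kswapd_order / kswapd_classzone_idx ... */
        int alloc_order = 0, classzone_idx = 1;
        int reclaim_order = balance_pgdat_stub(alloc_order, classzone_idx);
        if (reclaim_order < alloc_order)
            printf("fell back to order-0, sleep at reclaim_order instead\n");
    }
    return 0;
}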

balance_pgdat

static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{int i;unsigned long nr_soft_reclaimed;unsigned long nr_soft_scanned;unsigned long pflags;unsigned long nr_boost_reclaim;unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };bool boosted;struct zone *zone;struct scan_control sc = {.gfp_mask = GFP_KERNEL,.order = order,.may_unmap = 1,};psi_memstall_enter(&pflags);__fs_reclaim_acquire();count_vm_event(PAGEOUTRUN);/** Account for the reclaim boost. Note that the zone boost is left in* place so that parallel allocations that are near the watermark will* stall or direct reclaim until kswapd is finished.*/nr_boost_reclaim = 0;for (i = 0; i <= classzone_idx; i++) {zone = pgdat->node_zones + i;if (!managed_zone(zone))continue;nr_boost_reclaim += zone->watermark_boost;zone_boosts[i] = zone->watermark_boost;}boosted = nr_boost_reclaim;restart:sc.priority = DEF_PRIORITY;do {unsigned long nr_reclaimed = sc.nr_reclaimed;bool raise_priority = true;bool balanced;bool ret;sc.reclaim_idx = classzone_idx;/** If the number of buffer_heads exceeds the maximum allowed* then consider reclaiming from all zones. This has a dual* purpose -- on 64-bit systems it is expected that* buffer_heads are stripped during active rotation. On 32-bit* systems, highmem pages can pin lowmem memory and shrinking* buffers can relieve lowmem pressure. Reclaim may still not* go ahead if all eligible zones for the original allocation* request are balanced to avoid excessive reclaim from kswapd.*/if (buffer_heads_over_limit) {for (i = MAX_NR_ZONES - 1; i >= 0; i--) {zone = pgdat->node_zones + i;if (!managed_zone(zone))continue;sc.reclaim_idx = i;break;}}/** If the pgdat is imbalanced then ignore boosting and preserve* the watermarks for a later time and restart. Note that the* zone watermarks will be still reset at the end of balancing* on the grounds that the normal reclaim should be enough to* re-evaluate if boosting is required when kswapd next wakes.*/balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); //是否有空闲内存高于high的zone。if (!balanced && nr_boost_reclaim) {nr_boost_reclaim = 0;goto restart;}/** If boosting is not active then only reclaim if there are no* eligible zones. Note that sc.reclaim_idx is not used as* buffer_heads_over_limit may have adjusted it.*/if (!nr_boost_reclaim && balanced) //有空闲内存高于high的zone,直接退出kswapd内存回收goto out;/* Limit the priority of boosting to avoid reclaim writeback */if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)raise_priority = false;/** Do not writeback or swap pages for boosted reclaim. The* intent is to relieve pressure not issue sub-optimal IO* from reclaim context. If no pages are reclaimed, the* reclaim will be aborted.*/sc.may_writepage = !laptop_mode && !nr_boost_reclaim;sc.may_swap = !nr_boost_reclaim;sc.may_shrinkslab = !nr_boost_reclaim;/** Do some background aging of the anon list, to give* pages a chance to be referenced before reclaiming. All* pages are rotated regardless of classzone as this is* about consistent aging.*/age_active_anon(pgdat, &sc); //完成匿名页的老化处理,以4g的机器来说,开机后总内存3.x,对应的比例计算的5,即inactive*5<active时,需要把active 中的页面迁移到inactive中。在迁移的时候一次性选择32页,先把这32页从lru中剥离出来,对其进行单独处理,这也可以尽量短地持有lru链表的锁。剥离出来的这部分会被加入到ISOLATED类型的页面中。/** If we're getting trouble reclaiming, start doing writepage* even in laptop mode.*/if (sc.priority < DEF_PRIORITY - 2)sc.may_writepage = 1;/* Call soft limit reclaim before calling shrink_node. 
*/sc.nr_scanned = 0;nr_soft_scanned = 0;nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,sc.gfp_mask, &nr_soft_scanned);sc.nr_reclaimed += nr_soft_reclaimed;/** There should be no need to raise the scanning priority if* enough pages are already being scanned that that high* watermark would be met at 100% efficiency.*/if (kswapd_shrink_node(pgdat, &sc))raise_priority = false;/** If the low watermark is met there is no need for processes* to be throttled on pfmemalloc_wait as they should not be* able to safely make forward progress. Wake them*/if (waitqueue_active(&pgdat->pfmemalloc_wait) &&allow_direct_reclaim(pgdat))wake_up_all(&pgdat->pfmemalloc_wait);/* Check if kswapd should be suspending */__fs_reclaim_release();ret = try_to_freeze();__fs_reclaim_acquire();if (ret || kthread_should_stop())break;/** Raise priority if scanning rate is too low or there was no* progress in reclaiming pages*/nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);/** If reclaim made no progress for a boost, stop reclaim as* IO cannot be queued and it could be an infinite loop in* extreme circumstances.*/if (nr_boost_reclaim && !nr_reclaimed)break;if (raise_priority || !nr_reclaimed)sc.priority--;} while (sc.priority >= 1);if (!sc.nr_reclaimed)pgdat->kswapd_failures++;out:/* If reclaim was boosted, account for the reclaim done in this pass */if (boosted) {unsigned long flags;for (i = 0; i <= classzone_idx; i++) {if (!zone_boosts[i])continue;/* Increments are under the zone lock */zone = pgdat->node_zones + i;spin_lock_irqsave(&zone->lock, flags);zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);spin_unlock_irqrestore(&zone->lock, flags);}/** As there is now likely space, wakeup kcompact to defragment* pageblocks.*/wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);}snapshot_refaults(NULL, pgdat);__fs_reclaim_release();psi_memstall_leave(&pflags);/** Return the order kswapd stopped reclaiming at as* prepare_kswapd_sleep() takes it into account. If another caller* entered the allocator slow path while kswapd was awake, order will* remain at the higher level.*/return sc.order;
}

kswapd_shrink_node

static bool kswapd_shrink_node(pg_data_t *pgdat,
                   struct scan_control *sc)
{
    struct zone *zone;
    int z;

    /* Reclaim a number of pages proportional to the number of zones */
    sc->nr_to_reclaim = 0;
    for (z = 0; z <= sc->reclaim_idx; z++) {
        zone = pgdat->node_zones + z;
        if (!managed_zone(zone))
            continue;

        sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
    }

    /*
     * Historically care was taken to put equal pressure on all zones but
     * now pressure is applied based on node LRU order.
     */
    shrink_node(pgdat, sc);

    /*
     * Fragmentation may mean that the system cannot be rebalanced for
     * high-order allocations. If twice the allocation size has been
     * reclaimed then recheck watermarks only at order-0 to prevent
     * excessive reclaim. Assume that a process requested a high-order
     * can direct reclaim/compact.
     */
    if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
        sc->order = 0;

    return sc->nr_scanned >= sc->nr_to_reclaim;
}
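A worked example of the reclaim target computed above, assuming a node with two zones and made-up high watermarks:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Worked example of the kswapd_shrink_node() reclaim target: one batch of
 * max(high watermark, SWAP_CLUSTER_MAX) per eligible zone (made-up numbers). */
int main(void)
{
    unsigned long high_wmark[] = { 1200, 24000 };   /* e.g. DMA32, NORMAL */
    unsigned long nr_to_reclaim = 0;

    for (int z = 0; z < 2; z++)
        nr_to_reclaim += high_wmark[z] > SWAP_CLUSTER_MAX ?
                 high_wmark[z] : SWAP_CLUSTER_MAX;

    printf("nr_to_reclaim = %lu pages per pass\n", nr_to_reclaim); /* 25200 */
    return 0;
}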

shrink_node


static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{struct reclaim_state *reclaim_state = current->reclaim_state;unsigned long nr_reclaimed, nr_scanned;bool reclaimable = false;do {struct mem_cgroup *root = sc->target_mem_cgroup;struct mem_cgroup_reclaim_cookie reclaim = {.pgdat = pgdat,.priority = sc->priority,};unsigned long node_lru_pages = 0;struct mem_cgroup *memcg;memset(&sc->nr, 0, sizeof(sc->nr));nr_reclaimed = sc->nr_reclaimed;nr_scanned = sc->nr_scanned;memcg = mem_cgroup_iter(root, NULL, &reclaim);do {unsigned long lru_pages;unsigned long reclaimed;unsigned long scanned;switch (mem_cgroup_protected(root, memcg)) {case MEMCG_PROT_MIN:/** Hard protection.* If there is no reclaimable memory, OOM.*/continue;case MEMCG_PROT_LOW:/** Soft protection.* Respect the protection only as long as* there is an unprotected supply* of reclaimable memory from other cgroups.*/if (!sc->memcg_low_reclaim) {sc->memcg_low_skipped = 1;continue;}memcg_memory_event(memcg, MEMCG_LOW);break;case MEMCG_PROT_NONE:break;}reclaimed = sc->nr_reclaimed;scanned = sc->nr_scanned;shrink_node_memcg(pgdat, memcg, sc, &lru_pages);node_lru_pages += lru_pages;if (sc->may_shrinkslab) {shrink_slab(sc->gfp_mask, pgdat->node_id,memcg, sc->priority);}/* Record the group's reclaim efficiency */vmpressure(sc->gfp_mask, memcg, false,sc->nr_scanned - scanned,sc->nr_reclaimed - reclaimed);/** Direct reclaim and kswapd have to scan all memory* cgroups to fulfill the overall scan target for the* node.** Limit reclaim, on the other hand, only cares about* nr_to_reclaim pages to be reclaimed and it will* retry with decreasing priority if one round over the* whole hierarchy is not sufficient.*/if (!global_reclaim(sc) &&sc->nr_reclaimed >= sc->nr_to_reclaim) {mem_cgroup_iter_break(root, memcg);break;}} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));if (reclaim_state) {sc->nr_reclaimed += reclaim_state->reclaimed_slab;reclaim_state->reclaimed_slab = 0;}/* Record the subtree's reclaim efficiency */vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,sc->nr_scanned - nr_scanned,sc->nr_reclaimed - nr_reclaimed);if (sc->nr_reclaimed - nr_reclaimed)reclaimable = true;if (current_is_kswapd()) {/** If reclaim is isolating dirty pages under writeback,* it implies that the long-lived page allocation rate* is exceeding the page laundering rate. Either the* global limits are not being effective at throttling* processes due to the page distribution throughout* zones or there is heavy usage of a slow backing* device. 
The only option is to throttle from reclaim* context which is not ideal as there is no guarantee* the dirtying process is throttled in the same way* balance_dirty_pages() manages.** Once a node is flagged PGDAT_WRITEBACK, kswapd will* count the number of pages under pages flagged for* immediate reclaim and stall if any are encountered* in the nr_immediate check below.*/if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)set_bit(PGDAT_WRITEBACK, &pgdat->flags);/** Tag a node as congested if all the dirty pages* scanned were backed by a congested BDI and* wait_iff_congested will stall.*/if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)set_bit(PGDAT_CONGESTED, &pgdat->flags);/* Allow kswapd to start writing pages during reclaim.*/if (sc->nr.unqueued_dirty == sc->nr.file_taken)set_bit(PGDAT_DIRTY, &pgdat->flags);/** If kswapd scans pages marked marked for immediate* reclaim and under writeback (nr_immediate), it* implies that pages are cycling through the LRU* faster than they are written so also forcibly stall.*/if (sc->nr.immediate)congestion_wait(BLK_RW_ASYNC, HZ/10);}/** Legacy memcg will stall in page writeback so avoid forcibly* stalling in wait_iff_congested().*/if (!global_reclaim(sc) && sane_reclaim(sc) &&sc->nr.dirty && sc->nr.dirty == sc->nr.congested)set_memcg_congestion(pgdat, root, true);/** Stall direct reclaim for IO completions if underlying BDIs* and node is congested. Allow kswapd to continue until it* starts encountering unqueued dirty pages or cycling through* the LRU too quickly.*/if (!sc->hibernation_mode && !current_is_kswapd() &&current_may_throttle() && pgdat_memcg_congested(pgdat, root))wait_iff_congested(BLK_RW_ASYNC, HZ/10);} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,sc->nr_scanned - nr_scanned, sc));/** Kswapd gives up on balancing particular nodes after too* many failures to reclaim anything from them and goes to* sleep. On reclaim progress, reset the failure counter. A* successful direct reclaim run will revive a dormant kswapd.*/if (reclaimable)pgdat->kswapd_failures = 0;return reclaimable;
}

shrink_node_memcg


/*
 * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
                  struct scan_control *sc, unsigned long *lru_pages)
{
    struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
    unsigned long nr[NR_LRU_LISTS];
    unsigned long targets[NR_LRU_LISTS];
    unsigned long nr_to_scan;
    enum lru_list lru;
    unsigned long nr_reclaimed = 0;
    unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    struct blk_plug plug;
    bool scan_adjusted;

    /* based on swappiness, work out how many anonymous and file pages to scan */
    get_scan_count(lruvec, memcg, sc, nr, lru_pages);

    /* Record the original scan target for proportional adjustments later */
    memcpy(targets, nr, sizeof(nr));

    /*
     * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
     * event that can occur when there is little memory pressure e.g.
     * multiple streaming readers/writers. Hence, we do not abort scanning
     * when the requested number of pages are reclaimed when scanning at
     * DEF_PRIORITY on the assumption that the fact we are direct
     * reclaiming implies that kswapd is not keeping up and it is best to
     * do a batch of work at once. For memcg reclaim one check is made to
     * abort proportional reclaim if either the file or anon lru has already
     * dropped to zero at the first pass.
     */
    scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
             sc->priority == DEF_PRIORITY);

    /* block-layer plug: reclaim issues IO, so batch it up here */
    blk_start_plug(&plug);
    while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                    nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;

        for_each_evictable_lru(lru) {
            if (nr[lru]) {
                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                nr[lru] -= nr_to_scan;

                nr_reclaimed += shrink_list(lru, nr_to_scan,
                                lruvec, memcg, sc);
            }
        }

        cond_resched();

        if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
            continue;

        /*
         * For kswapd and memcg, reclaim at least the number of pages
         * requested. Ensure that the anon and file LRUs are scanned
         * proportionally what was requested by get_scan_count(). We
         * stop reclaiming one LRU and reduce the amount scanning
         * proportional to the original scan target.
         */
        nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
        nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

        /*
         * It's just vindictive to attack the larger once the smaller
         * has gone to zero.  And given the way we stop scanning the
         * smaller below, this makes sure that we only make one nudge
         * towards proportionality once we've got nr_to_reclaim.
         */
        if (!nr_file || !nr_anon)
            break;

        if (nr_file > nr_anon) {
            unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                        targets[LRU_ACTIVE_ANON] + 1;
            lru = LRU_BASE;
            percentage = nr_anon * 100 / scan_target;
        } else {
            unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
                        targets[LRU_ACTIVE_FILE] + 1;
            lru = LRU_FILE;
            percentage = nr_file * 100 / scan_target;
        }

        /* Stop scanning the smaller of the LRU */
        nr[lru] = 0;
        nr[lru + LRU_ACTIVE] = 0;

        /*
         * Recalculate the other LRU scan count based on its original
         * scan target and the percentage scanning already complete
         */
        lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);

        lru += LRU_ACTIVE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);

        scan_adjusted = true;
    }
    /* unplug: submit the batched IO in one go */
    blk_finish_plug(&plug);
    sc->nr_reclaimed += nr_reclaimed;

    /*
     * Even if we did not try to evict anon pages at all, we want to
     * rebalance the anon lru active/inactive ratio.
     */
    /* rebalance the active/inactive LRU lists so the inactive list
     * does not get too small */
    if (inactive_list_is_low(lruvec, false, memcg, sc, true))
        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                   sc, LRU_ACTIVE_ANON);
}
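The proportional adjustment at the end of the loop is easier to see with numbers. The example below uses invented scan targets and assumes the file LRU is the larger one, so the anon LRU is the one that stops:

#include <stdio.h>

/* Worked example (made-up numbers) of the proportional adjustment in
 * shrink_node_memcg(): once nr_to_reclaim is met, stop scanning the smaller
 * of anon/file and scale the remaining target of the other accordingly. */
int main(void)
{
    /* original targets from get_scan_count() (active + inactive combined) */
    unsigned long target_anon = 400, target_file = 2000;
    /* still left to scan when nr_to_reclaim was reached */
    unsigned long nr_anon = 100, nr_file = 1400;

    /* file is larger, so anon stops; percentage of anon target still left */
    unsigned long percentage = nr_anon * 100 / (target_anon + 1);   /* 24 */

    unsigned long scanned_file = target_file - nr_file;             /* 600 done */
    unsigned long new_file = target_file * (100 - percentage) / 100; /* 1520 */
    unsigned long already = scanned_file < new_file ? scanned_file : new_file;

    new_file -= already;                                            /* 920 */
    printf("remaining file scan target: %lu (was %lu)\n", new_file, nr_file);
    return 0;
}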
get_scan_count
static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
               struct scan_control *sc, unsigned long *nr,
               unsigned long *lru_pages)
{int swappiness = mem_cgroup_swappiness(memcg);struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;u64 fraction[2];u64 denominator = 0;    /* gcc */struct pglist_data *pgdat = lruvec_pgdat(lruvec);unsigned long anon_prio, file_prio;enum scan_balance scan_balance;unsigned long anon, file;unsigned long ap, fp;enum lru_list lru;/* If we have no swap space, do not bother scanning anon pages. */if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {scan_balance = SCAN_FILE;goto out;}/** Global reclaim will swap to prevent OOM even with no* swappiness, but memcg users want to use this knob to* disable swapping for individual groups completely when* using the memory controller's swap limit feature would be* too expensive.*/if (!global_reclaim(sc) && !swappiness) {scan_balance = SCAN_FILE;goto out;}/** Do not apply any pressure balancing cleverness when the* system is close to OOM, scan both anon and file equally* (unless the swappiness setting disagrees with swapping).*/if (!sc->priority && swappiness) {scan_balance = SCAN_EQUAL;goto out;}/** Prevent the reclaimer from falling into the cache trap: as* cache pages start out inactive, every cache fault will tip* the scan balance towards the file LRU.  And as the file LRU* shrinks, so does the window for rotation from references.* This means we have a runaway feedback loop where a tiny* thrashing file LRU becomes infinitely more attractive than* anon pages.  Try to detect this based on file LRU size.*/if (global_reclaim(sc)) {unsigned long pgdatfile;unsigned long pgdatfree;int z;unsigned long total_high_wmark = 0;pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +node_page_state(pgdat, NR_INACTIVE_FILE);for (z = 0; z < MAX_NR_ZONES; z++) {struct zone *zone = &pgdat->node_zones[z];if (!managed_zone(zone))continue;total_high_wmark += high_wmark_pages(zone);}if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {/** Force SCAN_ANON if there are enough inactive* anonymous pages on the LRU in eligible zones.* Otherwise, the small LRU gets thrashed.*/if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)>> sc->priority) {scan_balance = SCAN_ANON;goto out;}}}/** If there is enough inactive page cache, i.e. if the size of the* inactive list is greater than that of the active list *and* the* inactive list actually has some pages to scan on this priority, we* do not reclaim anything from the anonymous working set right now.* Without the second condition we could end up never scanning an* lruvec even if it has plenty of old anonymous pages unless the* system is under heavy pressure.*/if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {scan_balance = SCAN_FILE;goto out;}scan_balance = SCAN_FRACT;/** With swappiness at 100, anonymous and file have the same priority.* This scanning priority is essentially the inverse of IO cost.*/anon_prio = swappiness;file_prio = 200 - anon_prio;/** OK, so we have swap space and a fair amount of page cache* pages.  
We use the recently rotated / recently scanned* ratios to determine how valuable each cache is.** Because workloads change over time (and to avoid overflow)* we keep these statistics as a floating average, which ends* up weighing recent references more than old ones.** anon in [0], file in [1]*/anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);spin_lock_irq(&pgdat->lru_lock);if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {reclaim_stat->recent_scanned[0] /= 2;reclaim_stat->recent_rotated[0] /= 2;}if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {reclaim_stat->recent_scanned[1] /= 2;reclaim_stat->recent_rotated[1] /= 2;}/** The amount of pressure on anon vs file pages is inversely* proportional to the fraction of recently scanned pages on* each list that were recently referenced and in active use.*/ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);ap /= reclaim_stat->recent_rotated[0] + 1;fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);fp /= reclaim_stat->recent_rotated[1] + 1;spin_unlock_irq(&pgdat->lru_lock);fraction[0] = ap;fraction[1] = fp;denominator = ap + fp + 1;
out:*lru_pages = 0;for_each_evictable_lru(lru) {int file = is_file_lru(lru);unsigned long size;unsigned long scan;size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);scan = size >> sc->priority;/** If the cgroup's already been deleted, make sure to* scrape out the remaining cache.*/if (!scan && !mem_cgroup_online(memcg))scan = min(size, SWAP_CLUSTER_MAX);switch (scan_balance) {case SCAN_EQUAL:/* Scan lists relative to size */break;case SCAN_FRACT:/** Scan types proportional to swappiness and* their relative recent reclaim efficiency.* Make sure we don't miss the last page* because of a round-off error.*/scan = DIV64_U64_ROUND_UP(scan * fraction[file],denominator);break;case SCAN_FILE:case SCAN_ANON:/* Scan one type exclusively */if ((scan_balance == SCAN_FILE) != file) {size = 0;scan = 0;}break;default:/* Look ma, no brain */BUG();}*lru_pages += size;nr[lru] = scan;}
}
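To illustrate the SCAN_FRACT case, the following example computes the anon/file weights with an invented reclaim_stat sample and the default swappiness of 60:

#include <stdio.h>

/* Worked example (made-up numbers) of the SCAN_FRACT weights in
 * get_scan_count(): pressure on anon vs file is proportional to swappiness
 * and inversely proportional to how often recently scanned pages were
 * rotated (i.e. found active again). */
int main(void)
{
    unsigned long swappiness = 60;
    unsigned long anon_prio = swappiness;       /* 60  */
    unsigned long file_prio = 200 - anon_prio;  /* 140 */

    /* reclaim_stat samples: recent_scanned / recent_rotated */
    unsigned long scanned_anon = 10000, rotated_anon = 6000;
    unsigned long scanned_file = 50000, rotated_file = 5000;

    unsigned long long ap = anon_prio * (scanned_anon + 1) / (rotated_anon + 1);
    unsigned long long fp = file_prio * (scanned_file + 1) / (rotated_file + 1);

    /* later: scan[lru] = (size >> priority) * fraction[lru] / denominator */
    printf("fraction anon/file = %llu / %llu (denominator %llu)\n",
           ap, fp, ap + fp + 1);   /* roughly 99 / 1399 */
    return 0;
}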
shrink_list

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                 struct lruvec *lruvec, struct mem_cgroup *memcg,
                 struct scan_control *sc)
{
    if (is_active_lru(lru)) {
        if (inactive_list_is_low(lruvec, is_file_lru(lru),
                     memcg, sc, true))
            shrink_active_list(nr_to_scan, lruvec, sc, lru);
        return 0;
    }

    return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

shrink_inactive_list
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
             struct scan_control *sc, enum lru_list lru)
{LIST_HEAD(page_list);unsigned long nr_scanned;unsigned long nr_reclaimed = 0;unsigned long nr_taken;struct reclaim_stat stat = {};isolate_mode_t isolate_mode = 0;int file = is_file_lru(lru);struct pglist_data *pgdat = lruvec_pgdat(lruvec);struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;bool stalled = false;while (unlikely(too_many_isolated(pgdat, file, sc))) {if (stalled)return 0;/* wait a bit for the reclaimer. */msleep(100);stalled = true;/* We are about to die and free our memory. Return now. */if (fatal_signal_pending(current))return SWAP_CLUSTER_MAX;}lru_add_drain();if (!sc->may_unmap)isolate_mode |= ISOLATE_UNMAPPED;spin_lock_irq(&pgdat->lru_lock);nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,&nr_scanned, sc, isolate_mode, lru);__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);reclaim_stat->recent_scanned[file] += nr_taken;if (current_is_kswapd()) {if (global_reclaim(sc))__count_vm_events(PGSCAN_KSWAPD, nr_scanned);count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,nr_scanned);} else {if (global_reclaim(sc))__count_vm_events(PGSCAN_DIRECT, nr_scanned);count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,nr_scanned);}spin_unlock_irq(&pgdat->lru_lock);if (nr_taken == 0)return 0;nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,&stat, false);spin_lock_irq(&pgdat->lru_lock);if (current_is_kswapd()) {if (global_reclaim(sc))__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,nr_reclaimed);} else {if (global_reclaim(sc))__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,nr_reclaimed);}putback_inactive_pages(lruvec, &page_list);__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);spin_unlock_irq(&pgdat->lru_lock);mem_cgroup_uncharge_list(&page_list);free_unref_page_list(&page_list);/** If dirty pages are scanned that are not queued for IO, it* implies that flushers are not doing their job. This can* happen when memory pressure pushes dirty pages to the end of* the LRU before the dirty limits are breached and the dirty* data has expired. It can also happen when the proportion of* dirty pages grows not through writes but through memory* pressure reclaiming all the clean cache. And in some cases,* the flushers simply cannot keep up with the allocation* rate. Nudge the flusher threads in case they are asleep.*/if (stat.nr_unqueued_dirty == nr_taken)wakeup_flusher_threads(WB_REASON_VMSCAN);sc->nr.dirty += stat.nr_dirty;sc->nr.congested += stat.nr_congested;sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;sc->nr.writeback += stat.nr_writeback;sc->nr.immediate += stat.nr_immediate;sc->nr.taken += nr_taken;if (file)sc->nr.file_taken += nr_taken;trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,nr_scanned, nr_reclaimed, &stat, sc->priority, file);return nr_reclaimed;
}
shrink_page_list

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
                      struct pglist_data *pgdat,
                      struct scan_control *sc,
                      enum ttu_flags ttu_flags,
                      struct reclaim_stat *stat,
                      bool force_reclaim)
{LIST_HEAD(ret_pages);LIST_HEAD(free_pages);int pgactivate = 0;unsigned nr_unqueued_dirty = 0;unsigned nr_dirty = 0;unsigned nr_congested = 0;unsigned nr_reclaimed = 0;unsigned nr_writeback = 0;unsigned nr_immediate = 0;unsigned nr_ref_keep = 0;unsigned nr_unmap_fail = 0;cond_resched();while (!list_empty(page_list)) {struct address_space *mapping;struct page *page;int may_enter_fs;enum page_references references = PAGEREF_RECLAIM_CLEAN;bool dirty, writeback;cond_resched();page = lru_to_page(page_list);list_del(&page->lru);if (!trylock_page(page))goto keep;VM_BUG_ON_PAGE(PageActive(page), page);sc->nr_scanned++;if (unlikely(!page_evictable(page)))goto activate_locked;if (!sc->may_unmap && page_mapped(page))goto keep_locked;/* Double the slab pressure for mapped and swapcache pages */if ((page_mapped(page) || PageSwapCache(page)) &&!(PageAnon(page) && !PageSwapBacked(page)))sc->nr_scanned++;may_enter_fs = (sc->gfp_mask & __GFP_FS) ||(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));/** The number of dirty pages determines if a node is marked* reclaim_congested which affects wait_iff_congested. kswapd* will stall and start writing pages if the tail of the LRU* is all dirty unqueued pages.*/page_check_dirty_writeback(page, &dirty, &writeback);if (dirty || writeback)nr_dirty++;if (dirty && !writeback)nr_unqueued_dirty++;/** Treat this page as congested if the underlying BDI is or if* pages are cycling through the LRU so quickly that the* pages marked for immediate reclaim are making it to the* end of the LRU a second time.*/mapping = page_mapping(page);if (((dirty || writeback) && mapping &&inode_write_congested(mapping->host)) ||(writeback && PageReclaim(page)))nr_congested++;/** If a page at the tail of the LRU is under writeback, there* are three cases to consider.** 1) If reclaim is encountering an excessive number of pages*    under writeback and this page is both under writeback and*    PageReclaim then it indicates that pages are being queued*    for IO but are being recycled through the LRU before the*    IO can complete. Waiting on the page itself risks an*    indefinite stall if it is impossible to writeback the*    page due to IO error or disconnected storage so instead*    note that the LRU is being scanned too quickly and the*    caller can stall after page list has been processed.** 2) Global or new memcg reclaim encounters a page that is*    not marked for immediate reclaim, or the caller does not*    have __GFP_FS (or __GFP_IO if it's simply going to swap,*    not to fs). In this case mark the page for immediate*    reclaim and continue scanning.**    Require may_enter_fs because we would wait on fs, which*    may not have submitted IO yet. And the loop driver might*    enter reclaim, and deadlock if it waits on a page for*    which it is needed to do the write (loop masks off*    __GFP_IO|__GFP_FS for this reason); but more thought*    would probably show more reasons.** 3) Legacy memcg encounters a page that is already marked*    PageReclaim. memcg does not have any dirty pages*    throttling so we could easily OOM just because too many*    pages are in writeback and there is nothing else to*    reclaim. Wait for the writeback to complete.** In cases 1) and 2) we activate the pages to get them out of* the way while we continue scanning for clean pages on the* inactive list and refilling from the active list. 
The* observation here is that waiting for disk writes is more* expensive than potentially causing reloads down the line.* Since they're marked for immediate reclaim, they won't put* memory pressure on the cache working set any longer than it* takes to write them to disk.*/if (PageWriteback(page)) {/* Case 1 above */if (current_is_kswapd() &&PageReclaim(page) &&test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {nr_immediate++;goto activate_locked;/* Case 2 above */} else if (sane_reclaim(sc) ||!PageReclaim(page) || !may_enter_fs) {/** This is slightly racy - end_page_writeback()* might have just cleared PageReclaim, then* setting PageReclaim here end up interpreted* as PageReadahead - but that does not matter* enough to care.  What we do want is for this* page to have PageReclaim set next time memcg* reclaim reaches the tests above, so it will* then wait_on_page_writeback() to avoid OOM;* and it's also appropriate in global reclaim.*/SetPageReclaim(page);nr_writeback++;goto activate_locked;/* Case 3 above */} else {unlock_page(page);wait_on_page_writeback(page);//the disk sleep state occur!/* then go back and try same page again */list_add_tail(&page->lru, page_list);continue;}}if (!force_reclaim)references = page_check_references(page, sc);switch (references) {case PAGEREF_ACTIVATE:goto activate_locked;case PAGEREF_KEEP:nr_ref_keep++;goto keep_locked;case PAGEREF_RECLAIM:case PAGEREF_RECLAIM_CLEAN:; /* try to reclaim the page below */}/** Anonymous process memory has backing store?* Try to allocate it some swap space here.* Lazyfree page could be freed directly*/if (PageAnon(page) && PageSwapBacked(page)) {if (!PageSwapCache(page)) {if (!(sc->gfp_mask & __GFP_IO))goto keep_locked;if (PageTransHuge(page)) {/* cannot split THP, skip it */if (!can_split_huge_page(page, NULL))goto activate_locked;/** Split pages without a PMD map right* away. Chances are some or all of the* tail pages can be freed without IO.*/if (!compound_mapcount(page) &&split_huge_page_to_list(page,page_list))goto activate_locked;}if (!add_to_swap(page)) {if (!PageTransHuge(page))goto activate_locked;/* Fallback to swap normal pages */if (split_huge_page_to_list(page,page_list))goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGEcount_vm_event(THP_SWPOUT_FALLBACK);
#endifif (!add_to_swap(page))goto activate_locked;}may_enter_fs = 1;/* Adding to swap updated mapping */mapping = page_mapping(page);}} else if (unlikely(PageTransHuge(page))) {/* Split file THP */if (split_huge_page_to_list(page, page_list))goto keep_locked;}/** The page is mapped into the page tables of one or more* processes. Try to unmap it here.*/if (page_mapped(page)) {enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;if (unlikely(PageTransHuge(page)))flags |= TTU_SPLIT_HUGE_PMD;if (!try_to_unmap(page, flags)) {nr_unmap_fail++;goto activate_locked;}}if (PageDirty(page)) {/** Only kswapd can writeback filesystem pages* to avoid risk of stack overflow. But avoid* injecting inefficient single-page IO into* flusher writeback as much as possible: only* write pages when we've encountered many* dirty pages, and when we've already scanned* the rest of the LRU for clean pages and see* the same dirty pages again (PageReclaim).*/if (page_is_file_cache(page) &&(!current_is_kswapd() || !PageReclaim(page) ||!test_bit(PGDAT_DIRTY, &pgdat->flags))) {/** Immediately reclaim when written back.* Similar in principal to deactivate_page()* except we already have the page isolated* and know it's dirty*/inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);SetPageReclaim(page);goto activate_locked;}if (references == PAGEREF_RECLAIM_CLEAN)goto keep_locked;if (!may_enter_fs)goto keep_locked;if (!sc->may_writepage)goto keep_locked;/** Page is dirty. Flush the TLB if a writable entry* potentially exists to avoid CPU writes after IO* starts and then write it out here.*/try_to_unmap_flush_dirty();switch (pageout(page, mapping, sc)) {case PAGE_KEEP:goto keep_locked;case PAGE_ACTIVATE:goto activate_locked;case PAGE_SUCCESS:if (PageWriteback(page))goto keep;if (PageDirty(page))goto keep;/** A synchronous write - probably a ramdisk.  Go* ahead and try to reclaim the page.*/if (!trylock_page(page))goto keep;if (PageDirty(page) || PageWriteback(page))goto keep_locked;mapping = page_mapping(page);case PAGE_CLEAN:; /* try to free the page below */}}/** If the page has buffers, try to free the buffer mappings* associated with this page. If we succeed we try to free* the page as well.** We do this even if the page is PageDirty().* try_to_release_page() does not perform I/O, but it is* possible for a page to have PageDirty set, but it is actually* clean (all its buffers are clean).  This happens if the* buffers were written out directly, with submit_bh(). ext3* will do this, as well as the blockdev mapping.* try_to_release_page() will discover that cleanness and will* drop the buffers and mark the page clean - it can be freed.** Rarely, pages can have buffers and no ->mapping.  These are* the pages which were not successfully invalidated in* truncate_complete_page().  
We try to drop those buffers here* and if that worked, and the page is no longer mapped into* process address space (page_count == 1) it can be freed.* Otherwise, leave the page on the LRU so it is swappable.*/if (page_has_private(page)) {if (!try_to_release_page(page, sc->gfp_mask))goto activate_locked;if (!mapping && page_count(page) == 1) {unlock_page(page);if (put_page_testzero(page))goto free_it;else {/** rare race with speculative reference.* the speculative reference will free* this page shortly, so we may* increment nr_reclaimed here (and* leave it off the LRU).*/nr_reclaimed++;continue;}}}if (PageAnon(page) && !PageSwapBacked(page)) {/* follow __remove_mapping for reference */if (!page_ref_freeze(page, 1))goto keep_locked;if (PageDirty(page)) {page_ref_unfreeze(page, 1);goto keep_locked;}count_vm_event(PGLAZYFREED);count_memcg_page_event(page, PGLAZYFREED);} else if (!mapping || !__remove_mapping(mapping, page, true))goto keep_locked;unlock_page(page);
free_it:nr_reclaimed++;/** Is there need to periodically free_page_list? It would* appear not as the counts should be low*/if (unlikely(PageTransHuge(page))) {mem_cgroup_uncharge(page);(*get_compound_page_dtor(page))(page);} elselist_add(&page->lru, &free_pages);continue;activate_locked:/* Not a candidate for swapping, so reclaim swap space. */if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||PageMlocked(page)))try_to_free_swap(page);VM_BUG_ON_PAGE(PageActive(page), page);if (!PageMlocked(page)) {SetPageActive(page);pgactivate++;count_memcg_page_event(page, PGACTIVATE);}
keep_locked:unlock_page(page);
keep:list_add(&page->lru, &ret_pages);VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);}mem_cgroup_uncharge_list(&free_pages);try_to_unmap_flush();free_unref_page_list(&free_pages);list_splice(&ret_pages, page_list);count_vm_events(PGACTIVATE, pgactivate);if (stat) {stat->nr_dirty = nr_dirty;stat->nr_congested = nr_congested;stat->nr_unqueued_dirty = nr_unqueued_dirty;stat->nr_writeback = nr_writeback;stat->nr_immediate = nr_immediate;stat->nr_activate = pgactivate;stat->nr_ref_keep = nr_ref_keep;stat->nr_unmap_fail = nr_unmap_fail;}return nr_reclaimed;
}

shrink_slab

Walk the shrinkers registered on the global shrinker_list and call each one's scan_objects callback to reclaim slab objects.

static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                 struct mem_cgroup *memcg,
                 int priority)
{
    unsigned long ret, freed = 0;
    struct shrinker *shrinker;

    if (!mem_cgroup_is_root(memcg))
        return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

    if (!down_read_trylock(&shrinker_rwsem))
        goto out;

    list_for_each_entry(shrinker, &shrinker_list, list) {
        struct shrink_control sc = {
            .gfp_mask = gfp_mask,
            .nid = nid,
            .memcg = memcg,
        };

        ret = do_shrink_slab(&sc, shrinker, priority);
        if (ret == SHRINK_EMPTY)
            ret = 0;
        freed += ret;
        /*
         * Bail out if someone want to register a new shrinker to
         * prevent the regsitration from being stalled for long periods
         * by parallel ongoing shrinking.
         */
        if (rwsem_is_contended(&shrinker_rwsem)) {
            freed = freed ? : 1;
            break;
        }
    }

    up_read(&shrinker_rwsem);
out:
    cond_resched();
    return freed;
}
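For reference, this is roughly what a client of this mechanism looks like: a minimal out-of-tree module sketch registering a shrinker with the kernel 5.0 API (the cache itself is hypothetical; only the count_objects/scan_objects contract is shown):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>

/* Hypothetical object cache, just for illustration. */
static atomic_long_t demo_cached_objects = ATOMIC_LONG_INIT(0);

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
    /* Report how many objects could be freed if asked. */
    return atomic_long_read(&demo_cached_objects);
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
    unsigned long freed = min_t(unsigned long, sc->nr_to_scan,
                                atomic_long_read(&demo_cached_objects));

    /* ... actually free 'freed' objects of the cache here ... */
    atomic_long_sub(freed, &demo_cached_objects);
    return freed ? freed : SHRINK_STOP;
}

static struct shrinker demo_shrinker = {
    .count_objects = demo_count,
    .scan_objects  = demo_scan,
    .seeks         = DEFAULT_SEEKS,
};

static int __init demo_init(void)
{
    return register_shrinker(&demo_shrinker);
}

static void __exit demo_exit(void)
{
    unregister_shrinker(&demo_shrinker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");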

__alloc_pages_direct_reclaim

static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)

{
    struct page *page = NULL;
    bool drained = false;

    *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
    if (unlikely(!(*did_some_progress)))
        return NULL;

retry:
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

    /*
     * If an allocation failed after direct reclaim, it could be because
     * pages are pinned on the per-cpu lists or in high alloc reserves.
     * Shrink them them and try again
     */
    if (!page && !drained) {
        unreserve_highatomic_pageblock(ac, false);
        drain_all_pages(NULL);
        drained = true;
        goto retry;
    }

    return page;
}

The code flow is as follows; a stubbed sketch of the call chain is shown below.
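As far as I can tell, the chain continues from __perform_reclaim into try_to_free_pages -> do_try_to_free_pages -> shrink_zones -> shrink_node, i.e. direct reclaim ends up in the same shrink_node() path kswapd uses; the stub below only prints that ordering:

#include <stdio.h>

/* Stubbed sketch of the direct-reclaim call chain (kernel 5.0):
 * __alloc_pages_direct_reclaim -> __perform_reclaim -> try_to_free_pages ->
 * do_try_to_free_pages -> shrink_zones -> shrink_node. */
static unsigned long shrink_node_stub(void)       { puts("  shrink_node");     return 32; }
static unsigned long shrink_zones_stub(void)      { puts(" shrink_zones");     return shrink_node_stub(); }
static unsigned long try_to_free_pages_stub(void) { puts("try_to_free_pages"); return shrink_zones_stub(); }

int main(void)
{
    unsigned long progress = try_to_free_pages_stub();  /* __perform_reclaim */

    printf("did_some_progress = %lu -> retry get_page_from_freelist\n", progress);
    return 0;
}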
