Linux 内存管理涉及引导分配器、伙伴系统和 Slab 分配器。引导分配器负责内核初始化阶段的内存管理,包括 bootmem 和 memblock。伙伴系统管理物理页面,通过分裂和合并机制减少碎片。Slab 分配器针对小对象优化,提供缓存和快速分配。三者协同工作以高效利用内存资源。
作者:时间旅人 | 0 浏览
深入解析 Linux 内存管理:三大分配器原理
Linux 内存管理是指操作系统对于计算机系统中的内存资源进行有效利用和管理的过程。它包括了内存分配、页面置换、内存映射等方面。Linux 内存管理的核心在于三大分配器:引导内存分配器(Bootmem/Memblock)、伙伴分配器(Buddy Allocator)和 Slab 分配器。
一、引导内存分配器
Linux 系统中使用伙伴系统对物理页面进行分配管理,但是伙伴分配系统需要内核完成初始化以及建立相关内核数据结构后才能够正常工作。因此,在内核初始化相关数据结构时需要另一种内存分配器。早期 Linux 没有较为完善的引导内存分配器,随着硬件的发展和日趋复杂,处理不同体系的内存分配代码也渐渐复杂起来,随之就需要引导内存分配器来初始化系统主要内存分配器的数据结构以确保其正常工作。
structmemblock_type {unsignedlong cnt; /* number of regions */unsignedlong max; /* size of the allocated array */phys_addr_t total_size; /* size of all regions */structmemblock_region *regions;char *name;
};
/*
 * memblock_find_in_range_node - find free area in given range and node
 * @size: size of free area to find (bytes)
 * @align: alignment of free area to find
 * @start: start of candidate range (lowest acceptable physical address)
 * @end: end of candidate range; may be %MEMBLOCK_ALLOC_ACCESSIBLE
 * @nid: node id of the free area to find
 * @flags: pick from blocks based on memory attributes
 *
 * When memblock is in bottom-up mode and @end lies above the kernel image,
 * try a bottom-up search first (kept above the kernel so hot-unpluggable
 * memory near the kernel is not consumed); otherwise, or on failure, fall
 * back to a top-down search.
 *
 * Return: found physical address on success, 0 on failure.
 */
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid, ulong flags)
{
	phys_addr_t kernel_end, ret;

	/* pump up @end */
	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
		end = memblock.current_limit;

	/* avoid allocating the first page */
	start = max_t(phys_addr_t, start, PAGE_SIZE);
	end = max(start, end);
	kernel_end = __pa_symbol(_end);

	/*
	 * try bottom-up allocation only when bottom-up mode
	 * is set and @end is above the kernel image.
	 */
	if (memblock_bottom_up() && end > kernel_end) {
		phys_addr_t bottom_up_start;

		/* make sure we will allocate above the kernel */
		bottom_up_start = max(start, kernel_end);

		/* ok, try bottom-up allocation first */
		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
						      size, align, nid, flags);
		if (ret)
			return ret;

		/*
		 * we always limit bottom-up allocation above the kernel,
		 * but top-down allocation doesn't have the limit, so
		 * retrying top-down allocation may succeed when bottom-up
		 * allocation failed.
		 *
		 * bottom-up allocation is expected to be fail very rarely,
		 * so we use WARN_ONCE() here to see the stack trace if
		 * fail happens.
		 */
		WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
	}

	return __memblock_find_range_top_down(start, end, size, align, nid,
					      flags);
}
该函数中 size 表示寻找物理内存大小 (字节),align 为物理内存对齐,start 表示所寻找内存区域最小物理地址,end 为最大物理地址,nid 为节点号,flags 为标志位。检查 end 是否等于 MEMBLOCK_ALLOC_ACCESSIBLE,相等则设置 end 为 memblock.current_limit,重新计算所查询物理地址范围。
检查 memblock.bottom_up 和 end>kernel_end 是否均为真,即内存查询是否为从低地址向高地址且 end 高于内核最大虚拟地址所对应的物理地址,重新获取查询范围并进行查询。
最后进行从高地址到低地址进行查询。
(3) 向 memblock_type 添加 memblock_region
/*
 * memblock_add_range - add a new memblock region
 * @type: memblock type to add the new region into
 * @base: base physical address of the new region
 * @size: size of the new region (bytes)
 * @nid: node id of the new region
 * @flags: flags of the new region
 *
 * Add the region [@base, @base + @size) into @type.  The new region may
 * overlap existing ones; only the non-overlapping portions are inserted.
 * Adjacent compatible regions are merged on completion.
 *
 * Return: 0 on success, -ENOMEM if the regions array could not be grown.
 */
int __init_memblock memblock_add_range(struct memblock_type *type,
				       phys_addr_t base, phys_addr_t size,
				       int nid, unsigned long flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx, nr_new;
	struct memblock_region *rgn;

	if (!size)
		return 0;

	/* special case for empty array */
	if (type->regions[0].size == 0) {
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}
repeat:
	/*
	 * The following is executed twice. Once with %false @insert and
	 * then with %true. The first counts the number of regions needed
	 * to accommodate the new area. The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for_each_memblock_type(idx, type, rgn) {
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		if (rbase >= end)
			break;
		if (rend <= base)
			continue;
		/*
		 * @rgn overlaps. If it separates the lower part of new
		 * area, insert that portion.
		 */
		if (rbase > base) {
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
			WARN_ON(nid != memblock_get_region_node(rgn));
#endif
			WARN_ON(flags != rgn->flags);
			nr_new++;
			if (insert)
				memblock_insert_region(type, idx++, base,
						       rbase - base, nid,
						       flags);
		}
		/* area below @rend is dealt with, forget about it */
		base = min(rend, end);
	}

	/* insert the remaining portion */
	if (base < end) {
		nr_new++;
		if (insert)
			memblock_insert_region(type, idx, base, end - base,
					       nid, flags);
	}

	if (!nr_new)
		return 0;

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {
		while (type->cnt + nr_new > type->max)
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		insert = true;
		goto repeat;
	} else {
		memblock_merge_regions(type);
		return 0;
	}
}
该函数 base 为添加内存起始物理地址,size 为添加的物理内存大小 (字节数目),nid 为添加内存所在节点号,flags 为添加 region 标志位。首先计算获取所添加物理内存所在物理地址范围,然后检查 memblock_type 中 region 数组是否为空。如果为空则初始化 region 数组首个元素并返回。
经过遍历后,检查内存区域是否完全插入,如果有部分区域并未插入则检查 insert 是否为真,为真则将剩余部分也插入 region 数组中,否则继续运行。
检查 insert 是否为真,若为真则将 memblock_type 中的 region 数组进行合并,否则检查 nr_new+type->cnt 是否超出 type 中 region 数组最大数,超出则对 type 中 region 数组进行扩充,设置 insert 为真继续重复以上操作。
(4) 扩充 memblock_type 中 region 数组空间
/*
 * memblock_double_array - double the size of a memblock regions array
 * @type: memblock type whose regions array is being doubled
 * @new_area_start: start of a memory range the new array must not overlap
 * @new_area_size: size of that memory range
 *
 * Allocates a new array twice the current size (from slab if available,
 * otherwise from memblock itself), copies the old contents over, zeroes
 * the new half, and frees the old array unless it is the static initial
 * one.
 *
 * Return: 0 on success, -1 on failure.
 */
static int __init_memblock memblock_double_array(struct memblock_type *type,
						 phys_addr_t new_area_start,
						 phys_addr_t new_area_size)
{
	struct memblock_region *new_array, *old_array;
	phys_addr_t old_alloc_size, new_alloc_size;
	phys_addr_t old_size, new_size, addr;
	int use_slab = slab_is_available();
	int *in_slab;

	/* We don't allow resizing until we know about the reserved regions
	 * of memory that aren't suitable for allocation
	 */
	if (!memblock_can_resize)
		return -1;

	/* Calculate new doubled size */
	old_size = type->max * sizeof(struct memblock_region);
	new_size = old_size << 1;
	/*
	 * We need to allocated new one align to PAGE_SIZE,
	 * so we can free them completely later.
	 */
	old_alloc_size = PAGE_ALIGN(old_size);
	new_alloc_size = PAGE_ALIGN(new_size);

	/* Retrieve the slab flag */
	if (type == &memblock.memory)
		in_slab = &memblock_memory_in_slab;
	else
		in_slab = &memblock_reserved_in_slab;

	if (use_slab) {
		new_array = kmalloc(new_size, GFP_KERNEL);
		addr = new_array ? __pa(new_array) : 0;
	} else {
		/* only exclude range when trying to double reserved.regions */
		if (type != &memblock.reserved)
			new_area_start = new_area_size = 0;

		addr = memblock_find_in_range(new_area_start + new_area_size,
						memblock.current_limit,
						new_alloc_size, PAGE_SIZE);
		if (!addr && new_area_size)
			addr = memblock_find_in_range(0,
				min(new_area_start, memblock.current_limit),
				new_alloc_size, PAGE_SIZE);

		new_array = addr ? __va(addr) : NULL;
	}
	if (!addr) {
		pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
		       type->name, type->max, type->max * 2);
		return -1;
	}

	memcpy(new_array, type->regions, old_size);
	memset(new_array + type->max, 0, old_size);
	old_array = type->regions;
	type->regions = new_array;
	type->max <<= 1;

	/* Free old array. We needn't free it if the array is the static one */
	if (*in_slab)
		kfree(old_array);
	else if (old_array != memblock_memory_init_regions &&
		 old_array != memblock_reserved_init_regions)
		memblock_free(__pa(old_array), old_alloc_size);

	/*
	 * Reserve the new array if that comes from the memblock. Otherwise, we
	 * needn't do it
	 */
	if (!use_slab)
		BUG_ON(memblock_reserve(addr, new_alloc_size));

	/* Update slab flag */
	*in_slab = use_slab;

	return 0;
}
该函数中 new_area_start 表示向 type 中添加物理内存区域起始物理地址,new_area_size 表示内存区域大小。首先检查是否使用 slab 缓存以及是否可以扩充 type 中 region 数组大小。
获取原 region 数组大小并将其乘以 2 作为将要分配 region 数组大小,按页面对齐并根据 type 类型获取相应 slab 缓存使用标志位。
开始分配新 region 数组内存空间,如果使用 slab,则从 slab 缓存中分配,否则从 memblock 中分配。
最后复制原来 region 数组并将剩余空间设为 0 并释放原数组所在内存空间。
(5) 移除 memblock_type 中相应内存区域
/*
 * memblock_isolate_range - isolate a given range into disjoint memblocks
 * @type: memblock type to isolate the range in
 * @base: base physical address of the range to isolate
 * @size: size of the range to isolate (bytes)
 * @start_rgn: out parameter - index of the first region inside the range
 * @end_rgn: out parameter - index one past the last region in the range
 *
 * Walk @type and split any region that crosses a boundary of
 * [@base, @base + @size), so that afterwards no region straddles the
 * range.  At most two extra regions can be created, so the array is
 * grown beforehand if needed.
 *
 * Return: 0 on success, -ENOMEM if the regions array could not be grown.
 */
static int __init_memblock memblock_isolate_range(struct memblock_type *type,
					phys_addr_t base, phys_addr_t size,
					int *start_rgn, int *end_rgn)
{
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx;
	struct memblock_region *rgn;

	*start_rgn = *end_rgn = 0;

	if (!size)
		return 0;

	/* we'll create at most two more regions */
	while (type->cnt + 2 > type->max)
		if (memblock_double_array(type, base, size) < 0)
			return -ENOMEM;

	for_each_memblock_type(idx, type, rgn) {
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		if (rbase >= end)
			break;
		if (rend <= base)
			continue;

		if (rbase < base) {
			/*
			 * @rgn intersects from below. Split and continue
			 * to process the next region - the new top half.
			 */
			rgn->base = base;
			rgn->size -= base - rbase;
			type->total_size -= base - rbase;
			memblock_insert_region(type, idx, rbase, base - rbase,
					       memblock_get_region_node(rgn),
					       rgn->flags);
		} else if (rend > end) {
			/*
			 * @rgn intersects from above. Split and redo the
			 * current region - the new bottom half.
			 */
			rgn->base = end;
			rgn->size -= end - rbase;
			type->total_size -= end - rbase;
			memblock_insert_region(type, idx--, rbase, end - rbase,
					       memblock_get_region_node(rgn),
					       rgn->flags);
		} else {
			/* @rgn is fully contained, record it */
			if (!*end_rgn)
				*start_rgn = idx;
			*end_rgn = idx + 1;
		}
	}

	return 0;
}
该函数从 type 中 region 数组分离指定范围物理内存,base 表示分离内存区域起始物理地址,size 为分离物理内存大小,start_rgn 为输出分离后的内存区域起始 region,end_rgn 为结束 region。如果 type 中 region 数组空间不足则扩充 region 数组。
循环遍历 type 中 region 数组,对于处于分离内存区域的 region 进行分割并重新插入到数组中,最后返回索引。
优点: 由于将物理内存按照 PFN 将不同的 page 放入到不同 order 中,根据需要分配内存的大小,计算当前这次分配应该在哪个 order 中去找空闲的内存块,如果当前 order 中没有空闲,则到更高阶的 order 中去查找,因此分配的效率比 boot memory 的线性扫描 bitmap 要快很多。
typedefstructpglist_data {structzonenode_zones[MAX_NR_ZONES];//内存区域数组structzonelistnode_zonelists[MAX_ZONELISTS];//MAX_ZONELISTS 个备用区域数组int nr_zones;//该节点包含的内存区域数量
......
}
//struct zone 在 linux 内存管理(一)中structzonelist {structzoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
struct zoneref {
	struct zone *zone;	/* pointer to the zone descriptor */
	int zone_idx;		/* type (index) of the zone that @zone points to */
};
enum {
	ZONELIST_FALLBACK,	/* zonelist with fallback across all memory nodes */
#ifdef CONFIG_NUMA
	/*
	 * The NUMA zonelists are doubled because we need zonelists that
	 * restrict the allocations to a single node for __GFP_THISNODE.
	 */
	ZONELIST_NOFALLBACK,	/* zonelist restricted to the current node (NUMA only) */
#endif
	MAX_ZONELISTS		/* number of zonelists per node */
};
UMA 系统只有一个备用区域的列表,按照区域类型从高到低顺序排列。假设 UMA 系统中包含普通区域和 DMA 区域,则备用区域列表为:(普通区域、DMA 区域)。NUMA 系统中每个内存节点有两个备用区域列表:一个包含所有节点的内存区域,另一个仅包含当前节点的内存区域。
ZONELIST_FALLBACK(包含所有内存节点的备用区域)列表有两种排序方法:
a. 节点优先顺序:先根据节点距离从小到大排序,然后在每个节点里面根据区域类型从高到低排序。优点是优先选择距离近的内存,缺点是在高区域耗尽以前使用低区域。
b. 区域优先顺序:先根据区域类型从高到低排序,然后在每个区域类型里面根据节点距离从小到大排序。优点是减少低区域耗尽的概率,缺点是不能保证优先选择距离近的内存。
首选的内存区域什么情况下从备用区域借用物理页呢?每个内存区域有 3 个水线:
a. 高水线(high):如果内存区域的空闲页数大于高水线,说明内存区域的内存非常充足;
b. 低水线(low):如果内存区域的空闲页数小于低水线,说明内存区域的内存轻微不足;
c. 最低水线(min):如果内存区域的空闲页数小于最低水线,说明内存区域的内存严重不足。
而且每个区域的水位线是初始化的时候通过每个区域的物理页情况计算出来的。计算后存到 struct zone 的 watermark 数组中,使用的时候直接通过下面的宏定义获取:
staticinlinestructpage *
__alloc_pages_slowpath(gfp_tgfp_mask, unsignedintorder,
structalloc_context *ac)
{bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
constbool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
structpage *page =NULL;
unsignedint alloc_flags;
unsignedlong did_some_progress;
enumcompact_prioritycompact_priority;enumcompact_resultcompact_result;int compaction_retries;
int no_progress_loops;
unsignedint cpuset_mems_cookie;
int reserve_flags;
/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
//后面可能会检查 cpuset 是否允许当前进程从哪些内存节点申请页
cpuset_mems_cookie = read_mems_allowed_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*///把分配标志位转化为内部的分配标志位
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* We need to recalculate the starting point for the zonelist iterator
* because we might have used different nodemask in the fast path, or
* there was a cpuset modification and we are retrying - otherwise we
* could end up iterating over non-eligible zones endlessly.
*///获取首选的内存区域,因为在快速路径中使用了不同的节点掩码,避免再次遍历不合格的区域。
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
//异步回收页,唤醒 kswapd 内核线程进行页面回收if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, gfp_mask, ac);
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*///调整 alloc_flags 后可能会立即申请成功,所以先尝试一下
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
* For costly allocations, try direct compaction first, as it's likely
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*///申请阶数大于 0,不可移动的位于高阶的,忽略水位线的if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
//直接页面回收,然后进行页面分配
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
*/if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If compaction is deferred for high-order allocations,
* it is because sync compaction recently failed. If
* this is the case and the caller requested a THP
* allocation, we do not want to heavily disrupt the
* system, so we fail the allocation instead of entering
* direct reclaim.
*/if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*///同步压缩非常昂贵,所以继续使用异步压缩
compact_priority = INIT_COMPACT_PRIORITY;
}
}
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop *///如果页回收线程意外睡眠则再次唤醒if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, gfp_mask, ac);
//如果调用者承若给我们紧急内存使用,我们就忽略水线
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = reserve_flags;
/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
*///如果可以忽略内存策略,则重置 nodemask 和 zonelistif (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags *///尝试使用可能调整的区域备用列表和分配标志
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything *///如果不可以直接回收,则申请失败if (!can_direct_reclaim)
goto nopage;
/* Avoid recursion of direct reclaim */if (current->flags & PF_MEMALLOC)
goto nopage;
/* Try direct reclaim and then allocating *///直接页面回收,然后进行页面分配
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating *///进行页面压缩,然后进行页面分配
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
/* Do not loop if specifically requested *///如果调用者要求不要重试,则放弃if (gfp_mask & __GFP_NORETRY)
goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_RETRY_MAYFAIL
*///不要重试代价高昂的高阶分配,除非它们是__GFP_RETRY_MAYFAILif (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
//重新尝试回收页if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*///如果申请阶数大于 0,判断是否需要重新尝试压缩if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;
/* Deal with possible cpuset update races before we start OOM killing *///如果 cpuset 允许修改内存节点申请就修改if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things *///使用 oom 选择一个进程杀死
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly *///如果当前进程是 oom 选择的进程,并且忽略了水线,则放弃申请if (tsk_is_oom_victim(current) &&
(alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress *///如果 OOM 杀手正在取得进展,再试一次if (did_some_progress) {
no_progress_loops = 0;
goto retry;
}
nopage:
/* Deal with possible cpuset update races before we fail */if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;
/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
/*
* Help non-failing allocations by giving them access to memory
* reserves but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse
*///允许它们访问内存备用列表
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;
cond_resched();
goto retry;
}
fail:
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
}
structkmem_cache {structkmem_cache_cpu __percpu *cpu_slab;/* Used for retriving partial slabs etc */unsignedlong flags;
unsignedlong min_partial;
int size; /* The size of an object including meta data */int object_size; /* The size of an object without meta data */int offset; /* Free pointer offset. */#ifdef CONFIG_SLUB_CPU_PARTIALint cpu_partial; /* Number of per cpu partial objects to keep around */#endifstructkmem_cache_order_objectsoo;/* Allocation and freeing of slabs */structkmem_cache_order_objectsmax;structkmem_cache_order_objectsmin;gfp_t allocflags; /* gfp flags to use on each alloc */int refcount; /* Refcount for slab cache destroy */void (*ctor)(void *);
......
constchar *name; /* Name (only for display!) */structlist_headlist;/* List of slab caches */
......
structkmem_cache_node *node[MAX_NUMNODES];
};
/* SLUB per-CPU allocation state (excerpt) */
struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
	/* ... */
};
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void *object;
	struct kmem_cache_cpu *c;
	struct page *page;
	unsigned long tid;

	/* ... */
	tid = this_cpu_read(s->cpu_slab->tid);
	c = raw_cpu_ptr(s->cpu_slab);
	/* ... */
	object = c->freelist;
	page = c->page;
	if (unlikely(!object || !node_match(page, node))) {
		/* lockless freelist unusable: fall back to the slow path */
		object = __slab_alloc(s, gfpflags, node, addr, c);
		stat(s, ALLOC_SLOWPATH);
	}
	/* ... */
	return object;
}
structkmem_cache {//为了提高效率,每个 CPU 都有一个 slab 空闲对象缓存structarray_cache __percpu *cpu_cache;/* 1) Cache tunables. Protected by slab_mutex */unsignedint batchcount;//从本地高速缓存批量移入或移出对象的数量unsignedint limit;//本地高速缓存中空闲对象的最大数量unsignedint shared;
unsignedint size;
structreciprocal_valuereciprocal_buffer_size;/* 2) touched by every alloc & free from the backend */unsignedint flags; /* constant flags */unsignedint num; //每个 slab 的 obj 对象个数/* 3) cache_grow/shrink *//* order of pgs per slab (2^n) */unsignedint gfporder; //每个 slab 中连续页框的数量/* force GFP flags, e.g. GFP_DMA */gfp_t allocflags;
size_t colour; /* cache colouring range */unsignedint colour_off; /* colour offset */structkmem_cache *freelist_cache;unsignedint freelist_size;
/* constructor func */void (*ctor)(void *obj);
/* 4) cache creation/removal */constchar *name;
structlist_headlist;int refcount;
int object_size;
int align;
/* 5) statistics */#ifdef CONFIG_DEBUG_SLABunsignedlong num_active;
unsignedlong num_allocations;
unsignedlong high_mark;
unsignedlong grown;
unsignedlong reaped;
unsignedlong errors;
unsignedlong max_freeable;
unsignedlong node_allocs;
unsignedlong node_frees;
unsignedlong node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. size contains the total
* object size including these internal fields, the following two
* variables contain the offset to the user object and its size.
*/int obj_offset;
#endif/* CONFIG_DEBUG_SLAB */#ifdef CONFIG_MEMCG_KMEMstructmemcg_cache_paramsmemcg_params;#endifstructkmem_cache_node *node[MAX_NUMNODES];//内存节点实例个数
};
/* Per-CPU (or shared) cache of pointers to free slab objects */
struct array_cache {
	unsigned int avail;		/* number of cached free objects; also index of the first free slot */
	unsigned int limit;		/* capacity of this local cache */
	unsigned int batchcount;	/* number of objects transferred when the cache is refilled or drained */
	unsigned int touched;		/* set to 1 when the cache was used recently */
	void *entry[];	/* object addresses
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing the entries.
			 *
			 * Entries should not be directly dereferenced as
			 * entries belonging to slabs marked pfmemalloc will
			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
			 */
};
/*
* The slab lists for all objects.
*/structkmem_cache_node {spinlock_t list_lock;
#ifdef CONFIG_SLABstructlist_headslabs_partial;//部分分配的 slabstructlist_headslabs_full;//已经完全分配的 slabstructlist_headslabs_free;//空 slab,或者没有对象被分配unsignedlong free_objects;
unsignedint free_limit;
unsignedint colour_next; /* Per-node cache coloring */structarray_cache *shared;/* shared per node */structalien_cache **alien;/* on other nodes */unsignedlong next_reap; /* updated without locking */int free_touched; /* updated without locking */#endif#ifdef CONFIG_SLUBunsignedlong nr_partial;
structlist_headpartial;#ifdef CONFIG_SLUB_DEBUGatomic_long_t nr_slabs;
atomic_long_t total_objects;
structlist_headfull;#endif#endif
};
structpage {/* First double word block */unsignedlong flags; /* Atomic flags, some possibly
* updated asynchronously */union {structaddress_space *mapping;/* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/void *s_mem; /* slab first object */
};
/* Second double word */struct {union {pgoff_t index; /* Our offset within mapping. */void *freelist; /* sl[aou]b first free object */
};