Linux 内存管理涉及引导分配器、伙伴系统和 Slab 分配器。引导分配器负责内核初始化阶段的内存管理,包括 bootmem 和 memblock。伙伴系统管理物理页面,通过分裂和合并机制减少碎片。Slab 分配器针对小对象优化,提供缓存和快速分配。三者协同工作以高效利用内存资源。
作者:时间旅人 | 0 浏览
深入解析 Linux 内存管理:三大分配器原理
Linux 内存管理是指操作系统对于计算机系统中的内存资源进行有效利用和管理的过程。它包括了内存分配、页面置换、内存映射等方面。Linux 内存管理的核心在于三大分配器:引导内存分配器(Bootmem/Memblock)、伙伴分配器(Buddy Allocator)和 Slab 分配器。
一、引导内存分配器
Linux 系统中使用伙伴系统对物理页面进行分配管理,但是伙伴分配系统需要内核完成初始化以及建立相关内核数据结构后才能够正常工作。因此,在内核初始化相关数据结构时需要另一种内存分配器。早期 Linux 没有较为完善的引导内存分配器,随着硬件的发展和日趋复杂,处理不同体系的内存分配代码也渐渐复杂起来,随之就需要引导内存分配器来初始化系统主要内存分配器的数据结构以确保其正常工作。
structmemblock_type {unsignedlong cnt; /* number of regions */unsignedlong max; /* size of the allocated array */phys_addr_t total_size; /* size of all regions */structmemblock_region *regions;char *name;
};
/*
 * memblock_find_in_range_node - find free area in given range and node
 * @size: size of free area to find (bytes)
 * @align: alignment of free area to find
 * @start: start of candidate range (lowest acceptable physical address)
 * @end: end of candidate range; may be %MEMBLOCK_ALLOC_ACCESSIBLE
 * @nid: node id of the free area to find
 * @flags: pick from blocks based on memory attributes
 *
 * When memblock is in bottom-up mode and @end lies above the kernel image,
 * try a bottom-up search first (kept above the kernel so hot-unpluggable
 * memory near the kernel is not consumed); otherwise, or on failure, fall
 * back to a top-down search.
 *
 * Return: found physical address on success, 0 on failure.
 */
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid, ulong flags)
{
	phys_addr_t kernel_end, ret;

	/* pump up @end */
	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
		end = memblock.current_limit;

	/* avoid allocating the first page */
	start = max_t(phys_addr_t, start, PAGE_SIZE);
	end = max(start, end);
	kernel_end = __pa_symbol(_end);

	/*
	 * try bottom-up allocation only when bottom-up mode
	 * is set and @end is above the kernel image.
	 */
	if (memblock_bottom_up() && end > kernel_end) {
		phys_addr_t bottom_up_start;

		/* make sure we will allocate above the kernel */
		bottom_up_start = max(start, kernel_end);

		/* ok, try bottom-up allocation first */
		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
						      size, align, nid, flags);
		if (ret)
			return ret;

		/*
		 * we always limit bottom-up allocation above the kernel,
		 * but top-down allocation doesn't have the limit, so
		 * retrying top-down allocation may succeed when bottom-up
		 * allocation failed.
		 *
		 * bottom-up allocation is expected to be fail very rarely,
		 * so we use WARN_ONCE() here to see the stack trace if
		 * fail happens.
		 */
		WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
	}

	return __memblock_find_range_top_down(start, end, size, align, nid,
					      flags);
}
该函数中 size 表示寻找物理内存大小 (字节),align 为物理内存对齐,start 表示所寻找内存区域最小物理地址,end 为最大物理地址,nid 为节点号,flags 为标志位。检查 end 是否等于 MEMBLOCK_ALLOC_ACCESSIBLE,相等则设置 end 为 memblock.current_limit,重新计算所查询物理地址范围。
检查 memblock.bottom_up 和 end>kernel_end 是否均为真,即内存查询是否为从低地址向高地址且 end 高于内核最大虚拟地址所对应的物理地址,重新获取查询范围并进行查询。
最后进行从高地址到低地址进行查询。
(3) 向 memblock_type 添加 memblock_region
/*
 * memblock_add_range - add a new memblock region
 * @type: memblock type to add the new region into
 * @base: base physical address of the new region
 * @size: size of the new region (bytes)
 * @nid: node id of the new region
 * @flags: flags of the new region
 *
 * Add the region [@base, @base + @size) into @type.  The new region may
 * overlap existing ones; only the non-overlapping portions are inserted.
 * Adjacent compatible regions are merged on completion.
 *
 * Return: 0 on success, -ENOMEM if the regions array could not be grown.
 */
int __init_memblock memblock_add_range(struct memblock_type *type,
				       phys_addr_t base, phys_addr_t size,
				       int nid, unsigned long flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx, nr_new;
	struct memblock_region *rgn;

	if (!size)
		return 0;

	/* special case for empty array */
	if (type->regions[0].size == 0) {
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}
repeat:
	/*
	 * The following is executed twice. Once with %false @insert and
	 * then with %true. The first counts the number of regions needed
	 * to accommodate the new area. The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for_each_memblock_type(idx, type, rgn) {
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		if (rbase >= end)
			break;
		if (rend <= base)
			continue;
		/*
		 * @rgn overlaps. If it separates the lower part of new
		 * area, insert that portion.
		 */
		if (rbase > base) {
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
			WARN_ON(nid != memblock_get_region_node(rgn));
#endif
			WARN_ON(flags != rgn->flags);
			nr_new++;
			if (insert)
				memblock_insert_region(type, idx++, base,
						       rbase - base, nid,
						       flags);
		}
		/* area below @rend is dealt with, forget about it */
		base = min(rend, end);
	}

	/* insert the remaining portion */
	if (base < end) {
		nr_new++;
		if (insert)
			memblock_insert_region(type, idx, base, end - base,
					       nid, flags);
	}

	if (!nr_new)
		return 0;

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {
		while (type->cnt + nr_new > type->max)
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		insert = true;
		goto repeat;
	} else {
		memblock_merge_regions(type);
		return 0;
	}
}
该函数 base 为添加内存起始物理地址,size 为添加的物理内存大小 (字节数目),nid 为添加内存所在节点号,flags 为添加 region 标志位。首先计算获取所添加物理内存所在物理地址范围,然后检查 memblock_type 中 region 数组是否为空。如果为空则初始化 region 数组首个元素并返回。
经过遍历后,检查内存区域是否完全插入,如果有部分区域并未插入则检查 insert 是否为真,为真则将剩余部分也插入 region 数组中,否则继续运行。
检查 insert 是否为真,若为真则将 memblock_type 中的 region 数组进行合并,否则检查 nr_new+type->cnt 是否超出 type 中 region 数组最大数,超出则对 type 中 region 数组进行扩充,设置 insert 为真继续重复以上操作。
(4) 扩充 memblock_type 中 region 数组空间
/*
 * memblock_double_array - double the size of a memblock regions array
 * @type: memblock type whose regions array is being doubled
 * @new_area_start: start of a memory range the new array must not overlap
 * @new_area_size: size of that memory range
 *
 * Allocates a new array twice the current size (from slab if available,
 * otherwise from memblock itself), copies the old contents over, zeroes
 * the new half, and frees the old array unless it is the static initial
 * one.
 *
 * Return: 0 on success, -1 on failure.
 */
static int __init_memblock memblock_double_array(struct memblock_type *type,
						 phys_addr_t new_area_start,
						 phys_addr_t new_area_size)
{
	struct memblock_region *new_array, *old_array;
	phys_addr_t old_alloc_size, new_alloc_size;
	phys_addr_t old_size, new_size, addr;
	int use_slab = slab_is_available();
	int *in_slab;

	/* We don't allow resizing until we know about the reserved regions
	 * of memory that aren't suitable for allocation
	 */
	if (!memblock_can_resize)
		return -1;

	/* Calculate new doubled size */
	old_size = type->max * sizeof(struct memblock_region);
	new_size = old_size << 1;
	/*
	 * We need to allocated new one align to PAGE_SIZE,
	 * so we can free them completely later.
	 */
	old_alloc_size = PAGE_ALIGN(old_size);
	new_alloc_size = PAGE_ALIGN(new_size);

	/* Retrieve the slab flag */
	if (type == &memblock.memory)
		in_slab = &memblock_memory_in_slab;
	else
		in_slab = &memblock_reserved_in_slab;

	if (use_slab) {
		new_array = kmalloc(new_size, GFP_KERNEL);
		addr = new_array ? __pa(new_array) : 0;
	} else {
		/* only exclude range when trying to double reserved.regions */
		if (type != &memblock.reserved)
			new_area_start = new_area_size = 0;

		addr = memblock_find_in_range(new_area_start + new_area_size,
						memblock.current_limit,
						new_alloc_size, PAGE_SIZE);
		if (!addr && new_area_size)
			addr = memblock_find_in_range(0,
				min(new_area_start, memblock.current_limit),
				new_alloc_size, PAGE_SIZE);

		new_array = addr ? __va(addr) : NULL;
	}
	if (!addr) {
		pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
		       type->name, type->max, type->max * 2);
		return -1;
	}

	memcpy(new_array, type->regions, old_size);
	memset(new_array + type->max, 0, old_size);
	old_array = type->regions;
	type->regions = new_array;
	type->max <<= 1;

	/* Free old array. We needn't free it if the array is the static one */
	if (*in_slab)
		kfree(old_array);
	else if (old_array != memblock_memory_init_regions &&
		 old_array != memblock_reserved_init_regions)
		memblock_free(__pa(old_array), old_alloc_size);

	/*
	 * Reserve the new array if that comes from the memblock. Otherwise, we
	 * needn't do it
	 */
	if (!use_slab)
		BUG_ON(memblock_reserve(addr, new_alloc_size));

	/* Update slab flag */
	*in_slab = use_slab;

	return 0;
}
该函数中 new_area_start 表示向 type 中添加物理内存区域起始物理地址,new_area_size 表示内存区域大小。首先检查是否使用 slab 缓存以及是否可以扩充 type 中 region 数组大小。
获取原 region 数组大小并将其乘以 2 作为将要分配 region 数组大小,按页面对齐并根据 type 类型获取相应 slab 缓存使用标志位。
开始分配新 region 数组内存空间,如果使用 slab,则从 slab 缓存中分配,否则从 memblock 中分配。
最后复制原来 region 数组并将剩余空间设为 0 并释放原数组所在内存空间。
(5) 移除 memblock_type 中相应内存区域
/*
 * memblock_isolate_range - isolate a given range into disjoint memblocks
 * @type: memblock type to isolate the range in
 * @base: base physical address of the range to isolate
 * @size: size of the range to isolate (bytes)
 * @start_rgn: out parameter - index of the first region inside the range
 * @end_rgn: out parameter - index one past the last region in the range
 *
 * Walk @type and split any region that crosses a boundary of
 * [@base, @base + @size), so that afterwards no region straddles the
 * range.  At most two extra regions can be created, so the array is
 * grown beforehand if needed.
 *
 * Return: 0 on success, -ENOMEM if the regions array could not be grown.
 */
static int __init_memblock memblock_isolate_range(struct memblock_type *type,
					phys_addr_t base, phys_addr_t size,
					int *start_rgn, int *end_rgn)
{
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx;
	struct memblock_region *rgn;

	*start_rgn = *end_rgn = 0;

	if (!size)
		return 0;

	/* we'll create at most two more regions */
	while (type->cnt + 2 > type->max)
		if (memblock_double_array(type, base, size) < 0)
			return -ENOMEM;

	for_each_memblock_type(idx, type, rgn) {
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		if (rbase >= end)
			break;
		if (rend <= base)
			continue;

		if (rbase < base) {
			/*
			 * @rgn intersects from below. Split and continue
			 * to process the next region - the new top half.
			 */
			rgn->base = base;
			rgn->size -= base - rbase;
			type->total_size -= base - rbase;
			memblock_insert_region(type, idx, rbase, base - rbase,
					       memblock_get_region_node(rgn),
					       rgn->flags);
		} else if (rend > end) {
			/*
			 * @rgn intersects from above. Split and redo the
			 * current region - the new bottom half.
			 */
			rgn->base = end;
			rgn->size -= end - rbase;
			type->total_size -= end - rbase;
			memblock_insert_region(type, idx--, rbase, end - rbase,
					       memblock_get_region_node(rgn),
					       rgn->flags);
		} else {
			/* @rgn is fully contained, record it */
			if (!*end_rgn)
				*start_rgn = idx;
			*end_rgn = idx + 1;
		}
	}

	return 0;
}
该函数从 type 中 region 数组分离指定范围物理内存,base 表示分离内存区域起始物理地址,size 为分离物理内存大小,start_rgn 为输出分离后的内存区域起始 region,end_rgn 为结束 region。如果 type 中 region 数组空间不足则扩充 region 数组。
循环遍历 type 中 region 数组,对于处于分离内存区域的 region 进行分割并重新插入到数组中,最后返回索引。
优点: 由于将物理内存按照 PFN 将不同的 page 放入到不同 order 中,根据需要分配内存的大小,计算当前这次分配应该在哪个 order 中去找空闲的内存块,如果当前 order 中没有空闲,则到更高阶的 order 中去查找,因此分配的效率比 boot memory 的线性扫描 bitmap 要快很多。
typedefstructpglist_data {structzonenode_zones[MAX_NR_ZONES];//内存区域数组structzonelistnode_zonelists[MAX_ZONELISTS];//MAX_ZONELISTS 个备用区域数组int nr_zones;//该节点包含的内存区域数量
......
}
//struct zone 在 linux 内存管理(一)中structzonelist {structzoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
struct zoneref {
	struct zone *zone;	/* pointer to the zone descriptor */
	int zone_idx;		/* type (index) of the zone that @zone points to */
};
enum {
	ZONELIST_FALLBACK,	/* zonelist with fallback across all memory nodes */
#ifdef CONFIG_NUMA
	/*
	 * The NUMA zonelists are doubled because we need zonelists that
	 * restrict the allocations to a single node for __GFP_THISNODE.
	 */
	ZONELIST_NOFALLBACK,	/* zonelist restricted to the current node (NUMA only) */
#endif
	MAX_ZONELISTS		/* number of zonelists per node */
};
UMA 系统只有一个备用区域的列表,按照区域类型从高到低顺序排列。假设 UMA 系统中包含普通区域和 DMA 区域,则备用区域列表为:(普通区域、DMA 区域)。NUMA 系统中每个内存节点有两个备用区域列表:一个包含所有节点的内存区域,另一个仅包含当前节点的内存区域。
ZONELIST_FALLBACK(包含所有内存节点的备用区域)列表有两种排序方法:
a. 节点优先顺序:先根据节点距离从小到大排序,然后在每个节点里面根据区域类型从高到低排序。优点是优先选择距离近的内存,缺点是在高区域耗尽以前使用低区域。
b. 区域优先顺序:先根据区域类型从高到低排序,然后在每个区域类型里面根据节点距离从小到大排序。优点是减少低区域耗尽的概率,缺点是不能保证优先选择距离近的内存。
首选的内存区域什么情况下从备用区域借用物理页呢?每个内存区域有 3 个水线:
a. 高水线(high):如果内存区域的空闲页数大于高水线,说明内存区域的内存非常充足;
b. 低水线(low):如果内存区域的空闲页数小于低水线,说明内存区域的内存轻微不足;
c. 最低水线(min):如果内存区域的空闲页数小于最低水线,说明内存区域的内存严重不足。
而且每个区域的水位线是初始化的时候通过每个区域的物理页情况计算出来的。计算后存到 struct zone 的 watermark 数组中,使用的时候直接通过下面的宏定义获取:
staticinlinestructpage *
__alloc_pages_slowpath(gfp_tgfp_mask, unsignedintorder,
structalloc_context *ac)
{bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
constbool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
structpage *page =NULL;
unsignedint alloc_flags;
unsignedlong did_some_progress;
enumcompact_prioritycompact_priority;enumcompact_resultcompact_result;int compaction_retries;
int no_progress_loops;
unsignedint cpuset_mems_cookie;
int reserve_flags;
/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
//后面可能会检查 cpuset 是否允许当前进程从哪些内存节点申请页
cpuset_mems_cookie = read_mems_allowed_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*///把分配标志位转化为内部的分配标志位
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* We need to recalculate the starting point for the zonelist iterator
* because we might have used different nodemask in the fast path, or
* there was a cpuset modification and we are retrying - otherwise we
* could end up iterating over non-eligible zones endlessly.
*///获取首选的内存区域,因为在快速路径中使用了不同的节点掩码,避免再次遍历不合格的区域。
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
//异步回收页,唤醒 kswapd 内核线程进行页面回收if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, gfp_mask, ac);
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*///调整 alloc_flags 后可能会立即申请成功,所以先尝试一下
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
* For costly allocations, try direct compaction first, as it's likely
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*///申请阶数大于 0,不可移动的位于高阶的,忽略水位线的if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
//直接页面回收,然后进行页面分配
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
*/if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If compaction is deferred for high-order allocations,
* it is because sync compaction recently failed. If
* this is the case and the caller requested a THP
* allocation, we do not want to heavily disrupt the
* system, so we fail the allocation instead of entering
* direct reclaim.
*/if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*///同步压缩非常昂贵,所以继续使用异步压缩
compact_priority = INIT_COMPACT_PRIORITY;
}
}
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop *///如果页回收线程意外睡眠则再次唤醒if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, gfp_mask, ac);
//如果调用者承若给我们紧急内存使用,我们就忽略水线
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = reserve_flags;
/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
*///如果可以忽略内存策略,则重置 nodemask 和 zonelistif (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags *///尝试使用可能调整的区域备用列表和分配标志
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything *///如果不可以直接回收,则申请失败if (!can_direct_reclaim)
goto nopage;
/* Avoid recursion of direct reclaim */if (current->flags & PF_MEMALLOC)
goto nopage;
/* Try direct reclaim and then allocating *///直接页面回收,然后进行页面分配
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating *///进行页面压缩,然后进行页面分配
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
/* Do not loop if specifically requested *///如果调用者要求不要重试,则放弃if (gfp_mask & __GFP_NORETRY)
goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_RETRY_MAYFAIL
*///不要重试代价高昂的高阶分配,除非它们是__GFP_RETRY_MAYFAILif (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
//重新尝试回收页if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*///如果申请阶数大于 0,判断是否需要重新尝试压缩if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;
/* Deal with possible cpuset update races before we start OOM killing *///如果 cpuset 允许修改内存节点申请就修改if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things *///使用 oom 选择一个进程杀死
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly *///如果当前进程是 oom 选择的进程,并且忽略了水线,则放弃申请if (tsk_is_oom_victim(current) &&
(alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress *///如果 OOM 杀手正在取得进展,再试一次if (did_some_progress) {
no_progress_loops = 0;
goto retry;
}
nopage:
/* Deal with possible cpuset update races before we fail */if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;
/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
/*
* Help non-failing allocations by giving them access to memory
* reserves but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse
*///允许它们访问内存备用列表
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;
cond_resched();
goto retry;
}
fail:
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
}
structkmem_cache {structkmem_cache_cpu __percpu *cpu_slab;/* Used for retriving partial slabs etc */unsignedlong flags;
unsignedlong min_partial;
int size; /* The size of an object including meta data */int object_size; /* The size of an object without meta data */int offset; /* Free pointer offset. */#ifdef CONFIG_SLUB_CPU_PARTIALint cpu_partial; /* Number of per cpu partial objects to keep around */#endifstructkmem_cache_order_objectsoo;/* Allocation and freeing of slabs */structkmem_cache_order_objectsmax;structkmem_cache_order_objectsmin;gfp_t allocflags; /* gfp flags to use on each alloc */int refcount; /* Refcount for slab cache destroy */void (*ctor)(void *);
......
constchar *name; /* Name (only for display!) */structlist_headlist;/* List of slab caches */
......
structkmem_cache_node *node[MAX_NUMNODES];
};
/* SLUB per-CPU allocation state (excerpt) */
struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
	/* ... */
};
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void *object;
	struct kmem_cache_cpu *c;
	struct page *page;
	unsigned long tid;

	/* ... */
	tid = this_cpu_read(s->cpu_slab->tid);
	c = raw_cpu_ptr(s->cpu_slab);
	/* ... */
	object = c->freelist;
	page = c->page;
	if (unlikely(!object || !node_match(page, node))) {
		/* lockless freelist unusable: fall back to the slow path */
		object = __slab_alloc(s, gfpflags, node, addr, c);
		stat(s, ALLOC_SLOWPATH);
	}
	/* ... */
	return object;
}
structkmem_cache {//为了提高效率,每个 CPU 都有一个 slab 空闲对象缓存structarray_cache __percpu *cpu_cache;/* 1) Cache tunables. Protected by slab_mutex */unsignedint batchcount;//从本地高速缓存批量移入或移出对象的数量unsignedint limit;//本地高速缓存中空闲对象的最大数量unsignedint shared;
unsignedint size;
structreciprocal_valuereciprocal_buffer_size;/* 2) touched by every alloc & free from the backend */unsignedint flags; /* constant flags */unsignedint num; //每个 slab 的 obj 对象个数/* 3) cache_grow/shrink *//* order of pgs per slab (2^n) */unsignedint gfporder; //每个 slab 中连续页框的数量/* force GFP flags, e.g. GFP_DMA */gfp_t allocflags;
size_t colour; /* cache colouring range */unsignedint colour_off; /* colour offset */structkmem_cache *freelist_cache;unsignedint freelist_size;
/* constructor func */void (*ctor)(void *obj);
/* 4) cache creation/removal */constchar *name;
structlist_headlist;int refcount;
int object_size;
int align;
/* 5) statistics */#ifdef CONFIG_DEBUG_SLABunsignedlong num_active;
unsignedlong num_allocations;
unsignedlong high_mark;
unsignedlong grown;
unsignedlong reaped;
unsignedlong errors;
unsignedlong max_freeable;
unsignedlong node_allocs;
unsignedlong node_frees;
unsignedlong node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. size contains the total
* object size including these internal fields, the following two
* variables contain the offset to the user object and its size.
*/int obj_offset;
#endif/* CONFIG_DEBUG_SLAB */#ifdef CONFIG_MEMCG_KMEMstructmemcg_cache_paramsmemcg_params;#endifstructkmem_cache_node *node[MAX_NUMNODES];//内存节点实例个数
};
/* Per-CPU (or shared) cache of pointers to free slab objects */
struct array_cache {
	unsigned int avail;		/* number of cached free objects; also index of the first free slot */
	unsigned int limit;		/* capacity of this local cache */
	unsigned int batchcount;	/* number of objects transferred when the cache is refilled or drained */
	unsigned int touched;		/* set to 1 when the cache was used recently */
	void *entry[];	/* object addresses
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing the entries.
			 *
			 * Entries should not be directly dereferenced as
			 * entries belonging to slabs marked pfmemalloc will
			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
			 */
};
/*
* The slab lists for all objects.
*/structkmem_cache_node {spinlock_t list_lock;
#ifdef CONFIG_SLABstructlist_headslabs_partial;//部分分配的 slabstructlist_headslabs_full;//已经完全分配的 slabstructlist_headslabs_free;//空 slab,或者没有对象被分配unsignedlong free_objects;
unsignedint free_limit;
unsignedint colour_next; /* Per-node cache coloring */structarray_cache *shared;/* shared per node */structalien_cache **alien;/* on other nodes */unsignedlong next_reap; /* updated without locking */int free_touched; /* updated without locking */#endif#ifdef CONFIG_SLUBunsignedlong nr_partial;
structlist_headpartial;#ifdef CONFIG_SLUB_DEBUGatomic_long_t nr_slabs;
atomic_long_t total_objects;
structlist_headfull;#endif#endif
};
structpage {/* First double word block */unsignedlong flags; /* Atomic flags, some possibly
* updated asynchronously */union {structaddress_space *mapping;/* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/void *s_mem; /* slab first object */
};
/* Second double word */struct {union {pgoff_t index; /* Our offset within mapping. */void *freelist; /* sl[aou]b first free object */
};