众所周知,Linux内存管理的核心是伙伴系统(buddy system)。其实在linux启动的那一刻,内存管理就已经开始了,只不过不是buddy在管理。在内核中,实现物理内存管理的allocator包括:
-
连续物理内存管理buddy allocator
-
非连续物理内存管理vmalloc allocator
-
小块物理内存管理slab allocator
-
高端物理内存管理kmapper
-
初始化阶段物理内存管理memblock
在系统初始化阶段会先启用一个bootmem分配器和memblock分配器,此分配器是专门用于启动阶段的,一个bootmem分配器管理着一个node结点的所有内存,也就是在numa架构中多个node有多个bootmem,它们被链入bdata_list链表中保存。而伙伴系统的初始化就是将bootmem管理的所有物理页框释放到伙伴系统中去,本章主要分析如何实现从bootmem到buddy过渡的整个流程。
1. 由mem_init开始
/*
 * mem_init() - finish boot-time memory setup.
 *
 * Sets max_mapnr, then calls free_unused_memmap(), free_all_bootmem() and
 * free_highpages() in that order, prints the kernel's virtual memory layout
 * via pr_notice(), and finally runs a few address-space sanity checks.
 */
void __init mem_init(void)
{
#ifdef CONFIG_HAVE_TCM
/* These pointers are filled in on TCM detection */
extern u32 dtcm_end;
extern u32 itcm_end;
#endif
/* max_mapnr = distance, in struct page entries, from mem_map to max_pfn's page */
set_max_mapnr(pfn_to_page(max_pfn) - mem_map); --------------(1)
/* this will put all unused low memory onto the freelists */
free_unused_memmap(); --------------(2)
free_all_bootmem(); --------------(3)
#ifdef CONFIG_SA1111
/* now that our DMA memory is actually so designated, we can free it */
free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL);
#endif
free_highpages(); --------------(4)
mem_init_print_info(NULL); --------------(5)
/*
 * Each helper expands to three printf arguments: base, top and size.
 * MLK: size in kB (>> 10); MLM: size in MB (>> 20);
 * MLK_ROUNDUP: size in kB, rounded up with DIV_ROUND_UP.
 */
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
/*
 * The format string and the argument list below are guarded by the same
 * CONFIG_* options, so the two must be kept in step when either changes.
 */
pr_notice("Virtual kernel memory layout:\n"
" vector : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HAVE_TCM
" DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
" ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
#ifdef CONFIG_MODULES
" modules : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
" .text : 0x%p" " - 0x%p" " (%4td kB)\n"
" .init : 0x%p" " - 0x%p" " (%4td kB)\n"
" .data : 0x%p" " - 0x%p" " (%4td kB)\n"
" .bss : 0x%p" " - 0x%p" " (%4td kB)\n",
MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
(PAGE_SIZE)),
#ifdef CONFIG_HAVE_TCM
MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
MLK(ITCM_OFFSET, (unsigned long) itcm_end),
#endif
MLK(FIXADDR_START, FIXADDR_END),
MLM(VMALLOC_START, VMALLOC_END),
MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
(PAGE_SIZE)),
#endif
#ifdef CONFIG_MODULES
MLM(MODULES_VADDR, MODULES_END),
#endif
MLK_ROUNDUP(_text, _etext),
MLK_ROUNDUP(__init_begin, __init_end),
MLK_ROUNDUP(_sdata, _edata),
MLK_ROUNDUP(__bss_start, __bss_stop));
/* The helpers are only meant for the pr_notice() call above. */
#undef MLK
#undef MLM
#undef MLK_ROUNDUP
/*
 * Check boundaries twice: Some fundamental inconsistencies can
 * be detected at build time already.
 */
#ifdef CONFIG_MMU
/* TASK_SIZE must not exceed MODULES_VADDR */
BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
BUG_ON(TASK_SIZE > MODULES_VADDR);
#endif
#ifdef CONFIG_HIGHMEM
/* The end of the pkmap window must not exceed PAGE_OFFSET */
BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
#endif
/*
 * Small-machine special case: pages of at least 16 KiB and
 * get_num_physpages() reporting no more than 128.
 */
if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
extern int sysctl_overcommit_memory;
/*
 * On a machine this small we won't get
 * anywhere without overcommit, so turn
 * it on by default.
 */
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
}
- 函数set_max_mapnr()用于设置max_mapnr,其值由pfn_to_page(max_pfn) - mem_map计算得到,对应实际物理内存的页框数
- free_unused_memmap将物理上不存在的页(hole)在页管理位图中全部记录为"不使用"。
- 在页管理位图中记录为"不使用"后,free_all_bootmem函数进行释放,使其能够在伙伴系统中管理空白页。
- free_highpages将高端内存区域释放到伙伴系统,使其能管理空白页
- mem_init_print_info()是把内核映像的各个段地址打印出来,后面主要是将整个内核空间的虚拟映射空间打印出来,对于我们现在使用的开发板其打印信息如下
2. 空闲内存释放
函数free_unused_memmap()和free_all_bootmem()都是把空闲内存释放到伙伴系统,前者释放memblock中空闲内存,后者释放bootmem中内存。
/*
 * Hand the memmap entries that describe gaps between memblock memory
 * regions to free_memmap().
 */
static void __init free_unused_memmap(void)
{
	struct memblock_region *bank;
	unsigned long hole_start = 0;
	unsigned long hole_end;

	/*
	 * The walk assumes the regions are visited in ascending address
	 * order; they were sorted earlier in bootmem_init().
	 */
	for_each_memblock(memory, bank) {
		hole_end = memblock_region_memory_base_pfn(bank);

#ifdef CONFIG_SPARSEMEM
		/* Cap the hole at the section boundary at or above hole_start. */
		hole_end = min(hole_end,
			       ALIGN(hole_start, PAGES_PER_SECTION));
#else
		/* Pull the hole end back to a MAX_ORDER_NR_PAGES boundary. */
		hole_end = round_down(hole_end, MAX_ORDER_NR_PAGES);
#endif

		/* Nothing to free before the first region or when no gap exists. */
		if (hole_start && hole_start < hole_end)
			free_memmap(hole_start, hole_end);

		/* The next candidate hole begins at this region's aligned end. */
		hole_start = ALIGN(memblock_region_memory_end_pfn(bank),
				   MAX_ORDER_NR_PAGES);
	}

#ifdef CONFIG_SPARSEMEM
	/* Free the rest of the section that follows the last region, if any. */
	if (!IS_ALIGNED(hole_start, PAGES_PER_SECTION))
		free_memmap(hole_start,
			    ALIGN(hole_start, PAGES_PER_SECTION));
#endif
}
该函数主要是获得memblock的memory,对于IMX开发板,其reg的起始地址为0x80000000,得到对应的start为0x80000,此时prev_end为0,所以不满足调用free_memmap的条件,之后拿到的prev_end为0xa0000,而对于该开发板只有一片内存,所以对于memblock中没有相对应的空闲内存释放。系统在分配内存节点的mem_map时是按照这个内存节点起始地址到末尾地址分配的,这个地址空间中可能有空洞,这个空洞地址对应的page数据结构是可以释放掉的,如下图所示
下面我们来看看bootmem的释放,首先我们来看看bootmem的struct bootmem_data结构:
/*
 * Per-node descriptor used by the bootmem allocator.
 * The field descriptions follow the table that accompanies this definition.
 */
typedef struct bootmem_data {
/* first page frame number of this memory block */
unsigned long node_min_pfn;
/* last page frame number of this block (end of ZONE_NORMAL on 32-bit) */
unsigned long node_low_pfn;
/* bitmap with one bit per page frame of the node below ZONE_HIGHMEM */
void *node_bootmem_map;
/* offset related to the end of the previous allocation - TODO confirm exact meaning */
unsigned long last_end_off;
/* NOTE(review): undocumented here; presumably a search hint index - confirm */
unsigned long hint_idx;
/* links this descriptor into the bdata_list list */
struct list_head list;
} bootmem_data_t;
结构体成员 | 含义 |
---|---|
node_min_pfn | 此块内存开始页框号 |
node_low_pfn | 此块内存结束页框号,如果是32位系统下此保存的是ZONE_NORMAL最后一个页框号 |
node_bootmem_map | 指向位图内存区,node中所有ZONE_HIGHMEM之前的页框都在这里面有一个位,每次需要分配内存时就会扫描找出一个空闲页框,空洞的内存也会占用位,不过空洞的内存应该设置为已分配 |
last_end_off | 上次分配距离末尾的偏移量 |
hint_idx | |
list | 链入bdata_list结构链表 |
bootmem分配器的核心就是node_bootmem_map这个位图,每一位代表这个node的一个页,当需要分配时,就会去扫描这个位图,然后获得一段物理页框进行分配,一般都会从开始处向后分配。而伙伴系统初始化时会根据这个位图,将位图中空闲的页释放回伙伴系统,而已经分配出去的页则不会在初始化阶段释放回伙伴系统,不过有可能在运行过程中释放回伙伴系统。对于支持memblock的内核,内核配置了CONFIG_NO_BOOTMEM,free_all_bootmem的实现位于mm/nobootmem.c,具体实现如下:
/*
 * free_all_bootmem() - release boot-time free memory to the page allocator.
 *
 * Calls reset_all_zones_managed_pages(), then adds the page count reported
 * by free_low_memory_core_early() to totalram_pages.
 *
 * Returns that same page count.
 */
unsigned long __init free_all_bootmem(void)
{
unsigned long pages;
reset_all_zones_managed_pages(); ---------------(1)
pages = free_low_memory_core_early(); ---------------(2)
/* account the pages just released in the global RAM total */
totalram_pages += pages;
return pages;
}
- 设置所有node的所有zone的managed_pages为0,该函数只会启动时候调用一次
- 遍历所有需要释放的启动内存数据块,释放bdata启动内存块中所有页框到页框分配器中,计算所有的内存页数据,存储在totalram_pages中,并返回总共释放的页数量。
继续看free_low_memory_core_early,其主要的实现如下所示
/*
 * Mark every reserved memblock region via reserve_bootmem_region(), then
 * pass each free memblock range to __free_memory_core().
 *
 * Returns the sum of the values returned by __free_memory_core().
 */
static unsigned long __init free_low_memory_core_early(void)
{
	phys_addr_t range_start, range_end;
	unsigned long freed = 0;
	u64 idx;

	memblock_clear_hotplug(0, -1);

	/* First pass: reserved regions. */
	for_each_reserved_mem_region(idx, &range_start, &range_end) {
		reserve_bootmem_region(range_start, range_end);
	}

	/*
	 * Second pass: free ranges.  NUMA_NO_NODE is used rather than
	 * NODE_DATA(0)->node_id because Node0 may have no RAM installed,
	 * in which case low RAM lives on Node1.
	 */
	for_each_free_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE,
				&range_start, &range_end, NULL) {
		freed += __free_memory_core(range_start, range_end);
	}

	return freed;
}
- 遍历memblock.reserved类型的regions,对每个regions设置页面属性为Reserved,对于Imx,这个reserved区域为
[root@qemu_imx6ul:~]# cat /sys/kernel/debug/memblock/reserved
0: 0x0000000080003000..0x0000000080007fff
1: 0x0000000080200000..0x00000000810e8eeb
2: 0x0000000088000000..0x0000000088014303
3: 0x000000008bad3000..0x000000008bb40fff
4: 0x000000008bb413c0..0x000000008bb433bf
5: 0x000000008bb433f4..0x000000008bffefff
6: 0x000000008bfff740..0x000000008bfff77b
7: 0x000000008bfff780..0x000000008bfff7bb
8: 0x000000008bfff7c0..0x000000008bfff837
9: 0x000000008bfff840..0x000000008bfff843
10: 0x000000008bfff880..0x000000008bfff883
11: 0x000000008bfff8c0..0x000000008bfff8c3
12: 0x000000008bfff900..0x000000008bfff903
13: 0x000000008bfff940..0x000000008bfff9a1
14: 0x000000008bfff9c0..0x000000008bfffa21
15: 0x000000008bfffa40..0x000000008bfffaa1
16: 0x000000008bfffaac..0x000000008bfffac6
17: 0x000000008bfffac8..0x000000008bfffae2
18: 0x000000008bfffae4..0x000000008bfffb5e
19: 0x000000008bfffb60..0x000000008bfffb7a
20: 0x000000008bfffb7c..0x000000008bfffb96
21: 0x000000008bfffb98..0x000000008bfffbb2
22: 0x000000008bfffbb4..0x000000008bfffbce
23: 0x000000008bfffbd0..0x000000008bfffbea
24: 0x000000008bfffbec..0x000000008bfffc06
25: 0x000000008bfffc08..0x000000008bfffc22
26: 0x000000008bfffc24..0x000000008bfffccc
27: 0x000000008bfffcd0..0x000000008bfffce8
28: 0x000000008bfffcec..0x000000008bfffd04
29: 0x000000008bfffd08..0x000000008bfffd20
30: 0x000000008bfffd24..0x000000008bfffd3c
31: 0x000000008bfffd40..0x000000008bfffd5c
32: 0x000000008bfffd60..0x000000008bfffd7c
33: 0x000000008bfffd80..0x000000008bfffdc7
34: 0x000000008bfffdd8..0x000000009fffffff
- 遍历所有在memblock.memory中,但是不在memblock.reserve中的regions。然后清Reserved页面属性
下面重点看看页面是如何完成reserved的配置,其代码如下,主要是将reserved区域内各页的page->flags置上PG_reserved位,并初始化各页的page->lru链表头。
/*
 * Mark the pages overlapping the physical range [start, end) as reserved.
 *
 * The range is widened to whole pages: start is rounded down and end is
 * rounded up.  Page frames for which pfn_valid() fails are skipped.
 */
void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
	unsigned long last_pfn = PFN_UP(end);
	unsigned long pfn;

	for (pfn = PFN_DOWN(start); pfn < last_pfn; pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		init_reserved_page(pfn);

		/* An initialised lru head avoids a false-positive PageTail(). */
		INIT_LIST_HEAD(&page->lru);

		SetPageReserved(page);
	}
}
我们看看重点的__free_memory_core,其主要是遍历所有在memblock.memory中,但是不在memblock.reserve中的regions,对于imx的开发板,其信息如下
0: 0x0000000080000000 .. 0x0000000080003000
1: 0x0000000080008000 .. 0x0000000080200000
2: 0x00000000810e9000 .. 0x0000000088000000
3: 0x0000000088015000 .. 0x000000008bad3000
4: 0x000000008bb41000 .. 0x000000008bb41000
5: 0x000000008bfff000 .. 0x000000008bfff000
/*
 * Release the whole pages that lie inside the physical range [start, end).
 *
 * The start is rounded up and the end rounded down to page boundaries, and
 * the end is additionally capped at max_low_pfn.
 *
 * Returns the number of page frames handed to __free_pages_memory(), or 0
 * when the rounded start lies beyond the rounded end.
 */
static unsigned long __init __free_memory_core(phys_addr_t start,
					       phys_addr_t end)
{
	unsigned long first_pfn = PFN_UP(start);
	unsigned long limit_pfn;

	limit_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn);
	if (first_pfn > limit_pfn)
		return 0;

	__free_pages_memory(first_pfn, limit_pfn);

	return limit_pfn - first_pfn;
}
核心的__free_pages_memory函数,该函数以阶(order)为单位释放页,清空各页的PG_reserved位,设置page->count为0后,再调用__free_pages,代码实现为
/*
 * Drop one reference on @page and free the block once
 * put_page_testzero() reports true.
 *
 * @page:  first page of the block
 * @order: order of the block; 0 takes the free_hot_cold_page() path,
 *         anything else goes through __free_pages_ok()
 */
void __free_pages(struct page *page, unsigned int order)
{
	/* Nothing to free unless put_page_testzero() succeeds. */
	if (!put_page_testzero(page))
		return;

	if (order != 0)
		__free_pages_ok(page, order);
	else
		free_hot_cold_page(page, false);
}
-
首先将页的引用计数page->_refcount减1,并检查结果是否为0
-
引用计数减为0后,若order为0(即只释放1页)则调用free_hot_cold_page,否则调用__free_pages_ok,以阶(order)为单位释放页
该过程比较复杂,涉及到伙伴系统的一些算法,先留一个疑问,后面深入分析下。
3. 高端内存释放
/*
 * free_highpages() - release high memory that is not reserved.
 *
 * For every memblock memory region that extends above max_low_pfn, the
 * parts not covered by memblock reserved regions are passed to
 * free_area_high().  Without CONFIG_HIGHMEM the body is empty.
 */
static void __init free_highpages(void)
{
#ifdef CONFIG_HIGHMEM
unsigned long max_low = max_low_pfn;
struct memblock_region *mem, *res;
/* set highmem page free */
for_each_memblock(memory, mem) {
unsigned long start = memblock_region_memory_base_pfn(mem);
unsigned long end = memblock_region_memory_end_pfn(mem);
/* Ignore complete lowmem entries */
if (end <= max_low)
continue;
/* Regions flagged nomap are skipped entirely */
if (memblock_is_nomap(mem))
continue;
/* Truncate partial highmem entries */
if (start < max_low)
start = max_low;
/* Find and exclude any reserved regions */
/* NOTE(review): the walk appears to rely on reserved regions being in ascending order - confirm */
for_each_memblock(reserved, res) {
unsigned long res_start, res_end;
res_start = memblock_region_reserved_base_pfn(res);
res_end = memblock_region_reserved_end_pfn(res);
/* Reserved region ends before the current start: not relevant */
if (res_end < start)
continue;
/* Clamp the reserved region to [start, end] */
if (res_start < start)
res_start = start;
if (res_start > end)
res_start = end;
if (res_end > end)
res_end = end;
/* Free the gap that precedes the reserved region, if there is one */
if (res_start != start)
free_area_high(start, res_start);
/* Continue after the reserved region */
start = res_end;
if (start == end)
break;
}
/* And now free anything which remains */
if (start < end)
free_area_high(start, end);
}
#endif
}
存在高端内存时,该代码求出高端内存的起始页帧和尾页帧,然后调用free\_area\_high函数使伙伴系统管理空白页,free\_area\_high函数在内部调用\_\_free\_page函数,将空白页和一般内存区域共同释放到伙伴系统。
# 4. 总结 #
本章的mem\_init()函数结束启动时的内存分配器memblock和bootmem,将bootmem和memblock管理的空闲页按阶(order)为单位构建链表,构建好的伙伴系统将为Linux的内存分配器slab提供空闲页。